From 246df5b90b5cd193dbe77c58e5e34545269de9ed Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 03:44:03 -0400 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"{\"schema\":\"decodex/commit/1\",\"summary\":\"Modularize real-world benchmark tests\",\"authority\":\"manual\"}","authority":"manual"} --- .../real_world_job_benchmark/core_archival.rs | 243 +-------------- .../core_archival_authority.rs | 76 +++++ .../core_archival_context.rs | 101 +++++++ .../core_archival_scoring.rs | 71 +++++ .../manifest_summary.rs | 199 +------------ .../manifest_summary_scenario.rs | 118 ++++++++ .../manifest_summary_status.rs | 76 +++++ .../markdown_rendering.rs | 230 +-------------- .../markdown_rendering_external_adapters.rs | 175 +++++++++++ .../markdown_rendering_generated.rs | 61 ++++ .../memory_evolution.rs | 217 +------------- .../memory_evolution_mutations.rs | 111 +++++++ .../memory_evolution_scoring.rs | 107 +++++++ .../memory_summary.rs | 278 +----------------- .../memory_summary_failures.rs | 168 +++++++++++ .../memory_summary_markdown.rs | 46 +++ .../memory_summary_scoring.rs | 66 +++++ .../proactive_brief.rs | 216 +------------- .../proactive_brief_failures.rs | 91 ++++++ .../proactive_brief_markdown.rs | 46 +++ .../proactive_brief_scoring.rs | 83 ++++++ .../scheduled_memory.rs | 264 +---------------- .../scheduled_memory_failures.rs | 130 ++++++++ .../scheduled_memory_markdown.rs | 46 +++ .../scheduled_memory_scoring.rs | 91 ++++++ .../trace_replay_reports_qmd_trace_replay.rs | 263 +---------------- .../trace_replay_adoption_json.rs | 54 ++++ .../trace_replay_diagnostics_json.rs | 145 +++++++++ .../trace_replay_markdown_assertions.rs | 23 ++ .../trace_replay_viewer_boundaries.rs | 39 +++ 30 files changed, 1954 insertions(+), 1880 deletions(-) create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/core_archival_authority.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs create mode 100644 
apps/elf-eval/tests/real_world_job_benchmark/core_archival_scoring.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_scenario.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_status.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_external_adapters.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_mutations.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_scoring.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/memory_summary_failures.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/memory_summary_markdown.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/memory_summary_scoring.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_failures.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_markdown.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_scoring.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_failures.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_markdown.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_scoring.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_adoption_json.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_diagnostics_json.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_markdown_assertions.rs create mode 100644 
apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_viewer_boundaries.rs diff --git a/apps/elf-eval/tests/real_world_job_benchmark/core_archival.rs b/apps/elf-eval/tests/real_world_job_benchmark/core_archival.rs index 76c4dc79..c05cc275 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/core_archival.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/core_archival.rs @@ -1,240 +1,3 @@ -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -#[test] -fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { - let report = support::run_json_report_from(support::core_archival_memory_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!( - report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(14) - ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(14)); - assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); - - let suites = support::array_at(&report, "/suites")?; - let core = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; - - 
assert_eq!(core.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(core.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); - - let jobs = support::array_at(&report, "/jobs")?; - - for job_id in [ - "core-archival-core-block-attachment-001", - "core-archival-core-block-scope-001", - "core-archival-core-block-provenance-001", - "core-archival-stale-core-detection-001", - "core-archival-archival-fallback-001", - "core-archival-project-decision-recovery-001", - ] { - let job = support::find_by_field(jobs, "/job_id", job_id)?; - - assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("core_archival_memory")); - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); - } - - let scope = support::find_by_field(jobs, "/job_id", "core-archival-core-block-scope-001")?; - let decision = - support::find_by_field(jobs, "/job_id", "core-archival-project-decision-recovery-001")?; - - assert_eq!(scope.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); - assert_eq!(scope.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); - assert_eq!(scope.pointer("/scope_violation_count").and_then(Value::as_u64), Some(0)); - assert!( - decision - .pointer("/produced_answer") - .and_then(Value::as_str) - .is_some_and(|content| content.contains("Letta remains blocked or not_tested")) - ); - assert!( - support::array_at(decision, "/produced_evidence")? 
- .iter() - .any(|id| id.as_str() == Some("decision-letta-export-boundary")) - ); - - Ok(()) -} - -#[test] -fn memory_authority_benchmark_covers_entity_history_and_core_archive_strengths() -> Result<()> { - let report = support::run_json_report_from(support::real_world_memory_fixture_dir())?; - - assert_eq!( - report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), - Some(4) - ); - - let suites = support::array_at(&report, "/suites")?; - let memory_evolution = support::find_by_field(suites, "/suite_id", "memory_evolution")?; - let core_archival = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; - - assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - memory_evolution.pointer("/history_readback_encoded_count").and_then(Value::as_u64), - Some(3) - ); - assert_eq!(core_archival.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); - - let jobs = support::array_at(&report, "/jobs")?; - let preference = support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; - let core_attachment = - support::find_by_field(jobs, "/job_id", "core-archival-core-block-attachment-001")?; - let archival_fallback = - support::find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; - - assert_eq!(preference.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - preference.pointer("/evolution/history_readback_encoded").and_then(Value::as_bool), - Some(true) - ); - assert!(support::array_contains_str(preference, "/evolution/history_event_types", "update")?); - assert_eq!(core_attachment.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(archival_fallback.pointer("/status").and_then(Value::as_str), Some("pass")); - - let adapters = support::array_at(&report, "/external_adapters/adapters")?; - let mem0 = 
support::find_by_field(adapters, "/adapter_id", "mem0_openmemory_live_baseline")?; - let letta = support::find_by_field(adapters, "/adapter_id", "letta_research_gate")?; - let mem0_scenarios = support::array_at(mem0, "/scenarios")?; - let mem0_history = - support::find_by_field(mem0_scenarios, "/scenario_id", "preference_correction_history")?; - let mem0_entity = - support::find_by_field(mem0_scenarios, "/scenario_id", "entity_scoped_personalization")?; - - assert_eq!(mem0_history.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(mem0_entity.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(mem0_history.pointer("/comparison_outcome").and_then(Value::as_str), Some("loss")); - assert_eq!(mem0_entity.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); - - let letta_scenarios = support::array_at(letta, "/scenarios")?; - let letta_core = - support::find_by_field(letta_scenarios, "/scenario_id", "core_block_attachment_readback")?; - let letta_fallback = - support::find_by_field(letta_scenarios, "/scenario_id", "archival_fallback_readback")?; - - for scenario in [letta_core, letta_fallback] { - assert_eq!( - scenario.pointer("/suite_id").and_then(Value::as_str), - Some("core_archival_memory") - ); - assert_eq!(scenario.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - scenario.pointer("/comparison_outcome").and_then(Value::as_str), - Some("blocked") - ); - } - - Ok(()) -} - -#[test] -fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { - let report = support::run_json_report_from(support::context_trajectory_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); 
- assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!( - report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), - Some(3) - ); - - let suites = support::array_at(&report, "/suites")?; - let context = support::find_by_field(suites, "/suite_id", "context_trajectory")?; - - assert_eq!(context.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(context.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); - - let jobs = support::array_at(&report, "/jobs")?; - let staged = support::find_by_field( - jobs, - "/job_id", - "context-trajectory-openviking-staged-retrieval-001", - )?; - let hierarchy = support::find_by_field( - jobs, - "/job_id", - "context-trajectory-openviking-hierarchy-selection-001", - )?; - let recursive = support::find_by_field( - jobs, - "/job_id", - "context-trajectory-openviking-recursive-expansion-001", - )?; - - assert_eq!(staged.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(hierarchy.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(recursive.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - staged.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("openviking.stage_artifact_gate") - ); - assert_eq!( - hierarchy.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("openviking.hierarchy_artifact_gate") - ); - assert_eq!( - recursive.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("openviking.recursive_expansion_gate") - ); - - let staged_stages = support::array_at(staged, "/trace_explainability/stages")?; - let staged_gate = - support::find_by_field(staged_stages, "/stage_name", "openviking.stage_artifact_gate")?; - - assert!(support::array_contains_str(staged_gate, 
"/dropped_evidence", "trajectory-win-decoy")?); - - let hierarchy_stages = support::array_at(hierarchy, "/trace_explainability/stages")?; - let hierarchy_gate = support::find_by_field( - hierarchy_stages, - "/stage_name", - "openviking.hierarchy_artifact_gate", - )?; - - assert!(support::array_contains_str( - hierarchy_gate, - "/dropped_evidence", - "hierarchy-design-win-decoy" - )?); - - let recursive_stages = support::array_at(recursive, "/trace_explainability/stages")?; - let recursive_gate = support::find_by_field( - recursive_stages, - "/stage_name", - "openviking.recursive_expansion_gate", - )?; - - assert!(support::array_contains_str( - recursive_gate, - "/dropped_evidence", - "recursive-expansion-win-decoy" - )?); - assert!( - staged.pointer("/reason").and_then(Value::as_str).is_some_and( - |reason| reason.contains("same-corpus output returns expected evidence ids") - ) - ); - - Ok(()) -} +mod core_archival_authority; +mod core_archival_context; +mod core_archival_scoring; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/core_archival_authority.rs b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_authority.rs new file mode 100644 index 00000000..8d7229f3 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_authority.rs @@ -0,0 +1,76 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn memory_authority_benchmark_covers_entity_history_and_core_archive_strengths() -> Result<()> { + let report = support::run_json_report_from(support::real_world_memory_fixture_dir())?; + + assert_eq!( + report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), + Some(4) + ); + + let suites = support::array_at(&report, "/suites")?; + let memory_evolution = support::find_by_field(suites, "/suite_id", "memory_evolution")?; + let core_archival = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; + + 
assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memory_evolution.pointer("/history_readback_encoded_count").and_then(Value::as_u64), + Some(3) + ); + assert_eq!(core_archival.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = support::array_at(&report, "/jobs")?; + let preference = support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + let core_attachment = + support::find_by_field(jobs, "/job_id", "core-archival-core-block-attachment-001")?; + let archival_fallback = + support::find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; + + assert_eq!(preference.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + preference.pointer("/evolution/history_readback_encoded").and_then(Value::as_bool), + Some(true) + ); + assert!(support::array_contains_str(preference, "/evolution/history_event_types", "update")?); + assert_eq!(core_attachment.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(archival_fallback.pointer("/status").and_then(Value::as_str), Some("pass")); + + let adapters = support::array_at(&report, "/external_adapters/adapters")?; + let mem0 = support::find_by_field(adapters, "/adapter_id", "mem0_openmemory_live_baseline")?; + let letta = support::find_by_field(adapters, "/adapter_id", "letta_research_gate")?; + let mem0_scenarios = support::array_at(mem0, "/scenarios")?; + let mem0_history = + support::find_by_field(mem0_scenarios, "/scenario_id", "preference_correction_history")?; + let mem0_entity = + support::find_by_field(mem0_scenarios, "/scenario_id", "entity_scoped_personalization")?; + + assert_eq!(mem0_history.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(mem0_entity.pointer("/status").and_then(Value::as_str), Some("pass")); + 
assert_eq!(mem0_history.pointer("/comparison_outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(mem0_entity.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + + let letta_scenarios = support::array_at(letta, "/scenarios")?; + let letta_core = + support::find_by_field(letta_scenarios, "/scenario_id", "core_block_attachment_readback")?; + let letta_fallback = + support::find_by_field(letta_scenarios, "/scenario_id", "archival_fallback_readback")?; + + for scenario in [letta_core, letta_fallback] { + assert_eq!( + scenario.pointer("/suite_id").and_then(Value::as_str), + Some("core_archival_memory") + ); + assert_eq!(scenario.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + scenario.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs new file mode 100644 index 00000000..b0e4a426 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs @@ -0,0 +1,101 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { + let report = support::run_json_report_from(support::context_trajectory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + 
report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(3) + ); + + let suites = support::array_at(&report, "/suites")?; + let context = support::find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(context.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(context.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = support::array_at(&report, "/jobs")?; + let staged = support::find_by_field( + jobs, + "/job_id", + "context-trajectory-openviking-staged-retrieval-001", + )?; + let hierarchy = support::find_by_field( + jobs, + "/job_id", + "context-trajectory-openviking-hierarchy-selection-001", + )?; + let recursive = support::find_by_field( + jobs, + "/job_id", + "context-trajectory-openviking-recursive-expansion-001", + )?; + + assert_eq!(staged.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(hierarchy.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(recursive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + staged.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("openviking.stage_artifact_gate") + ); + assert_eq!( + hierarchy.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("openviking.hierarchy_artifact_gate") + ); + assert_eq!( + recursive.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("openviking.recursive_expansion_gate") + ); + + let staged_stages = support::array_at(staged, "/trace_explainability/stages")?; + let staged_gate = + support::find_by_field(staged_stages, "/stage_name", "openviking.stage_artifact_gate")?; + + assert!(support::array_contains_str(staged_gate, "/dropped_evidence", "trajectory-win-decoy")?); + + let hierarchy_stages = support::array_at(hierarchy, "/trace_explainability/stages")?; + let hierarchy_gate = support::find_by_field( + hierarchy_stages, + "/stage_name", 
+ "openviking.hierarchy_artifact_gate", + )?; + + assert!(support::array_contains_str( + hierarchy_gate, + "/dropped_evidence", + "hierarchy-design-win-decoy" + )?); + + let recursive_stages = support::array_at(recursive, "/trace_explainability/stages")?; + let recursive_gate = support::find_by_field( + recursive_stages, + "/stage_name", + "openviking.recursive_expansion_gate", + )?; + + assert!(support::array_contains_str( + recursive_gate, + "/dropped_evidence", + "recursive-expansion-win-decoy" + )?); + assert!( + staged.pointer("/reason").and_then(Value::as_str).is_some_and( + |reason| reason.contains("same-corpus output returns expected evidence ids") + ) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/core_archival_scoring.rs b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_scoring.rs new file mode 100644 index 00000000..955f6347 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_scoring.rs @@ -0,0 +1,71 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { + let report = support::run_json_report_from(support::core_archival_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + 
Some(14) + ); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(14)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); + + let suites = support::array_at(&report, "/suites")?; + let core = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core.pointer("/encoded_job_count").and_then(Value::as_u64), Some(6)); + + let jobs = support::array_at(&report, "/jobs")?; + + for job_id in [ + "core-archival-core-block-attachment-001", + "core-archival-core-block-scope-001", + "core-archival-core-block-provenance-001", + "core-archival-stale-core-detection-001", + "core-archival-archival-fallback-001", + "core-archival-project-decision-recovery-001", + ] { + let job = support::find_by_field(jobs, "/job_id", job_id)?; + + assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("core_archival_memory")); + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + } + + let scope = support::find_by_field(jobs, "/job_id", "core-archival-core-block-scope-001")?; + let decision = + support::find_by_field(jobs, "/job_id", "core-archival-project-decision-recovery-001")?; + + assert_eq!(scope.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(scope.pointer("/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert!( + decision + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|content| content.contains("Letta remains blocked or not_tested")) + ); + assert!( + support::array_at(decision, "/produced_evidence")? 
+ .iter() + .any(|id| id.as_str() == Some("decision-letta-export-boundary")) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary.rs b/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary.rs index 6ec85cb5..d166b435 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary.rs @@ -1,3 +1,6 @@ +mod manifest_summary_scenario; +mod manifest_summary_status; + use serde_json::Value; pub(super) fn assert_external_adapter_manifest_summary(report: &Value) { @@ -48,198 +51,6 @@ pub(super) fn assert_external_adapter_manifest_summary(report: &Value) { Some(14) ); - assert_external_adapter_manifest_status_summary(report); - assert_external_adapter_manifest_scenario_summary(report); -} - -fn assert_external_adapter_manifest_status_summary(report: &Value) { - assert_eq!( - report - .pointer("/external_adapters/summary/overall_status_counts/pass") - .and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/overall_status_counts/wrong_result") - .and_then(Value::as_u64), - Some(6) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/overall_status_counts/lifecycle_fail") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/overall_status_counts/incomplete") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/overall_status_counts/blocked") - .and_then(Value::as_u64), - Some(10) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/overall_status_counts/not_encoded") - .and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/capability_status_counts/mocked") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/capability_status_counts/unsupported") - .and_then(Value::as_u64), - 
Some(6) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/suite_status_counts/blocked") - .and_then(Value::as_u64), - Some(29) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/suite_status_counts/pass") - .and_then(Value::as_u64), - Some(27) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/suite_status_counts/incomplete") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/suite_status_counts/not_encoded") - .and_then(Value::as_u64), - Some(37) - ); -} - -fn assert_external_adapter_manifest_scenario_summary(report: &Value) { - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/real") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/mocked") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/unsupported") - .and_then(Value::as_u64), - Some(3) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/blocked") - .and_then(Value::as_u64), - Some(24) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/incomplete") - .and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/wrong_result") - .and_then(Value::as_u64), - Some(6) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/lifecycle_fail") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/pass") - .and_then(Value::as_u64), - Some(23) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") - .and_then(Value::as_u64), - Some(13) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_position_counts/wins") - 
.and_then(Value::as_u64), - Some(10) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_position_counts/ties") - .and_then(Value::as_u64), - Some(11) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_position_counts/loses") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_position_counts/untested") - .and_then(Value::as_u64), - Some(53) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_outcome_counts/win") - .and_then(Value::as_u64), - Some(10) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_outcome_counts/tie") - .and_then(Value::as_u64), - Some(11) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_outcome_counts/loss") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") - .and_then(Value::as_u64), - Some(19) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") - .and_then(Value::as_u64), - Some(29) - ); - assert_eq!( - report - .pointer("/external_adapters/summary/scenario_outcome_counts/non_goal") - .and_then(Value::as_u64), - Some(5) - ); + manifest_summary_status::assert_external_adapter_manifest_status_summary(report); + manifest_summary_scenario::assert_external_adapter_manifest_scenario_summary(report); } diff --git a/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_scenario.rs b/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_scenario.rs new file mode 100644 index 00000000..4aff7169 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_scenario.rs @@ -0,0 +1,118 @@ +use serde_json::Value; + +pub(super) fn assert_external_adapter_manifest_scenario_summary(report: &Value) { + assert_eq!( + report + 
.pointer("/external_adapters/summary/scenario_status_counts/real") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/mocked") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/unsupported") + .and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/blocked") + .and_then(Value::as_u64), + Some(24) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/incomplete") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/lifecycle_fail") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/pass") + .and_then(Value::as_u64), + Some(23) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(13) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/wins") + .and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/ties") + .and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/loses") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_position_counts/untested") + .and_then(Value::as_u64), + Some(53) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/win") + .and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + report + 
.pointer("/external_adapters/summary/scenario_outcome_counts/tie") + .and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/loss") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/not_tested") + .and_then(Value::as_u64), + Some(19) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") + .and_then(Value::as_u64), + Some(29) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/scenario_outcome_counts/non_goal") + .and_then(Value::as_u64), + Some(5) + ); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_status.rs b/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_status.rs new file mode 100644 index 00000000..c2d01772 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/manifest_summary/manifest_summary_status.rs @@ -0,0 +1,76 @@ +use serde_json::Value; + +pub(super) fn assert_external_adapter_manifest_status_summary(report: &Value) { + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/pass") + .and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/wrong_result") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/lifecycle_fail") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/incomplete") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/blocked") + .and_then(Value::as_u64), + Some(10) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/overall_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + 
report + .pointer("/external_adapters/summary/capability_status_counts/mocked") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/capability_status_counts/unsupported") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/blocked") + .and_then(Value::as_u64), + Some(29) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/pass") + .and_then(Value::as_u64), + Some(27) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/incomplete") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/external_adapters/summary/suite_status_counts/not_encoded") + .and_then(Value::as_u64), + Some(37) + ); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering.rs b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering.rs index 825ce9d8..32de0325 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering.rs @@ -1,228 +1,2 @@ -use std::{ - env, fs, - process::{self, Command}, -}; - -use color_eyre::{Result, eyre}; -use serde_json::Value; - -use crate::support; - -#[test] -fn generated_json_report_renders_markdown() -> Result<()> { - let report = support::run_json_report()?; - let temp_dir = env::temp_dir().join(format!("elf-real-world-job-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("report.json"); - let markdown_path = temp_dir.join("report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - 
String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("# Real-World Job Benchmark Report")); - assert!(markdown.contains("work_resume")); - assert!(markdown.contains("Capture And Integration Coverage")); - assert!(markdown.contains("External Adapter Coverage")); - assert!(markdown.contains("live-baseline-only")); - assert!(markdown.contains("live real-world")); - assert!(markdown.contains("does not convert live-baseline retrieval results")); - assert!(markdown.contains("fixture-backed")); - assert!(markdown.contains("Answer Type")); - assert!(markdown.contains("Caveat Required")); - assert!(markdown.contains("Refusal Required")); - assert!(markdown.contains("agentmemory-style hook capture")); - assert!(markdown.contains("xy844-current-worktree")); - assert!(markdown.contains("Existing live-baseline reports remain valid")); - assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=53`")); - assert!(markdown.contains( - "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=19, blocked=29, non_goal=5`" - )); - assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); - assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); - - Ok(()) -} - -#[test] -fn external_adapter_markdown_renders_nonzero_scenario_losses() -> Result<()> { - let mut report = support::run_json_report()?; - let adapters = report - .pointer_mut("/external_adapters/adapters") - .and_then(Value::as_array_mut) - .ok_or_else(|| eyre::eyre!("missing external adapter records"))?; - let adapter = adapters - .iter_mut() - .find(|adapter| { - adapter.pointer("/adapter_id").and_then(Value::as_str) - == Some("agentmemory_live_baseline") - }) - .ok_or_else(|| eyre::eyre!("missing agentmemory adapter"))?; - - support::set_json_pointer(adapter, "/scenarios/0/elf_position", 
serde_json::json!("loses"))?; - support::set_json_pointer( - adapter, - "/scenarios/0/comparison_outcome", - serde_json::json!("loss"), - )?; - support::set_json_pointer( - &mut report, - "/external_adapters/summary/scenario_position_counts", - serde_json::json!({ - "wins": 2, - "ties": 4, - "loses": 2, - "untested": 10 - }), - )?; - support::set_json_pointer( - &mut report, - "/external_adapters/summary/scenario_outcome_counts", - serde_json::json!({ - "win": 2, - "tie": 4, - "loss": 2, - "not_tested": 7, - "blocked": 1, - "non_goal": 2 - }), - )?; - - let temp_dir = - env::temp_dir().join(format!("elf-real-world-loss-scenario-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("report.json"); - let markdown_path = temp_dir.join("report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("ELF scenario positions: `wins=2, ties=4, loses=2, untested=10`")); - assert!(markdown.contains( - "Scenario comparison outcomes: `win=2, tie=4, loss=2, not_tested=7, blocked=1, non_goal=2`" - )); - assert!(markdown.contains( - "| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `loss` |" - )); - - Ok(()) -} - -#[test] -fn external_adapter_markdown_omits_scenario_summary_when_manifest_has_no_scenarios() -> Result<()> { - let mut report = support::run_json_report()?; - let adapters = report - .pointer_mut("/external_adapters/adapters") - .and_then(Value::as_array_mut) - .ok_or_else(|| eyre::eyre!("missing external adapter records"))?; - - for adapter in adapters { - 
support::set_json_pointer(adapter, "/scenarios", serde_json::json!([]))?; - } - - support::set_json_pointer( - &mut report, - "/external_adapters/summary/scenario_status_counts", - serde_json::json!({ - "real": 0, - "mocked": 0, - "unsupported": 0, - "blocked": 0, - "incomplete": 0, - "wrong_result": 0, - "lifecycle_fail": 0, - "pass": 0, - "not_encoded": 0 - }), - )?; - support::set_json_pointer( - &mut report, - "/external_adapters/summary/scenario_position_counts", - serde_json::json!({ - "wins": 0, - "ties": 0, - "loses": 0, - "untested": 0 - }), - )?; - support::set_json_pointer( - &mut report, - "/external_adapters/summary/scenario_outcome_counts", - serde_json::json!({ - "win": 0, - "tie": 0, - "loss": 0, - "not_tested": 0, - "blocked": 0, - "non_goal": 0 - }), - )?; - - let temp_dir = - env::temp_dir().join(format!("elf-real-world-no-scenario-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("report.json"); - let markdown_path = temp_dir.join("report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("External Adapter Coverage")); - assert!(!markdown.contains("Scenario coverage statuses:")); - assert!(!markdown.contains("ELF scenario positions:")); - assert!(!markdown.contains("Scenario comparison outcomes:")); - assert!(!markdown.contains("### Adapter Scenario Judgments")); - - Ok(()) -} +mod markdown_rendering_external_adapters; +mod markdown_rendering_generated; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_external_adapters.rs 
b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_external_adapters.rs new file mode 100644 index 00000000..1f098547 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_external_adapters.rs @@ -0,0 +1,175 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn external_adapter_markdown_renders_nonzero_scenario_losses() -> Result<()> { + let mut report = support::run_json_report()?; + let adapters = report + .pointer_mut("/external_adapters/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing external adapter records"))?; + let adapter = adapters + .iter_mut() + .find(|adapter| { + adapter.pointer("/adapter_id").and_then(Value::as_str) + == Some("agentmemory_live_baseline") + }) + .ok_or_else(|| eyre::eyre!("missing agentmemory adapter"))?; + + support::set_json_pointer(adapter, "/scenarios/0/elf_position", serde_json::json!("loses"))?; + support::set_json_pointer( + adapter, + "/scenarios/0/comparison_outcome", + serde_json::json!("loss"), + )?; + support::set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_position_counts", + serde_json::json!({ + "wins": 2, + "ties": 4, + "loses": 2, + "untested": 10 + }), + )?; + support::set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_outcome_counts", + serde_json::json!({ + "win": 2, + "tie": 4, + "loss": 2, + "not_tested": 7, + "blocked": 1, + "non_goal": 2 + }), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-loss-scenario-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + 
.arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("ELF scenario positions: `wins=2, ties=4, loses=2, untested=10`")); + assert!(markdown.contains( + "Scenario comparison outcomes: `win=2, tie=4, loss=2, not_tested=7, blocked=1, non_goal=2`" + )); + assert!(markdown.contains( + "| `agentmemory_live_baseline` | `basic_same_corpus_retrieval` | `retrieval` | `pass` | `loss` |" + )); + + Ok(()) +} + +#[test] +fn external_adapter_markdown_omits_scenario_summary_when_manifest_has_no_scenarios() -> Result<()> { + let mut report = support::run_json_report()?; + let adapters = report + .pointer_mut("/external_adapters/adapters") + .and_then(Value::as_array_mut) + .ok_or_else(|| eyre::eyre!("missing external adapter records"))?; + + for adapter in adapters { + support::set_json_pointer(adapter, "/scenarios", serde_json::json!([]))?; + } + + support::set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_status_counts", + serde_json::json!({ + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 0, + "incomplete": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "pass": 0, + "not_encoded": 0 + }), + )?; + support::set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_position_counts", + serde_json::json!({ + "wins": 0, + "ties": 0, + "loses": 0, + "untested": 0 + }), + )?; + support::set_json_pointer( + &mut report, + "/external_adapters/summary/scenario_outcome_counts", + serde_json::json!({ + "win": 0, + "tie": 0, + "loss": 0, + "not_tested": 0, + "blocked": 0, + "non_goal": 0 + }), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-no-scenario-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = 
temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("External Adapter Coverage")); + assert!(!markdown.contains("Scenario coverage statuses:")); + assert!(!markdown.contains("ELF scenario positions:")); + assert!(!markdown.contains("Scenario comparison outcomes:")); + assert!(!markdown.contains("### Adapter Scenario Judgments")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs new file mode 100644 index 00000000..f5a395c8 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs @@ -0,0 +1,61 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn generated_json_report_renders_markdown() -> Result<()> { + let report = support::run_json_report()?; + let temp_dir = env::temp_dir().join(format!("elf-real-world-job-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("report.json"); + let markdown_path = temp_dir.join("report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = 
fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("# Real-World Job Benchmark Report")); + assert!(markdown.contains("work_resume")); + assert!(markdown.contains("Capture And Integration Coverage")); + assert!(markdown.contains("External Adapter Coverage")); + assert!(markdown.contains("live-baseline-only")); + assert!(markdown.contains("live real-world")); + assert!(markdown.contains("does not convert live-baseline retrieval results")); + assert!(markdown.contains("fixture-backed")); + assert!(markdown.contains("Answer Type")); + assert!(markdown.contains("Caveat Required")); + assert!(markdown.contains("Refusal Required")); + assert!(markdown.contains("agentmemory-style hook capture")); + assert!(markdown.contains("xy844-current-worktree")); + assert!(markdown.contains("Existing live-baseline reports remain valid")); + assert!(markdown.contains("### Adapter Scenario Judgments")); + assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=53`")); + assert!(markdown.contains( + "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=19, blocked=29, non_goal=5`" + )); + assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); + assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution.rs index 86f83564..dbf35dc2 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution.rs @@ -1,215 +1,2 @@ -use std::{env, fs, process}; - -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -#[test] -fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<()> { - let report = support::run_json_report_from(support::evolution_fixture_dir())?; - - 
assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/evolution/history_readback_encoded_count").and_then(Value::as_u64), - Some(1) - ); - - let suites = support::array_at(&report, "/suites")?; - let memory_evolution = support::find_by_field(suites, "/suite_id", "memory_evolution")?; - - assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - memory_evolution.pointer("/history_readback_encoded_count").and_then(Value::as_u64), - Some(1) - ); - - let jobs = support::array_at(&report, "/jobs")?; - let preference_job = - support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; - let relation_job = - support::find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; - - assert_eq!( - preference_job.pointer("/evolution/history_readback_encoded").and_then(Value::as_bool), - Some(true) - ); - 
assert!(support::array_contains_str(preference_job, "/evolution/history_event_types", "add")?); - assert!(support::array_contains_str( - preference_job, - "/evolution/history_event_types", - "update" - )?); - assert!(support::array_contains_str( - preference_job, - "/evolution/history_event_types", - "ignore" - )?); - assert_eq!( - preference_job - .pointer("/evolution/history_requires_note_version_links") - .and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - preference_job.pointer("/evolution/selected_current_evidence/0").and_then(Value::as_str), - Some("pref-current-concise-rationale") - ); - assert_eq!( - preference_job.pointer("/evolution/selected_historical_evidence/0").and_then(Value::as_str), - Some("pref-old-terse-bullets") - ); - assert_eq!( - preference_job.pointer("/evolution/selected_rationale_evidence/0").and_then(Value::as_str), - Some("pref-update-rationale") - ); - assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - relation_job.pointer("/evolution/temporal_validity_encoded").and_then(Value::as_bool), - Some(true) - ); - - let follow_ups = support::array_at(&report, "/follow_ups")?; - - assert!(follow_ups.is_empty()); - - Ok(()) -} - -#[test] -fn memory_evolution_conflict_still_fails_when_selected_evidence_is_not_narrated() -> Result<()> { - let fixture_path = - support::evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); - let mut fixture = serde_json::from_str::<Value>(&fs::read_to_string(fixture_path)?)?; - - support::set_json_pointer( - &mut fixture, - "/corpus/adapter_response/answer/evidence_ids", - serde_json::json!([ - "pref-current-concise-rationale", - "pref-old-terse-bullets", - "pref-update-rationale" - ]), - )?; - support::set_json_pointer( - &mut fixture, - "/corpus/adapter_response/answer/claims", - serde_json::json!([ - { - "claim_id":
"current_preference", - "text": "Use concise prose with explicit evidence before bullets.", - "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], - "confidence": "high" - }, - { - "claim_id": "preference_update_rationale", - "text": "The preference changed because terse bullets hid rationale.", - "evidence_ids": ["pref-update-rationale"], - "confidence": "high" - } - ]), - )?; - - let temp_dir = - env::temp_dir().join(format!("elf-real-world-memory-conflict-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("conflict.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(job.pointer("/evolution/conflict_detection_count").and_then(Value::as_u64), Some(0)); - assert!(support::array_contains_str( - job, - "/evolution/selected_but_not_narrated_evidence", - "pref-old-terse-bullets" - )?); - - Ok(()) -} - -#[test] -fn memory_evolution_counts_stale_answer_when_old_fact_is_answered_as_current() -> Result<()> { - let fixture_path = - support::evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); - let mut fixture = serde_json::from_str::<Value>(&fs::read_to_string(fixture_path)?)?; - - support::set_json_pointer( - &mut fixture, - "/corpus/adapter_response/answer/content", - Value::String( - "Use terse bullet-only benchmark updates as the current preference.".to_string(), - ), - )?; - support::set_json_pointer( - &mut fixture, - "/corpus/adapter_response/answer/evidence_ids", - serde_json::json!(["pref-old-terse-bullets"]), - )?; - support::set_json_pointer( - &mut fixture, - "/corpus/adapter_response/answer/claims", - serde_json::json!([ - { - "claim_id": "current_preference", - "text": "Use terse bullet-only
benchmark updates as the current preference.", - "evidence_ids": ["pref-old-terse-bullets"], - "confidence": "high" - } - ]), - )?; - - let temp_dir = - env::temp_dir().join(format!("elf-real-world-memory-stale-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("stale_preference.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - - assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(job.pointer("/evolution/stale_answer_count").and_then(Value::as_u64), Some(1)); - - Ok(()) -} +mod memory_evolution_mutations; +mod memory_evolution_scoring; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_mutations.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_mutations.rs new file mode 100644 index 00000000..c1b778d0 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_mutations.rs @@ -0,0 +1,111 @@ +use std::{env, fs, process}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn memory_evolution_conflict_still_fails_when_selected_evidence_is_not_narrated() -> Result<()> { + let fixture_path = + support::evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); + let mut fixture = serde_json::from_str::<Value>(&fs::read_to_string(fixture_path)?)?; + + support::set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!([ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ]), + )?; + support::set_json_pointer( + &mut
fixture, + "/corpus/adapter_response/answer/claims", + serde_json::json!([ + { + "claim_id": "current_preference", + "text": "Use concise prose with explicit evidence before bullets.", + "evidence_ids": ["pref-current-concise-rationale", "pref-update-rationale"], + "confidence": "high" + }, + { + "claim_id": "preference_update_rationale", + "text": "The preference changed because terse bullets hid rationale.", + "evidence_ids": ["pref-update-rationale"], + "confidence": "high" + } + ]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-conflict-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("conflict.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(job.pointer("/evolution/conflict_detection_count").and_then(Value::as_u64), Some(0)); + assert!(support::array_contains_str( + job, + "/evolution/selected_but_not_narrated_evidence", + "pref-old-terse-bullets" + )?); + + Ok(()) +} +#[test] +fn memory_evolution_counts_stale_answer_when_old_fact_is_answered_as_current() -> Result<()> { + let fixture_path = + support::evolution_fixture_dir().join("preference_changed_current_vs_historical.json"); + let mut fixture = serde_json::from_str::<Value>(&fs::read_to_string(fixture_path)?)?; + + support::set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/content", + Value::String( + "Use terse bullet-only benchmark updates as the current preference.".to_string(), + ), + )?; + support::set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["pref-old-terse-bullets"]), + )?; + support::set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/claims", +
serde_json::json!([ + { + "claim_id": "current_preference", + "text": "Use terse bullet-only benchmark updates as the current preference.", + "evidence_ids": ["pref-old-terse-bullets"], + "confidence": "high" + } + ]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-stale-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_preference.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(job.pointer("/evolution/stale_answer_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_scoring.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_scoring.rs new file mode 100644 index 00000000..3d8510a4 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/memory_evolution_scoring.rs @@ -0,0 +1,107 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn memory_evolution_fixtures_report_temporal_and_staleness_metrics() -> Result<()> { + let report = support::run_json_report_from(support::evolution_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + 
assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/evolution/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/evolution/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = support::array_at(&report, "/suites")?; + let memory_evolution = support::find_by_field(suites, "/suite_id", "memory_evolution")?; + + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memory_evolution.pointer("/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + memory_evolution.pointer("/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = support::array_at(&report, "/jobs")?; + let preference_job = + support::find_by_field(jobs, "/job_id", "memory-evolution-preference-001")?; + let relation_job = + support::find_by_field(jobs, "/job_id", "memory-evolution-relation-temporal-001")?; + + assert_eq!( + preference_job.pointer("/evolution/history_readback_encoded").and_then(Value::as_bool), + Some(true) + ); + assert!(support::array_contains_str(preference_job, "/evolution/history_event_types", "add")?); + assert!(support::array_contains_str( + preference_job, + "/evolution/history_event_types", + "update" + )?); + assert!(support::array_contains_str( + preference_job, + "/evolution/history_event_types", + "ignore" + )?); + assert_eq!( + preference_job 
+ .pointer("/evolution/history_requires_note_version_links") + .and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + preference_job.pointer("/evolution/selected_current_evidence/0").and_then(Value::as_str), + Some("pref-current-concise-rationale") + ); + assert_eq!( + preference_job.pointer("/evolution/selected_historical_evidence/0").and_then(Value::as_str), + Some("pref-old-terse-bullets") + ); + assert_eq!( + preference_job.pointer("/evolution/selected_rationale_evidence/0").and_then(Value::as_str), + Some("pref-update-rationale") + ); + assert_eq!(relation_job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + relation_job.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + relation_job.pointer("/evolution/temporal_validity_encoded").and_then(Value::as_bool), + Some(true) + ); + + let follow_ups = support::array_at(&report, "/follow_ups")?; + + assert!(follow_ups.is_empty()); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_summary.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary.rs index 42a1efc1..55df428f 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/memory_summary.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary.rs @@ -1,275 +1,3 @@ -use std::{ - env, fs, - process::{self, Command}, -}; - -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -#[test] -fn memory_summary_fixtures_score_reviewable_source_trace_contract() -> Result<()> { - let report = support::run_json_report_from(support::memory_summary_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - 
assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/memory_summary/summary_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/memory_summary/entry_count").and_then(Value::as_u64), - Some(7) - ); - assert_eq!( - report - .pointer("/summary/memory_summary/covered_required_category_count") - .and_then(Value::as_u64), - Some(6) - ); - assert_eq!( - report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/memory_summary/freshness_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/memory_summary/rationale_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/memory_summary/unsupported_derived_entry_count") - .and_then(Value::as_u64), - Some(1) - ); - - let suites = support::array_at(&report, "/suites")?; - let memory_summary = support::find_by_field(suites, "/suite_id", "memory_summary")?; - - assert_eq!(memory_summary.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(memory_summary.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); - - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(job.pointer("/memory_summary/top_of_mind_count").and_then(Value::as_u64), Some(1)); - assert_eq!(job.pointer("/memory_summary/tombstone_ref_count").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn memory_summary_markdown_renders_source_trace_metrics() -> Result<()> { - let report = support::run_json_report_from(support::memory_summary_fixture_dir())?; - 
let temp_dir = - env::temp_dir().join(format!("elf-real-world-memory-summary-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("memory-summary-report.json"); - let markdown_path = temp_dir.join("memory-summary-report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("Memory Summary Metrics")); - assert!(markdown.contains("memory-summary-source-trace-001")); - assert!(markdown.contains("Memory summary source-ref coverage")); - assert!(markdown.contains("Invalid Top-of-Mind")); - assert!(markdown.contains("Derived Unsupported")); - - Ok(()) -} - -#[test] -fn memory_summary_fixture_fails_stale_top_of_mind_entries() -> Result<()> { - let fixture_path = - support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["category"] = - Value::String("top_of_mind".to_string()); - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["freshness"] - ["status"] = Value::String("current".to_string()); - - let temp_dir = - env::temp_dir().join(format!("elf-memory-summary-stale-current-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("stale_current_summary.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", 
"memory-summary-source-trace-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn memory_summary_fixture_fails_tombstoned_top_of_mind_entries() -> Result<()> { - let fixture_path = - support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["category"] = - Value::String("top_of_mind".to_string()); - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] - ["status"] = Value::String("current".to_string()); - - let temp_dir = env::temp_dir() - .join(format!("elf-memory-summary-tombstone-current-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write( - temp_dir.join("tombstone_current_summary.json"), - serde_json::to_vec_pretty(&fixture)?, - )?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn memory_summary_fixture_fails_untraced_derived_profile_entries() -> Result<()> { - let fixture_path = - support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); - let mut fixture = support::load_json(&fixture_path)?; - - 
fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["unsupported_claim_flags"] = - Value::Array(Vec::new()); - - let temp_dir = - env::temp_dir().join(format!("elf-memory-summary-untraced-derived-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write( - temp_dir.join("untraced_derived_summary.json"), - serde_json::to_vec_pretty(&fixture)?, - )?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); - assert_eq!( - job.pointer("/memory_summary/derived_missing_source_or_unsupported_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn memory_summary_fixture_fails_unsupported_current_derived_entries() -> Result<()> { - let fixture_path = - support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["source_refs"] = - Value::Array(vec![Value::String("summary-contract-non-parity-boundary".to_string())]); - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["freshness"] - ["status"] = Value::String("current".to_string()); - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["rationale"] - ["decision"] = Value::String("included".to_string()); - - let temp_dir = env::temp_dir() - .join(format!("elf-memory-summary-unsupported-current-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write( - temp_dir.join("unsupported_current_summary.json"), - serde_json::to_vec_pretty(&fixture)?, - )?; - - let report = 
support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/memory_summary/unsupported_current_entry_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn memory_summary_fixture_fails_tombstone_entries_without_tombstone_refs() -> Result<()> { - let fixture_path = - support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] - ["tombstone_refs"] = Value::Array(Vec::new()); - - let temp_dir = - env::temp_dir().join(format!("elf-memory-summary-tombstone-refs-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write( - temp_dir.join("missing_tombstone_refs_summary.json"), - serde_json::to_vec_pretty(&fixture)?, - )?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/memory_summary/freshness_coverage").and_then(Value::as_f64), - Some(0.857) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} +mod memory_summary_failures; +mod memory_summary_markdown; +mod memory_summary_scoring; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_failures.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_failures.rs new file mode 100644 index 00000000..c205d08f --- /dev/null +++ 
b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_failures.rs @@ -0,0 +1,168 @@ +use std::{env, fs, process}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn memory_summary_fixture_fails_stale_top_of_mind_entries() -> Result<()> { + let fixture_path = + support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["category"] = + Value::String("top_of_mind".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][2]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-stale-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_current_summary.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn memory_summary_fixture_fails_tombstoned_top_of_mind_entries() -> Result<()> { + let fixture_path = + support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["category"] = + Value::String("top_of_mind".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] + 
["status"] = Value::String("current".to_string()); + + let temp_dir = env::temp_dir() + .join(format!("elf-memory-summary-tombstone-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("tombstone_current_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn memory_summary_fixture_fails_untraced_derived_profile_entries() -> Result<()> { + let fixture_path = + support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["unsupported_claim_flags"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-memory-summary-untraced-derived-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("untraced_derived_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/memory_summary/derived_missing_source_or_unsupported_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + 
Ok(()) +} +#[test] +fn memory_summary_fixture_fails_unsupported_current_derived_entries() -> Result<()> { + let fixture_path = + support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["source_refs"] = + Value::Array(vec![Value::String("summary-contract-non-parity-boundary".to_string())]); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["freshness"] + ["status"] = Value::String("current".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][6]["rationale"] + ["decision"] = Value::String("included".to_string()); + + let temp_dir = env::temp_dir() + .join(format!("elf-memory-summary-unsupported-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("unsupported_current_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/unsupported_current_entry_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn memory_summary_fixture_fails_tombstone_entries_without_tombstone_refs() -> Result<()> { + let fixture_path = + support::memory_summary_fixture_dir().join("reviewable_summary_source_trace.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["memory_summaries"][0]["entries"][4]["freshness"] + ["tombstone_refs"] = Value::Array(Vec::new()); + + let temp_dir = + 
env::temp_dir().join(format!("elf-memory-summary-tombstone-refs-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write( + temp_dir.join("missing_tombstone_refs_summary.json"), + serde_json::to_vec_pretty(&fixture)?, + )?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/memory_summary/freshness_coverage").and_then(Value::as_f64), + Some(0.857) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_markdown.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_markdown.rs new file mode 100644 index 00000000..7001de00 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_markdown.rs @@ -0,0 +1,46 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn memory_summary_markdown_renders_source_trace_metrics() -> Result<()> { + let report = support::run_json_report_from(support::memory_summary_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-memory-summary-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("memory-summary-report.json"); + let markdown_path = temp_dir.join("memory-summary-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let 
markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Memory Summary Metrics")); + assert!(markdown.contains("memory-summary-source-trace-001")); + assert!(markdown.contains("Memory summary source-ref coverage")); + assert!(markdown.contains("Invalid Top-of-Mind")); + assert!(markdown.contains("Derived Unsupported")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_scoring.rs b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_scoring.rs new file mode 100644 index 00000000..d9e2759d --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/memory_summary_scoring.rs @@ -0,0 +1,66 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn memory_summary_fixtures_score_reviewable_source_trace_contract() -> Result<()> { + let report = support::run_json_report_from(support::memory_summary_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/memory_summary/summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/entry_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/summary/memory_summary/covered_required_category_count") + .and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + 
report.pointer("/summary/memory_summary/rationale_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/memory_summary/unsupported_derived_entry_count") + .and_then(Value::as_u64), + Some(1) + ); + + let suites = support::array_at(&report, "/suites")?; + let memory_summary = support::find_by_field(suites, "/suite_id", "memory_summary")?; + + assert_eq!(memory_summary.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memory_summary.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); + + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "memory-summary-source-trace-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(job.pointer("/memory_summary/top_of_mind_count").and_then(Value::as_u64), Some(1)); + assert_eq!(job.pointer("/memory_summary/tombstone_ref_count").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief.rs b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief.rs index 2e434fb7..72d726fb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief.rs @@ -1,213 +1,3 @@ -use std::{ - env, fs, - process::{self, Command}, -}; - -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -#[test] -fn proactive_brief_fixtures_score_source_linked_suggestions() -> Result<()> { - let report = support::run_json_report_from(support::proactive_brief_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), 
Some(1)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/proactive_brief/brief_count").and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/proactive_brief/action_rationale_coverage") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/proactive_brief/invalid_current_suggestion_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/proactive_brief/tombstone_violation_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/rejected_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/deferred_count").and_then(Value::as_u64), - Some(2) - ); - - let suites = support::array_at(&report, "/suites")?; - let proactive = support::find_by_field(suites, "/suite_id", "proactive_brief")?; - - assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); - - let jobs = support::array_at(&report, "/jobs")?; - let daily = support::find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; - let private = - support::find_by_field(jobs, "/job_id", "proactive-private-corpus-refresh-blocked-001")?; - - assert_eq!(daily.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - 
daily.pointer("/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert!( - report - .pointer("/follow_ups/0/title") - .and_then(Value::as_str) - .is_some_and(|title| title.contains("XY-930")) - ); - - Ok(()) -} - -#[test] -fn proactive_brief_markdown_renders_source_and_freshness_metrics() -> Result<()> { - let report = support::run_json_report_from(support::proactive_brief_fixture_dir())?; - let temp_dir = - env::temp_dir().join(format!("elf-real-world-proactive-brief-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("proactive-brief-report.json"); - let markdown_path = temp_dir.join("proactive-brief-report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("Proactive Brief Metrics")); - assert!(markdown.contains("proactive-daily-project-brief-001")); - assert!(markdown.contains("Proactive evidence-ref coverage")); - assert!(markdown.contains("Invalid Current")); - assert!(markdown.contains("Tombstone Violations")); - - Ok(()) -} - -#[test] -fn proactive_brief_fixture_fails_unsupported_suggestions() -> Result<()> { - let fixture_path = support::proactive_brief_fixture_dir().join("daily_project_brief.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["evidence_refs"] = - Value::Array(Vec::new()); - - let temp_dir = - 
env::temp_dir().join(format!("elf-proactive-unsupported-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("unsupported_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); - assert_eq!( - job.pointer("/proactive_brief/untraced_suggestion_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn proactive_brief_fixture_fails_stale_decisions_presented_current() -> Result<()> { - let fixture_path = support::proactive_brief_fixture_dir().join("stale_decision_audit.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] - ["status"] = Value::String("current".to_string()); - - let temp_dir = - env::temp_dir().join(format!("elf-proactive-stale-current-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("stale_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "proactive-stale-decision-audit-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/proactive_brief/invalid_current_suggestion_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn proactive_brief_fixture_fails_tombstone_ttl_violations() -> Result<()> { - let fixture_path = - 
support::proactive_brief_fixture_dir().join("stale_plan_preference_warning.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] - ["status"] = Value::String("current".to_string()); - fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["action"] - ["decision"] = Value::String("recommend".to_string()); - - let temp_dir = env::temp_dir().join(format!("elf-proactive-tombstone-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("tombstone_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = - support::find_by_field(jobs, "/job_id", "proactive-stale-plan-preference-warning-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/proactive_brief/tombstone_violation_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} +mod proactive_brief_failures; +mod proactive_brief_markdown; +mod proactive_brief_scoring; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_failures.rs b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_failures.rs new file mode 100644 index 00000000..a2eaebb9 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_failures.rs @@ -0,0 +1,91 @@ +use std::{env, fs, process}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn proactive_brief_fixture_fails_unsupported_suggestions() -> Result<()> { + let fixture_path = support::proactive_brief_fixture_dir().join("daily_project_brief.json"); + let mut fixture = support::load_json(&fixture_path)?; + + 
fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["evidence_refs"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-proactive-unsupported-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("unsupported_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/proactive_brief/untraced_suggestion_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn proactive_brief_fixture_fails_stale_decisions_presented_current() -> Result<()> { + let fixture_path = support::proactive_brief_fixture_dir().join("stale_decision_audit.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-proactive-stale-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stale_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "proactive-stale-decision-audit-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/proactive_brief/invalid_current_suggestion_count").and_then(Value::as_u64), + Some(1) + ); + 
assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn proactive_brief_fixture_fails_tombstone_ttl_violations() -> Result<()> { + let fixture_path = + support::proactive_brief_fixture_dir().join("stale_plan_preference_warning.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + fixture["corpus"]["adapter_response"]["answer"]["proactive_briefs"][0]["suggestions"][0]["action"] + ["decision"] = Value::String("recommend".to_string()); + + let temp_dir = env::temp_dir().join(format!("elf-proactive-tombstone-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("tombstone_current_brief.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = + support::find_by_field(jobs, "/job_id", "proactive-stale-plan-preference-warning-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/proactive_brief/tombstone_violation_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_markdown.rs b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_markdown.rs new file mode 100644 index 00000000..a84014af --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_markdown.rs @@ -0,0 +1,46 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn proactive_brief_markdown_renders_source_and_freshness_metrics() -> Result<()> { + let report = 
support::run_json_report_from(support::proactive_brief_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-proactive-brief-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("proactive-brief-report.json"); + let markdown_path = temp_dir.join("proactive-brief-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Proactive Brief Metrics")); + assert!(markdown.contains("proactive-daily-project-brief-001")); + assert!(markdown.contains("Proactive evidence-ref coverage")); + assert!(markdown.contains("Invalid Current")); + assert!(markdown.contains("Tombstone Violations")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_scoring.rs b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_scoring.rs new file mode 100644 index 00000000..318eb1b1 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/proactive_brief_scoring.rs @@ -0,0 +1,83 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn proactive_brief_fixtures_score_source_linked_suggestions() -> Result<()> { + let report = support::run_json_report_from(support::proactive_brief_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + 
assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/proactive_brief/brief_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/invalid_current_suggestion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/rejected_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/deferred_count").and_then(Value::as_u64), + Some(2) + ); + + let suites = support::array_at(&report, "/suites")?; + let proactive = support::find_by_field(suites, "/suite_id", "proactive_brief")?; + + assert_eq!(proactive.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(proactive.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let jobs = support::array_at(&report, "/jobs")?; + let daily = support::find_by_field(jobs, "/job_id", "proactive-daily-project-brief-001")?; + let private = + support::find_by_field(jobs, "/job_id", "proactive-private-corpus-refresh-blocked-001")?; + + assert_eq!(daily.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + daily.pointer("/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + 
assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + report + .pointer("/follow_ups/0/title") + .and_then(Value::as_str) + .is_some_and(|title| title.contains("XY-930")) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory.rs b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory.rs index 3095f487..93653fa6 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory.rs @@ -1,261 +1,3 @@ -use std::{ - env, fs, - process::{self, Command}, -}; - -use color_eyre::{Result, eyre}; -use serde_json::Value; - -use crate::support; - -#[test] -fn scheduled_memory_fixtures_score_task_trace_gate() -> Result<()> { - let report = support::run_json_report_from(support::scheduled_memory_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/scheduled_memory/action_rationale_coverage") - 
.and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/scheduled_memory/invalid_current_output_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/scheduled_memory/tombstone_violation_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/source_mutation_count").and_then(Value::as_u64), - Some(0) - ); - - let suites = support::array_at(&report, "/suites")?; - let scheduled = support::find_by_field(suites, "/suite_id", "scheduled_memory")?; - - assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); - - let jobs = support::array_at(&report, "/jobs")?; - let weekly = - support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; - let private = support::find_by_field( - jobs, - "/job_id", - "scheduled-private-provider-scheduler-blocked-001", - )?; - - assert_eq!(weekly.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - weekly.pointer("/scheduled_memory/trace_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert!( - report - .pointer("/follow_ups/0/title") - .and_then(Value::as_str) - .is_some_and(|title| title.contains("XY-930")) - ); - - Ok(()) -} - -#[test] -fn scheduled_memory_markdown_renders_trace_metrics() -> Result<()> { - let report = support::run_json_report_from(support::scheduled_memory_fixture_dir())?; - let temp_dir = - env::temp_dir().join(format!("elf-real-world-scheduled-memory-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("scheduled-memory-report.json"); - let markdown_path = 
temp_dir.join("scheduled-memory-report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("Scheduled Memory Metrics")); - assert!(markdown.contains("scheduled-weekly-project-status-summary-001")); - assert!(markdown.contains("Scheduled memory evidence-ref coverage")); - assert!(markdown.contains("Trace Coverage")); - assert!(markdown.contains("Source Mutations")); - - Ok(()) -} - -#[test] -fn scheduled_memory_fixture_fails_missing_execution_trace() -> Result<()> { - let fixture_path = - support::scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0] - .as_object_mut() - .ok_or_else(|| eyre::eyre!("missing scheduled task object"))? 
- .remove("execution_trace"); - - let temp_dir = - env::temp_dir().join(format!("elf-scheduled-missing-trace-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("missing_trace.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = - support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/scheduled_memory/trace_complete_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn scheduled_memory_fixture_fails_untraced_outputs() -> Result<()> { - let fixture_path = - support::scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = - Value::Array(Vec::new()); - - let temp_dir = - env::temp_dir().join(format!("elf-scheduled-untraced-output-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("untraced_output.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = - support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); - assert_eq!( - job.pointer("/scheduled_memory/untraced_output_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn scheduled_memory_fixture_fails_superseded_sources_presented_current() -> 
Result<()> { - let fixture_path = support::scheduled_memory_fixture_dir().join("stale_decision_audit.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = - serde_json::json!(["scheduled-old-consolidation-only-decision"]); - fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["freshness"] - ["status"] = Value::String("current".to_string()); - - let temp_dir = - env::temp_dir().join(format!("elf-scheduled-superseded-current-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("superseded_current.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "scheduled-stale-decision-audit-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer("/scheduled_memory/invalid_current_output_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn scheduled_memory_fixture_fails_source_mutation() -> Result<()> { - let fixture_path = - support::scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); - let mut fixture = support::load_json(&fixture_path)?; - - fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["source_mutations"] = serde_json::json!([ - { - "table": "memory_notes", - "op": "update", - "note_id": "scheduled-weekly-current-gate" - } - ]); - - let temp_dir = - env::temp_dir().join(format!("elf-scheduled-source-mutation-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("source_mutation.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let report = support::run_json_report_from(temp_dir)?; - 
let jobs = support::array_at(&report, "/jobs")?; - let job = - support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; - - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("lifecycle_fail")); - assert_eq!( - job.pointer("/scheduled_memory/source_mutation_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(report.pointer("/summary/lifecycle_fail").and_then(Value::as_u64), Some(1)); - - Ok(()) -} +mod scheduled_memory_failures; +mod scheduled_memory_markdown; +mod scheduled_memory_scoring; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_failures.rs b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_failures.rs new file mode 100644 index 00000000..815552f6 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_failures.rs @@ -0,0 +1,130 @@ +use std::{env, fs, process}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn scheduled_memory_fixture_fails_missing_execution_trace() -> Result<()> { + let fixture_path = + support::scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0] + .as_object_mut() + .ok_or_else(|| eyre::eyre!("missing scheduled task object"))? 
+ .remove("execution_trace"); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-missing-trace-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("missing_trace.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = + support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/scheduled_memory/trace_complete_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn scheduled_memory_fixture_fails_untraced_outputs() -> Result<()> { + let fixture_path = + support::scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = + Value::Array(Vec::new()); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-untraced-output-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("untraced_output.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = + support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!( + job.pointer("/scheduled_memory/untraced_output_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn scheduled_memory_fixture_fails_superseded_sources_presented_current() -> 
Result<()> { + let fixture_path = support::scheduled_memory_fixture_dir().join("stale_decision_audit.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["evidence_refs"] = + serde_json::json!(["scheduled-old-consolidation-only-decision"]); + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["outputs"][0]["freshness"] + ["status"] = Value::String("current".to_string()); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-superseded-current-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("superseded_current.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "scheduled-stale-decision-audit-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/scheduled_memory/invalid_current_output_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + + Ok(()) +} +#[test] +fn scheduled_memory_fixture_fails_source_mutation() -> Result<()> { + let fixture_path = + support::scheduled_memory_fixture_dir().join("weekly_project_status_summary.json"); + let mut fixture = support::load_json(&fixture_path)?; + + fixture["corpus"]["adapter_response"]["answer"]["scheduled_tasks"][0]["source_mutations"] = serde_json::json!([ + { + "table": "memory_notes", + "op": "update", + "note_id": "scheduled-weekly-current-gate" + } + ]); + + let temp_dir = + env::temp_dir().join(format!("elf-scheduled-source-mutation-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("source_mutation.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = support::run_json_report_from(temp_dir)?; + 
let jobs = support::array_at(&report, "/jobs")?; + let job = + support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("lifecycle_fail")); + assert_eq!( + job.pointer("/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(report.pointer("/summary/lifecycle_fail").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_markdown.rs b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_markdown.rs new file mode 100644 index 00000000..8da37943 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_markdown.rs @@ -0,0 +1,46 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn scheduled_memory_markdown_renders_trace_metrics() -> Result<()> { + let report = support::run_json_report_from(support::scheduled_memory_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-scheduled-memory-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("scheduled-memory-report.json"); + let markdown_path = temp_dir.join("scheduled-memory-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Scheduled Memory Metrics")); + assert!(markdown.contains("scheduled-weekly-project-status-summary-001")); + assert!(markdown.contains("Scheduled memory evidence-ref coverage")); + 
assert!(markdown.contains("Trace Coverage")); + assert!(markdown.contains("Source Mutations")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_scoring.rs b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_scoring.rs new file mode 100644 index 00000000..fb4d3c7b --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/scheduled_memory_scoring.rs @@ -0,0 +1,91 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn scheduled_memory_fixtures_score_task_trace_gate() -> Result<()> { + let report = support::run_json_report_from(support::scheduled_memory_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + 
.pointer("/summary/scheduled_memory/invalid_current_output_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + + let suites = support::array_at(&report, "/suites")?; + let scheduled = support::find_by_field(suites, "/suite_id", "scheduled_memory")?; + + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scheduled.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + let jobs = support::array_at(&report, "/jobs")?; + let weekly = + support::find_by_field(jobs, "/job_id", "scheduled-weekly-project-status-summary-001")?; + let private = support::find_by_field( + jobs, + "/job_id", + "scheduled-private-provider-scheduler-blocked-001", + )?; + + assert_eq!(weekly.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + weekly.pointer("/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!(private.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + report + .pointer("/follow_ups/0/title") + .and_then(Value::as_str) + .is_some_and(|title| title.contains("XY-930")) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay.rs b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay.rs index 9ca9b1ff..47e965b2 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay.rs @@ -1,3 +1,8 @@ +mod trace_replay_adoption_json; +mod trace_replay_diagnostics_json; +mod trace_replay_markdown_assertions; +mod trace_replay_viewer_boundaries; + use std::fs; use color_eyre::Result; @@ -18,8 +23,8 
@@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() support::competitor_strength_adoption_report_json_path()?, )?)?; - assert_trace_replay_diagnostics_json(&report)?; - assert_trace_replay_diagnostics_markdown(&markdown); + trace_replay_diagnostics_json::assert_trace_replay_diagnostics_json(&report)?; + trace_replay_markdown_assertions::assert_trace_replay_diagnostics_markdown(&markdown); assert!(readme.contains("ELF/qmd Trace Replay Diagnostics Report - June 11, 2026")); assert!(benchmarking_index.contains("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")); @@ -29,7 +34,7 @@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() assert!(adoption_report.contains("Letta scenario rows remain")); assert!(adoption_report.contains("blocked or `not_tested`")); - assert_trace_replay_viewer_blocker_boundaries( + trace_replay_viewer_boundaries::assert_trace_replay_viewer_blocker_boundaries( &readme, &markdown, &adoption_report, @@ -49,257 +54,7 @@ fn qmd_trace_replay_diagnostics_report_preserves_claim_boundaries() -> Result<() } )); - assert_trace_replay_adoption_json(&adoption_json)?; - - Ok(()) -} - -fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.trace_replay_diagnostics_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923")); - assert_eq!( - support::string_array_at(report, "/outcome_terms")?, - ["win", "tie", "loss", "not_tested", "blocked", "non_goal"].map(str::to_owned) - ); - assert_eq!( - report.pointer("/summary/retrieval_correctness").and_then(Value::as_str), - Some("tie") - ); - assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2)); - assert_eq!( - report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64), - Some(4) - ); - 
assert_eq!(report.pointer("/summary/outcome_counts/win").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/outcome_counts/tie").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1)); - - assert_trace_replay_diagnostics_scenarios(report) -} - -fn assert_trace_replay_diagnostics_scenarios(report: &Value) -> Result<()> { - let scenarios = support::array_at(report, "/scenario_outcomes")?; - let retrieval = - support::find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?; - let top10 = - support::find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?; - let replay = support::find_by_field(scenarios, "/scenario_id", "replay_command_locality")?; - let trace_surface = support::find_by_field( - scenarios, - "/scenario_id", - "trace_admin_replay_surface_availability", - )?; - let operator_trace = - support::find_by_field(scenarios, "/scenario_id", "operator_debug_trace_hydration")?; - let operator_replay = support::find_by_field( - scenarios, - "/scenario_id", - "operator_debug_replay_command_availability", - )?; - let operator_candidate = support::find_by_field( - scenarios, - "/scenario_id", - "operator_debug_candidate_drop_visibility", - )?; - let operator_repair = - support::find_by_field(scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; - let operator_selected = support::find_by_field( - scenarios, - "/scenario_id", - "operator_debug_selected_but_not_narrated", - )?; - let expansion = - support::find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?; - let dense_sparse = - support::find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?; - let fusion = support::find_by_field(scenarios, "/scenario_id", "fusion_attribution")?; - let rerank = support::find_by_field(scenarios, "/scenario_id", "rerank_attribution")?; - let candidate_drop = - 
support::find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?; - let selected = support::find_by_field( - scenarios, - "/scenario_id", - "selected_but_not_narrated_wrong_results", - )?; - let tombstone = - support::find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?; - - assert_eq!(scenarios.len(), 16); - assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss")); - assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss")); - assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie")); - assert_eq!( - operator_trace.pointer("/evidence_class").and_then(Value::as_str), - Some("live_real_world") - ); - assert_eq!(operator_trace.pointer("/result_type").and_then(Value::as_str), Some("pass")); - assert_eq!(operator_trace.pointer("/outcome").and_then(Value::as_str), Some("win")); - assert_eq!(operator_replay.pointer("/outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(operator_candidate.pointer("/outcome").and_then(Value::as_str), Some("win")); - assert!(support::array_contains_str( - operator_candidate, - "/typed_non_pass_states", - "retrieved_but_dropped" - )?); - assert_eq!(operator_repair.pointer("/outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(operator_selected.pointer("/outcome").and_then(Value::as_str), Some("win")); - assert!(support::array_contains_str( - operator_selected, - "/typed_non_pass_states", - "selected_but_not_narrated" - )?); - assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); - assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); - assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); - assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal")); - assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), 
Some("non_goal")); - assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); - assert!(support::array_contains_str( - candidate_drop, - "/typed_non_pass_states", - "retrieved_but_dropped" - )?); - assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); - assert!(support::array_contains_str( - selected, - "/typed_non_pass_states", - "selected_but_not_narrated" - )?); - assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win")); - assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result")); - assert!(support::array_contains_str( - report, - "/wrong_result_diagnostics/qmd_missing_evidence", - "delete-tombstone" - )?); - assert!(support::array_contains_str( - report, - "/claim_boundaries", - "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay." - )?); - assert!(support::array_contains_str( - report, - "/claim_boundaries", - "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity." - )?); - assert!(support::array_contains_str( - report, - "/claim_boundaries", - "Do not claim qmd beats ELF as a memory system overall." 
- )?); - - Ok(()) -} - -fn assert_trace_replay_diagnostics_markdown(markdown: &str) { - assert!(markdown.contains("Retrieval correctness is still tied")); - assert!(markdown.contains("| Default top-10 candidate artifact |")); - assert!(markdown.contains("| Replay command locality |")); - assert!( - markdown - .contains("| Operator-debug trace hydration | `live_real_world` | `pass` | `win` |") - ); - assert!(markdown.contains( - "| Operator-debug replay command availability | `live_real_world` | `pass` | `tie` |" - )); - assert!(markdown.contains( - "| Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` |" - )); - assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |")); - assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |")); - assert!(markdown.contains("`retrieved_but_dropped` | Defined globally as `not_tested`")); - assert!(markdown.contains("npx tsx src/cli/qmd.ts query")); - assert!(markdown.contains("cargo run -p elf-eval -- --config-a")); - assert!(markdown.contains("cargo make real-world-job-operator-ux-live-adapters")); - assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall")); - assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run")); -} - -fn assert_trace_replay_viewer_blocker_boundaries( - readme: &str, - markdown: &str, - adoption_report: &str, - report: &Value, - adoption_json: &Value, -) -> Result<()> { - let checked_surfaces = [ - support::collapse_whitespace(readme), - support::collapse_whitespace(markdown), - support::collapse_whitespace(adoption_report), - report.to_string(), - adoption_json.to_string(), - ]; - - for surface in checked_surfaces { - assert!(!surface.contains("blocked or not encoded")); - } - - assert!( - support::collapse_whitespace(readme) - .contains("claude-mem viewer flows remain blocked until Docker-contained") - ); - assert!( - support::collapse_whitespace(markdown) 
- .contains("claude-mem UI repair paths remain blocked until Docker-contained") - ); - assert!( - support::collapse_whitespace(adoption_report) - .contains("claude-mem viewer workflows remain blocked until Docker-contained") - ); - - Ok(()) -} - -fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { - let local_debug = support::find_by_field( - support::array_at(adoption, "/scenario_outcomes")?, - "/scenario_id", - "local_debug_replay_ux", - )?; - let operator_debug = support::find_by_field( - support::array_at(adoption, "/scenario_outcomes")?, - "/scenario_id", - "operator_debugging_viewer_ux", - )?; - - assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss")); - assert!( - local_debug - .pointer("/measured_claim") - .and_then(Value::as_str) - .is_some_and(|claim| claim.contains("qmd stronger on immediate top-10")) - ); - assert!(support::array_contains_str( - local_debug, - "/command_artifacts", - "docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" - )?); - assert!(support::array_contains_str( - adoption, - "/claim_boundaries/not_allowed", - "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win." - )?); - assert_eq!(operator_debug.pointer("/outcome").and_then(Value::as_str), Some("win")); - assert!( - operator_debug - .pointer("/measured_claim") - .and_then(Value::as_str) - .is_some_and(|claim| claim.contains("narrow live operator-debug win over qmd")) - ); - assert!(support::array_contains_str( - operator_debug, - "/command_artifacts", - "tmp/real-world-job/operator-ux-live-adapters/summary.json" - )?); - assert!(support::array_contains_str( - adoption, - "/claim_boundaries/not_allowed", - "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice." 
- )?); + trace_replay_adoption_json::assert_trace_replay_adoption_json(&adoption_json)?; Ok(()) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_adoption_json.rs b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_adoption_json.rs new file mode 100644 index 00000000..77067c5d --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_adoption_json.rs @@ -0,0 +1,54 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_trace_replay_adoption_json(adoption: &Value) -> Result<()> { + let local_debug = support::find_by_field( + support::array_at(adoption, "/scenario_outcomes")?, + "/scenario_id", + "local_debug_replay_ux", + )?; + let operator_debug = support::find_by_field( + support::array_at(adoption, "/scenario_outcomes")?, + "/scenario_id", + "operator_debugging_viewer_ux", + )?; + + assert_eq!(local_debug.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert!( + local_debug + .pointer("/measured_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("qmd stronger on immediate top-10")) + ); + assert!(support::array_contains_str( + local_debug, + "/command_artifacts", + "docs/evidence/benchmarking/2026-06-11-elf-qmd-trace-replay-diagnostics-report.md" + )?); + assert!(support::array_contains_str( + adoption, + "/claim_boundaries/not_allowed", + "Do not claim qmd's trace/replay artifact win is a broad qmd-over-ELF memory-system or retrieval-quality win." 
+ )?); + assert_eq!(operator_debug.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!( + operator_debug + .pointer("/measured_claim") + .and_then(Value::as_str) + .is_some_and(|claim| claim.contains("narrow live operator-debug win over qmd")) + ); + assert!(support::array_contains_str( + operator_debug, + "/command_artifacts", + "tmp/real-world-job/operator-ux-live-adapters/summary.json" + )?); + assert!(support::array_contains_str( + adoption, + "/claim_boundaries/not_allowed", + "Do not claim ELF broadly beats OpenMemory or claude-mem viewer UX from the narrow ELF/qmd operator-debug slice." + )?); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_diagnostics_json.rs b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_diagnostics_json.rs new file mode 100644 index 00000000..4e901bd8 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_diagnostics_json.rs @@ -0,0 +1,145 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_trace_replay_diagnostics_json(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.trace_replay_diagnostics_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-923")); + assert_eq!( + support::string_array_at(report, "/outcome_terms")?, + ["win", "tie", "loss", "not_tested", "blocked", "non_goal"].map(str::to_owned) + ); + assert_eq!( + report.pointer("/summary/retrieval_correctness").and_then(Value::as_str), + Some("tie") + ); + assert_eq!(report.pointer("/summary/outcome_counts/loss").and_then(Value::as_u64), Some(2)); + assert_eq!( + report.pointer("/summary/outcome_counts/not_tested").and_then(Value::as_u64), + Some(4) + ); + 
assert_eq!(report.pointer("/summary/outcome_counts/win").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/outcome_counts/tie").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/outcome_counts/non_goal").and_then(Value::as_u64), Some(1)); + + assert_trace_replay_diagnostics_scenarios(report) +} + +fn assert_trace_replay_diagnostics_scenarios(report: &Value) -> Result<()> { + let scenarios = support::array_at(report, "/scenario_outcomes")?; + let retrieval = + support::find_by_field(scenarios, "/scenario_id", "retrieval_correctness_guardrail")?; + let top10 = + support::find_by_field(scenarios, "/scenario_id", "default_top10_candidate_artifact")?; + let replay = support::find_by_field(scenarios, "/scenario_id", "replay_command_locality")?; + let trace_surface = support::find_by_field( + scenarios, + "/scenario_id", + "trace_admin_replay_surface_availability", + )?; + let operator_trace = + support::find_by_field(scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let operator_replay = support::find_by_field( + scenarios, + "/scenario_id", + "operator_debug_replay_command_availability", + )?; + let operator_candidate = support::find_by_field( + scenarios, + "/scenario_id", + "operator_debug_candidate_drop_visibility", + )?; + let operator_repair = + support::find_by_field(scenarios, "/scenario_id", "operator_debug_repair_action_clarity")?; + let operator_selected = support::find_by_field( + scenarios, + "/scenario_id", + "operator_debug_selected_but_not_narrated", + )?; + let expansion = + support::find_by_field(scenarios, "/scenario_id", "query_expansion_attribution")?; + let dense_sparse = + support::find_by_field(scenarios, "/scenario_id", "dense_sparse_channel_attribution")?; + let fusion = support::find_by_field(scenarios, "/scenario_id", "fusion_attribution")?; + let rerank = support::find_by_field(scenarios, "/scenario_id", "rerank_attribution")?; + let candidate_drop = + 
support::find_by_field(scenarios, "/scenario_id", "candidate_drop_diagnostics")?; + let selected = support::find_by_field( + scenarios, + "/scenario_id", + "selected_but_not_narrated_wrong_results", + )?; + let tombstone = + support::find_by_field(scenarios, "/scenario_id", "evidence_absent_tombstone_diagnostics")?; + + assert_eq!(scenarios.len(), 16); + assert_eq!(retrieval.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(top10.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(replay.pointer("/outcome").and_then(Value::as_str), Some("loss")); + assert_eq!(trace_surface.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!( + operator_trace.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!(operator_trace.pointer("/result_type").and_then(Value::as_str), Some("pass")); + assert_eq!(operator_trace.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert_eq!(operator_replay.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_candidate.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!(support::array_contains_str( + operator_candidate, + "/typed_non_pass_states", + "retrieved_but_dropped" + )?); + assert_eq!(operator_repair.pointer("/outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(operator_selected.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert!(support::array_contains_str( + operator_selected, + "/typed_non_pass_states", + "selected_but_not_narrated" + )?); + assert_eq!(expansion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(dense_sparse.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(fusion.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert_eq!(rerank.pointer("/result_type").and_then(Value::as_str), Some("non_goal")); + assert_eq!(rerank.pointer("/outcome").and_then(Value::as_str), 
Some("non_goal")); + assert_eq!(candidate_drop.pointer("/outcome").and_then(Value::as_str), Some("not_tested")); + assert!(support::array_contains_str( + candidate_drop, + "/typed_non_pass_states", + "retrieved_but_dropped" + )?); + assert_eq!(selected.pointer("/result_type").and_then(Value::as_str), Some("wrong_result")); + assert!(support::array_contains_str( + selected, + "/typed_non_pass_states", + "selected_but_not_narrated" + )?); + assert_eq!(tombstone.pointer("/outcome").and_then(Value::as_str), Some("win")); + assert_eq!(tombstone.pointer("/qmd_status").and_then(Value::as_str), Some("wrong_result")); + assert!(support::array_contains_str( + report, + "/wrong_result_diagnostics/qmd_missing_evidence", + "delete-tombstone" + )?); + assert!(support::array_contains_str( + report, + "/claim_boundaries", + "qmd currently wins the default local-debug artifact surface: top-10 rows plus short CLI replay." + )?); + assert!(support::array_contains_str( + report, + "/claim_boundaries", + "ELF narrowly wins the live operator-debug trace hydration and candidate-drop visibility slice against qmd; qmd still ties replay-command and repair-action clarity." + )?); + assert!(support::array_contains_str( + report, + "/claim_boundaries", + "Do not claim qmd beats ELF as a memory system overall." 
+ )?); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_markdown_assertions.rs b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_markdown_assertions.rs new file mode 100644 index 00000000..c3080f29 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_markdown_assertions.rs @@ -0,0 +1,23 @@ +pub(super) fn assert_trace_replay_diagnostics_markdown(markdown: &str) { + assert!(markdown.contains("Retrieval correctness is still tied")); + assert!(markdown.contains("| Default top-10 candidate artifact |")); + assert!(markdown.contains("| Replay command locality |")); + assert!( + markdown + .contains("| Operator-debug trace hydration | `live_real_world` | `pass` | `win` |") + ); + assert!(markdown.contains( + "| Operator-debug replay command availability | `live_real_world` | `pass` | `tie` |" + )); + assert!(markdown.contains( + "| Operator-debug candidate-drop visibility | `live_real_world` | `pass` | `win` |" + )); + assert!(markdown.contains("| Rerank attribution | `live_baseline_only` | `non_goal` |")); + assert!(markdown.contains("| Candidate-drop diagnostics | `research_gate` | `not_encoded` |")); + assert!(markdown.contains("`retrieved_but_dropped` | Defined globally as `not_tested`")); + assert!(markdown.contains("npx tsx src/cli/qmd.ts query")); + assert!(markdown.contains("cargo run -p elf-eval -- --config-a")); + assert!(markdown.contains("cargo make real-world-job-operator-ux-live-adapters")); + assert!(markdown.contains("Do not claim qmd beats ELF as a memory system overall")); + assert!(markdown.contains("Do not score rerank superiority from a qmd `--no-rerank` run")); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_viewer_boundaries.rs 
b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_viewer_boundaries.rs new file mode 100644 index 00000000..da20aa5e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/trace_replay_reports_qmd_trace_replay/trace_replay_viewer_boundaries.rs @@ -0,0 +1,39 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_trace_replay_viewer_blocker_boundaries( + readme: &str, + markdown: &str, + adoption_report: &str, + report: &Value, + adoption_json: &Value, +) -> Result<()> { + let checked_surfaces = [ + support::collapse_whitespace(readme), + support::collapse_whitespace(markdown), + support::collapse_whitespace(adoption_report), + report.to_string(), + adoption_json.to_string(), + ]; + + for surface in checked_surfaces { + assert!(!surface.contains("blocked or not encoded")); + } + + assert!( + support::collapse_whitespace(readme) + .contains("claude-mem viewer flows remain blocked until Docker-contained") + ); + assert!( + support::collapse_whitespace(markdown) + .contains("claude-mem UI repair paths remain blocked until Docker-contained") + ); + assert!( + support::collapse_whitespace(adoption_report) + .contains("claude-mem viewer workflows remain blocked until Docker-contained") + ); + + Ok(()) +}