From 51dd4674733d1ca9b2cc48b3969497e9ceaae42e Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 30 Jun 2026 21:11:38 -0400 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Split real-world benchmark report modules","authority":"manual"} --- .../closeout_reports.rs | 718 +----------------- .../closeout_reports_agent_knowledge.rs | 165 ++++ .../closeout_reports_competitor_strength.rs | 116 +++ .../closeout_reports_graph_rag.rs | 137 ++++ .../closeout_reports_helpers.rs | 43 ++ .../closeout_reports_openmemory.rs | 86 +++ .../closeout_reports_workspace.rs | 202 +++++ .../external_adapters.rs | 634 +--------------- .../external_adapters_first_generation.rs | 180 +++++ .../external_adapters_fixture.rs | 77 ++ .../external_adapters_graph_gates.rs | 73 ++ .../external_adapters_letta.rs | 78 ++ .../external_adapters_live_sweep.rs | 68 ++ .../external_adapters_operator_debug.rs | 139 ++++ 14 files changed, 1397 insertions(+), 1319 deletions(-) create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_agent_knowledge.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_competitor_strength.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_graph_rag.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_helpers.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_openmemory.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_workspace.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/external_adapters_first_generation.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/external_adapters_fixture.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/external_adapters_graph_gates.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/external_adapters_letta.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/external_adapters_live_sweep.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/external_adapters_operator_debug.rs diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports.rs index 8c7f406b..10c588eb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports.rs @@ -1,712 +1,6 @@ -use std::fs; - -use color_eyre::{Result, eyre}; -use serde_json::Value; - -use crate::support; - -#[test] -fn agent_knowledge_os_closeout_benchmark_preserves_full_matrix_boundaries() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::agent_knowledge_os_closeout_benchmark_report_json_path()?, - )?)?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.agent_knowledge_os_closeout_benchmark_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1023")); - assert_eq!( - report.pointer("/summary/strongest_measured_integrated_product").and_then(Value::as_str), - Some("ELF integrated Agent Knowledge OS") - ); - assert_eq!( - report.pointer("/all_project_fixture_rerun/status").and_then(Value::as_str), - Some("pass") - ); - assert_eq!( - report.pointer("/all_project_fixture_rerun/job_count").and_then(Value::as_u64), - Some(62) - ); - assert_eq!(report.pointer("/all_project_fixture_rerun/pass").and_then(Value::as_u64), Some(55)); - assert_eq!(report.pointer("/summary/product_count").and_then(Value::as_u64), Some(19)); - assert_eq!(report.pointer("/summary/scenario_count").and_then(Value::as_u64), Some(6)); - assert_eq!( - report - .pointer("/summary/not_every_product_has_complete_live_coverage") - .and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - report.pointer("/summary/evidence_class_counts/pass").and_then(Value::as_u64), - Some(9) - ); - assert_eq!( - report.pointer("/summary/evidence_class_counts/not_tested").and_then(Value::as_u64), - Some(78) - ); - - let scenarios = support::array_at(&report, "/supported_scenarios")?; - let matrix = support::array_at(&report, "/product_matrix")?; - - for scenario in [ - "source_library_ingest_hydration", - "memory_authority_history_read_profiles", - "knowledge_workspace_pages", - "temporal_topic_graph_lite", - "dreaming_review_queue", - "recall_debug_panel", - ] { - support::find_by_field(scenarios, "/id", scenario)?; - } - - let elf = support::find_by_field(matrix, "/product", "ELF")?; - - for scenario in [ - "source_library_ingest_hydration", - "memory_authority_history_read_profiles", - "knowledge_workspace_pages", - "temporal_topic_graph_lite", - "dreaming_review_queue", - "recall_debug_panel", - ] { - assert_eq!( - elf.pointer(&format!("/statuses/{scenario}")).and_then(Value::as_str), - Some("pass") - ); - } - - let qmd = support::find_by_field(matrix, "/product", "qmd")?; - - assert_eq!( - qmd.pointer("/statuses/recall_debug_panel").and_then(Value::as_str), - Some("wrong_result") - ); - assert!( - qmd.pointer("/strongest_advantage") - .and_then(Value::as_str) - .is_some_and(|value| value.contains("weighted fusion")) - ); - - for product in ["VectifyAI PageIndex", "VectifyAI OpenKB"] { - let row = support::find_by_field(matrix, "/product", product)?; - - assert_eq!(row.pointer("/coverage").and_then(Value::as_str), Some("reference_only")); - assert_eq!( - row.pointer("/statuses/knowledge_workspace_pages").and_then(Value::as_str), - Some("not_tested") - ); - } - - assert_eq!( - report.pointer("/claim_boundaries/no_broad_superiority_claim").and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - report - .pointer("/claim_boundaries/reference_only_projects_do_not_count_as_pass") - .and_then(Value::as_bool), - Some(true) - ); - assert!(support::array_contains_str( - &report, - "/source_evidence", - "https://github.com/VectifyAI/PageIndex" - )?); - assert!(support::array_contains_str( - &report, - "/source_evidence", - "https://github.com/VectifyAI/OpenKB" - )?); - - Ok(()) -} - -#[test] -fn agent_knowledge_os_closeout_benchmark_wires_docs_and_optimization_queue() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::agent_knowledge_os_closeout_benchmark_report_json_path()?, - )?)?; - let markdown = - fs::read_to_string(support::agent_knowledge_os_closeout_benchmark_report_markdown_path()?)?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - let queue = support::array_at(&report, "/optimization_queue")?; - - for item in queue { - assert_eq!(item.pointer("/generated_from_delta").and_then(Value::as_bool), Some(true)); - } - for key in [ - "pageindex_openkb_source_library_adapter", - "qmd_retrieval_knobs_and_short_replay", - "operator_knowledge_library_ui", - "openviking_context_trajectory_artifacts", - "graph_rag_temporal_adapter_matrix", - ] { - let item = support::find_by_field(queue, "/key", key)?; - - assert_eq!(item.pointer("/generated_from_delta").and_then(Value::as_bool), Some(true)); - } - - assert!(markdown.contains("ELF is the strongest measured integrated product")); - assert!(markdown.contains("complete live coverage")); - assert!(markdown.contains("VectifyAI PageIndex")); - assert!(markdown.contains("VectifyAI OpenKB")); - assert!(markdown.contains("Do not claim ELF broadly beats every competitor")); - assert!( - benchmarking_index.contains("2026-06-20-agent-knowledge-os-closeout-benchmark-report.md") - ); - assert!(readme.contains("Agent Knowledge OS closeout after XY-1023")); - assert!(readme.contains("62 jobs, 55 pass")); - assert!(readme.contains("VectifyAI PageIndex/OpenKB")); - assert!(readme.contains("strongest measured integrated")); - - Ok(()) -} - -#[test] -fn p2_knowledge_workspace_closeout_preserves_pageindex_openkb_boundaries() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::p2_knowledge_workspace_pageindex_openkb_closeout_report_json_path()?, - )?)?; - let markdown = fs::read_to_string( - support::p2_knowledge_workspace_pageindex_openkb_closeout_report_markdown_path()?, - )?; - let makefile = fs::read_to_string(support::workspace_root()?.join("Makefile.toml"))?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - let benchmark_runbook = fs::read_to_string( - support::workspace_root()? - .join("docs") - .join("runbook") - .join("benchmarking") - .join("real_world_agent_memory_benchmark.md"), - )?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.p2_knowledge_workspace_pageindex_openkb_closeout_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1066")); - assert_eq!( - report.pointer("/self_assessment/verdict").and_then(Value::as_str), - Some("pass_with_reference_only_competitor_boundary") - ); - assert_eq!(report.pointer("/typed_state_summary/pass").and_then(Value::as_u64), Some(2)); - assert_eq!( - report.pointer("/typed_state_summary/wrong_result").and_then(Value::as_u64), - Some(0) - ); - assert_eq!(report.pointer("/typed_state_summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/typed_state_summary/blocked").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/typed_state_summary/not_tested").and_then(Value::as_u64), Some(2)); - - let results = support::array_at(&report, "/elf_same_corpus_results")?; - let source_library = support::find_by_field(results, "/suite", "source_library")?; - let knowledge = support::find_by_field(results, "/suite", "knowledge_compilation")?; - - assert_eq!(source_library.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(source_library.pointer("/jobs").and_then(Value::as_u64), Some(2)); - assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(knowledge.pointer("/jobs").and_then(Value::as_u64), Some(3)); - assert!(support::array_contains_str( - knowledge, - "/coverage", - "Changed-source watch/rebuild reports changed, stale, and reviewable memory-candidate outputs without source mutation." - )?); - - let matrix = support::array_at(&report, "/comparison_matrix")?; - let pageindex = support::find_by_field(matrix, "/target", "VectifyAI PageIndex")?; - let openkb = support::find_by_field(matrix, "/target", "VectifyAI OpenKB")?; - let p3 = support::find_by_field(matrix, "/target", "P3 PageIndex/OpenKB adapter queue")?; - - assert_eq!(pageindex.pointer("/status").and_then(Value::as_str), Some("not_tested")); - assert_eq!(openkb.pointer("/status").and_then(Value::as_str), Some("not_tested")); - assert_eq!(p3.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - report - .pointer("/p3_queue_decision/ready_to_queue_after_main_thread_acceptance") - .and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - report.pointer("/p3_queue_decision/queued_label_applied").and_then(Value::as_bool), - Some(false) - ); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not claim ELF beats PageIndex or OpenKB." - )?); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not queue a P3 issue in this lane." - )?); - assert!(markdown.contains("P2 Knowledge Workspace PageIndex/OpenKB Closeout Report")); - assert!(markdown.contains("VectifyAI PageIndex")); - assert!(markdown.contains("VectifyAI OpenKB")); - assert!(markdown.contains("This report does not apply `decodex:queued:elf`")); - assert!(makefile.contains("[tasks.real-world-memory-p2-knowledge-closeout]")); - assert!(makefile.contains("\"real-world-memory-source-library-report\"")); - assert!(makefile.contains("\"real-world-memory-knowledge-report\"")); - assert!( - benchmarking_index - .contains("2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md") - ); - assert!(readme.contains("P2 Knowledge Workspace PageIndex/OpenKB closeout after XY-1066")); - assert!(readme.contains("real-world-memory-p2-knowledge-closeout")); - assert!(benchmark_runbook.contains("cargo make real-world-memory-p2-knowledge-closeout")); - - Ok(()) -} - -#[test] -fn operator_approved_public_proxy_private_addendum_preserves_boundary() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::operator_approved_public_proxy_private_addendum_report_json_path()?, - )?)?; - let markdown = fs::read_to_string( - support::operator_approved_public_proxy_private_addendum_report_markdown_path()?, - )?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.operator_approved_public_proxy_baseline_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-930")); - assert_eq!(report.pointer("/command/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - report.pointer("/command/run_id").and_then(Value::as_str), - Some("live-baseline-20260619143959") - ); - assert_eq!( - report.pointer("/corpus/profile").and_then(Value::as_str), - Some("production-private") - ); - assert_eq!( - report.pointer("/corpus/runner_track").and_then(Value::as_str), - Some("private_production") - ); - assert_eq!( - report.pointer("/corpus/manifest_kind").and_then(Value::as_str), - Some("operator_approved_public_proxy") - ); - assert_eq!( - report.pointer("/corpus/manifest_id").and_then(Value::as_str), - Some("operator-approved-public-proxy-prod-corpus-2026-06-19") - ); - assert_eq!(report.pointer("/embedding/mode").and_then(Value::as_str), Some("local")); - assert_eq!( - report.pointer("/embedding/provider_backed_quality_proven").and_then(Value::as_bool), - Some(false) - ); - assert_eq!(report.pointer("/summary/project_status").and_then(Value::as_str), Some("pass")); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/check_summary/total").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/check_summary/pass").and_then(Value::as_u64), Some(8)); - assert_eq!( - report.pointer("/query_summary/wrong_result_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!(report.pointer("/backfill/completed_count").and_then(Value::as_u64), Some(12)); - assert_eq!(report.pointer("/backfill/duplicate_source_notes").and_then(Value::as_u64), Some(0)); - - let queries = support::array_at(&report, "/queries")?; - let provider = support::find_by_field(queries, "/id", "q-explain-provider-blocker")?; - - assert_eq!(queries.len(), 8); - assert_eq!( - provider.pointer("/top_evidence").and_then(Value::as_str), - Some("blocker-provider-missing") - ); - assert_eq!(provider.pointer("/matched").and_then(Value::as_bool), Some(true)); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not call this real private-corpus production proof." - )?); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not claim provider-backed production quality; embedding mode was local." - )?); - assert!(support::array_contains_str( - &report, - "/improvement_regression_readback/unchanged", - "Real private-corpus production quality is still not proven." - )?); - assert!(support::array_contains_str( - &report, - "/next_optimization_direction/when_operator_inputs_exist", - "Run provider-backed embeddings with ELF_BASELINE_ELF_EMBEDDING_MODE=provider and a routed provider setup." - )?); - assert!(markdown.contains("proxy corpus pass")); - assert!(markdown.contains("Do not call this real private-corpus production proof.")); - assert!(markdown.contains("| Embedding mode | `local` |")); - assert!( - benchmarking_index - .contains("2026-06-19-operator-approved-public-proxy-production-private-addendum.md") - ); - assert!(benchmarking_index.contains("not real private-corpus or provider-backed proof")); - assert!(readme.contains("Operator-approved public-proxy addendum after XY-930")); - assert!(readme.contains("8/8 query passes")); - assert!(readme.contains("does not prove real private-corpus production quality")); - - Ok(()) -} - -#[test] -fn openmemory_ui_export_product_recheck_preserves_blocked_boundary() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::openmemory_ui_export_product_readback_report_json_path()?, - )?)?; - let markdown = - fs::read_to_string(support::openmemory_ui_export_product_readback_report_markdown_path()?)?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.openmemory_ui_export_product_recheck_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-987")); - assert_eq!( - report.pointer("/command/command").and_then(Value::as_str), - Some("cargo make openmemory-ui-export-readback") - ); - assert_eq!(report.pointer("/command/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - report.pointer("/command/probe_artifact").and_then(Value::as_str), - Some("tmp/live-baseline/mem0-openmemory-ui-export.json") - ); - assert_eq!(report.pointer("/run/sdk_check_summary/pass").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/run/ui_export_status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - report.pointer("/run/ui_export_reason_code").and_then(Value::as_str), - Some("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER") - ); - assert_eq!( - report - .pointer("/same_corpus_boundary/sdk_get_all_is_ui_export_evidence") - .and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - report - .pointer("/openmemory_product_surface/export_requires_running_container") - .and_then(Value::as_bool), - Some(true) - ); - assert!( - report - .pointer("/openmemory_probe/attempt/output_excerpt") - .and_then(Value::as_str) - .is_some_and(|excerpt| excerpt.contains("docker: command not found") - && excerpt.contains("Container 'openmemory-openmemory-mcp-1' not found/running")) - ); - assert_eq!( - report.pointer("/classification/comparison_judgment").and_then(Value::as_str), - Some("unchanged") - ); - assert_eq!( - report - .pointer("/claim_boundary/product_browser_or_dashboard_readback_reached") - .and_then(Value::as_bool), - Some(false) - ); - assert!(support::array_contains_str( - &report, - "/improvement_regression_readback/unchanged", - "OpenMemory product UI/export readback remains blocked before same-corpus product app database validation." - )?); - assert!(support::array_contains_str( - &report, - "/next_optimization_direction/required_fields", - "same_corpus_import_into_openmemory_app_database" - )?); - assert!(markdown.contains("OpenMemory UI/export product-readback status is unchanged")); - assert!(markdown.contains("Product browser/dashboard readback reached")); - assert!( - benchmarking_index.contains("2026-06-19-openmemory-ui-export-product-readback-report.md") - ); - assert!(readme.contains("OpenMemory UI/Export Product Readback Report - June 19, 2026")); - assert!(readme.contains("OpenMemory UI/export product recheck after XY-987")); - - Ok(()) -} - -#[test] -fn graph_rag_citation_navigation_promotion_preserves_typed_non_passes() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::graph_rag_citation_navigation_promotion_report_json_path()?, - )?)?; - let markdown = fs::read_to_string( - support::graph_rag_citation_navigation_promotion_report_markdown_path()?, - )?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.graph_rag_citation_navigation_promotion_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-985")); - assert_eq!( - report.pointer("/command/command").and_then(Value::as_str), - Some("cargo make real-world-memory-graph-rag") - ); - assert_eq!(report.pointer("/command/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - report.pointer("/summary/overall_judgment").and_then(Value::as_str), - Some("unchanged_typed_non_pass") - ); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.25)); - assert_eq!( - report.pointer("/summary/knowledge_citation_coverage").and_then(Value::as_f64), - Some(0.667) - ); - - let scenarios = support::array_at(&report, "/scenario_outcomes")?; - let ragflow = support::find_by_field(scenarios, "/project", "RAGFlow")?; - let lightrag = support::find_by_field(scenarios, "/project", "LightRAG")?; - let graphrag = support::find_by_field(scenarios, "/project", "GraphRAG")?; - let graphiti = support::find_by_field(scenarios, "/project", "Graphiti/Zep")?; - let graphify = support::find_by_field(scenarios, "/project", "graphify")?; - let llm_wiki = support::find_by_field(scenarios, "/project", "llm-wiki")?; - let gbrain = support::find_by_field(scenarios, "/project", "gbrain")?; - - assert_eq!(ragflow.pointer("/current_status").and_then(Value::as_str), Some("blocked")); - assert_eq!(lightrag.pointer("/current_status").and_then(Value::as_str), Some("incomplete")); - assert_eq!(graphrag.pointer("/current_status").and_then(Value::as_str), Some("blocked")); - assert_eq!(graphiti.pointer("/current_status").and_then(Value::as_str), Some("blocked")); - assert_eq!(graphify.pointer("/current_status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(llm_wiki.pointer("/current_status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(gbrain.pointer("/current_status").and_then(Value::as_str), Some("blocked")); - assert!(support::array_contains_str( - graphify, - "/produced_evidence", - "graphify-source-location-output" - )?); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not claim graph/RAG parity or broad graph-navigation quality." - )?); - assert!(support::array_contains_str( - &report, - "/next_optimization_direction/required_fields", - "graphrag_output_table_rows_with_generated_evidence_ids" - )?); - assert!(markdown.contains("typed non-pass, no parity claim")); - assert!( - markdown.contains("graphify produces evidence-linked output but still scores wrong_result") - ); - assert!( - benchmarking_index.contains("2026-06-19-graph-rag-citation-navigation-promotion-report.md") - ); - assert!(readme.contains("Graph/RAG Citation and Navigation Promotion Report - June 19, 2026")); - assert!(readme.contains("Graph/RAG citation/navigation promotion after XY-985")); - - Ok(()) -} - -#[test] -fn graph_rag_adapter_matrix_report_preserves_no_parity_claims() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::graph_rag_adapter_matrix_report_json_path()?, - )?)?; - let markdown = fs::read_to_string(support::graph_rag_adapter_matrix_report_markdown_path()?)?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.graph_rag_adapter_matrix_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1071")); - assert_eq!(report.pointer("/summary/matrix_row_count").and_then(Value::as_u64), Some(18)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(6)); - assert_eq!( - report.pointer("/summary/broad_graph_rag_parity").and_then(Value::as_str), - Some("not_proven") - ); - - let rows = support::array_at(&report, "/adapter_matrix")?; - let ragflow_citation = find_matrix_row(rows, "RAGFlow", "citation_quality")?; - let lightrag_retrieval = find_matrix_row(rows, "LightRAG", "retrieval_quality")?; - let graphrag_navigation = find_matrix_row(rows, "GraphRAG", "navigation_quality")?; - let graphrag_retrieval = find_matrix_row(rows, "GraphRAG", "retrieval_quality")?; - - assert_eq!(ragflow_citation.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(lightrag_retrieval.pointer("/status").and_then(Value::as_str), Some("incomplete")); - assert_eq!(graphrag_navigation.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(graphrag_retrieval.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not reposition ELF as a generic RAG platform from this adapter matrix." - )?); - assert!(markdown.contains("The graph/RAG comparison remains typed non-pass")); - assert!(markdown.contains("| RAGFlow | `blocked`: answer text plus selected reference chunks")); - assert!(benchmarking_index.contains("2026-06-23-graph-rag-adapter-matrix-report.md")); - assert!(readme.contains("RAGFlow/GraphRAG/LightRAG adapter matrix after XY-1071")); - assert!(readme.contains("Graph/RAG Adapter Matrix Report - June 23, 2026")); - - Ok(()) -} - -#[test] -fn p3_competitor_strength_absorption_report_preserves_claim_boundaries() -> Result<()> { - let report = serde_json::from_str::(&fs::read_to_string( - support::p3_competitor_strength_absorption_report_json_path()?, - )?)?; - let markdown = - fs::read_to_string(support::p3_competitor_strength_absorption_report_markdown_path()?)?; - let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; - let readme = fs::read_to_string(support::readme_path()?)?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.p3_competitor_strength_absorption_report/v1") - ); - assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1072")); - assert_eq!( - report.pointer("/self_assessment/verdict").and_then(Value::as_str), - Some("pass_with_p4_queue_ready_after_main_thread_acceptance") - ); - assert_eq!( - report.pointer("/self_assessment/p4_queued_label_applied").and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - report - .pointer("/self_assessment/typed_non_pass_states_are_not_wins") - .and_then(Value::as_bool), - Some(true) - ); - - let products = support::array_at(&report, "/product_strengths")?; - - for product in [ - "qmd", - "VectifyAI PageIndex", - "VectifyAI OpenKB", - "mem0/OpenMemory", - "Letta", - "Graphiti/Zep", - "OpenViking", - "RAGFlow", - "GraphRAG", - "LightRAG", - ] { - support::find_by_field(products, "/product", product)?; - } - - let qmd = support::find_by_field(products, "/product", "qmd")?; - let pageindex = support::find_by_field(products, "/product", "VectifyAI PageIndex")?; - let mem0 = support::find_by_field(products, "/product", "mem0/OpenMemory")?; - let graphiti = support::find_by_field(products, "/product", "Graphiti/Zep")?; - let lightrag = support::find_by_field(products, "/product", "LightRAG")?; - - assert_eq!(qmd.pointer("/current_status").and_then(Value::as_str), Some("mixed")); - assert!( - qmd.pointer("/remains_stronger_elsewhere") - .and_then(Value::as_str) - .is_some_and(|value| value.contains("top-k JSON")) - ); - assert_eq!(pageindex.pointer("/current_status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - mem0.pointer("/current_status").and_then(Value::as_str), - Some("split_pass_and_blocked") - ); - assert_eq!(graphiti.pointer("/current_status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - lightrag.pointer("/current_status").and_then(Value::as_str), - Some("incomplete_or_not_encoded") - ); - - let queue = support::array_at(&report, "/p4_optimization_queue")?; - - for key in [ - "qmd_candidate_replay_parity", - "adapter_outcome_grammar_and_metrics", - "source_library_tree_and_wiki_adapters", - "memory_history_export_and_core_archive", - "temporal_trajectory_graph_rag_adapters", - ] { - let item = support::find_by_field(queue, "/key", key)?; - - assert_eq!( - item.pointer("/ready_after_main_thread_acceptance").and_then(Value::as_bool), - Some(true) - ); - assert_eq!(item.pointer("/queued_label_applied").and_then(Value::as_bool), Some(false)); - } - - assert_product_queue_items_reference_queue(products, queue)?; - - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Typed non-pass states are not wins." - )?); - assert!(support::array_contains_str( - &report, - "/claim_boundaries/not_allowed", - "Do not apply decodex:queued:elf to a P4 issue until the main thread accepts the P3 closeout." - )?); - assert!(markdown.contains("P3 is decision-ready for main-thread inspection")); - assert!(markdown.contains("Typed non-pass states are not wins")); - assert!(markdown.contains("No P4 issue receives `decodex:queued:elf`")); - assert!(benchmarking_index.contains("2026-06-23-p3-competitor-strength-absorption-report.md")); - assert!(readme.contains("P3 competitor-strength absorption closeout after XY-1072")); - assert!(readme.contains("`decodex:queued:elf` label")); - - Ok(()) -} - -fn assert_product_queue_items_reference_queue(products: &[Value], queue: &[Value]) -> Result<()> { - let queue_keys = queue - .iter() - .filter_map(|item| item.pointer("/key").and_then(Value::as_str)) - .collect::>(); - - for product in products { - let product_name = product - .pointer("/product") - .and_then(Value::as_str) - .ok_or_else(|| eyre::eyre!("product row is missing product name"))?; - let queue_item = product - .pointer("/p4_queue_item") - .and_then(Value::as_str) - .ok_or_else(|| eyre::eyre!("product {product_name} is missing p4_queue_item"))?; - - assert!( - queue_keys.contains(&queue_item), - "product {product_name} references missing P4 queue item {queue_item}" - ); - } - - Ok(()) -} - -fn find_matrix_row<'a>(rows: &'a [Value], adapter: &str, dimension: &str) -> Result<&'a Value> { - rows.iter() - .find(|row| { - row.pointer("/adapter").and_then(Value::as_str) == Some(adapter) - && row.pointer("/dimension").and_then(Value::as_str) == Some(dimension) - }) - .ok_or_else(|| eyre::eyre!("missing matrix row for {adapter} {dimension}")) -} +mod closeout_reports_agent_knowledge; +mod closeout_reports_competitor_strength; +mod closeout_reports_graph_rag; +mod closeout_reports_helpers; +mod closeout_reports_openmemory; +mod closeout_reports_workspace; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_agent_knowledge.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_agent_knowledge.rs new file mode 100644 index 00000000..1a8ed3d6 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_agent_knowledge.rs @@ -0,0 +1,165 @@ +use std::fs; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn agent_knowledge_os_closeout_benchmark_preserves_full_matrix_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::agent_knowledge_os_closeout_benchmark_report_json_path()?, + )?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_knowledge_os_closeout_benchmark_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1023")); + assert_eq!( + report.pointer("/summary/strongest_measured_integrated_product").and_then(Value::as_str), + Some("ELF integrated Agent Knowledge OS") + ); + assert_eq!( + report.pointer("/all_project_fixture_rerun/status").and_then(Value::as_str), + Some("pass") + ); + assert_eq!( + report.pointer("/all_project_fixture_rerun/job_count").and_then(Value::as_u64), + Some(62) + ); + assert_eq!(report.pointer("/all_project_fixture_rerun/pass").and_then(Value::as_u64), Some(55)); + assert_eq!(report.pointer("/summary/product_count").and_then(Value::as_u64), Some(19)); + assert_eq!(report.pointer("/summary/scenario_count").and_then(Value::as_u64), Some(6)); + assert_eq!( + report + .pointer("/summary/not_every_product_has_complete_live_coverage") + .and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report.pointer("/summary/evidence_class_counts/pass").and_then(Value::as_u64), + Some(9) + ); + assert_eq!( + report.pointer("/summary/evidence_class_counts/not_tested").and_then(Value::as_u64), + Some(78) + ); + + let scenarios = support::array_at(&report, "/supported_scenarios")?; + let matrix = support::array_at(&report, "/product_matrix")?; + + for scenario in [ + "source_library_ingest_hydration", + "memory_authority_history_read_profiles", + "knowledge_workspace_pages", + "temporal_topic_graph_lite", + "dreaming_review_queue", + "recall_debug_panel", + ] { + support::find_by_field(scenarios, "/id", scenario)?; + } + + let elf = support::find_by_field(matrix, "/product", "ELF")?; + + for scenario in [ + "source_library_ingest_hydration", + "memory_authority_history_read_profiles", + "knowledge_workspace_pages", + "temporal_topic_graph_lite", + "dreaming_review_queue", + "recall_debug_panel", + ] { + assert_eq!( + elf.pointer(&format!("/statuses/{scenario}")).and_then(Value::as_str), + Some("pass") + ); + } + + let qmd = support::find_by_field(matrix, "/product", "qmd")?; + + assert_eq!( + qmd.pointer("/statuses/recall_debug_panel").and_then(Value::as_str), + Some("wrong_result") + ); + assert!( + qmd.pointer("/strongest_advantage") + .and_then(Value::as_str) + .is_some_and(|value| value.contains("weighted fusion")) + ); + + for product in ["VectifyAI PageIndex", "VectifyAI OpenKB"] { + let row = support::find_by_field(matrix, "/product", product)?; + + assert_eq!(row.pointer("/coverage").and_then(Value::as_str), Some("reference_only")); + assert_eq!( + row.pointer("/statuses/knowledge_workspace_pages").and_then(Value::as_str), + Some("not_tested") + ); + } + + assert_eq!( + report.pointer("/claim_boundaries/no_broad_superiority_claim").and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report + .pointer("/claim_boundaries/reference_only_projects_do_not_count_as_pass") + .and_then(Value::as_bool), + Some(true) + ); + assert!(support::array_contains_str( + &report, + "/source_evidence", + "https://github.com/VectifyAI/PageIndex" + )?); + assert!(support::array_contains_str( + &report, + "/source_evidence", + "https://github.com/VectifyAI/OpenKB" + )?); + + Ok(()) +} + +#[test] +fn agent_knowledge_os_closeout_benchmark_wires_docs_and_optimization_queue() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::agent_knowledge_os_closeout_benchmark_report_json_path()?, + )?)?; + let markdown = + fs::read_to_string(support::agent_knowledge_os_closeout_benchmark_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + let queue = support::array_at(&report, "/optimization_queue")?; + + for item in queue { + assert_eq!(item.pointer("/generated_from_delta").and_then(Value::as_bool), Some(true)); + } + for key in [ + "pageindex_openkb_source_library_adapter", + "qmd_retrieval_knobs_and_short_replay", + "operator_knowledge_library_ui", + "openviking_context_trajectory_artifacts", + "graph_rag_temporal_adapter_matrix", + ] { + let item = support::find_by_field(queue, "/key", key)?; + + assert_eq!(item.pointer("/generated_from_delta").and_then(Value::as_bool), Some(true)); + } + + assert!(markdown.contains("ELF is the strongest measured integrated product")); + assert!(markdown.contains("complete live coverage")); + assert!(markdown.contains("VectifyAI PageIndex")); + assert!(markdown.contains("VectifyAI OpenKB")); + assert!(markdown.contains("Do not claim ELF broadly beats every competitor")); + assert!( + benchmarking_index.contains("2026-06-20-agent-knowledge-os-closeout-benchmark-report.md") + ); + assert!(readme.contains("Agent Knowledge OS closeout after XY-1023")); + assert!(readme.contains("62 jobs, 55 pass")); + assert!(readme.contains("VectifyAI PageIndex/OpenKB")); + assert!(readme.contains("strongest measured integrated")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_competitor_strength.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_competitor_strength.rs new file mode 100644 index 00000000..76607820 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_competitor_strength.rs @@ -0,0 +1,116 @@ +use std::fs; + +use color_eyre::Result; +use serde_json::Value; + +use crate::{closeout_reports::closeout_reports_helpers, support}; + +#[test] +fn p3_competitor_strength_absorption_report_preserves_claim_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::p3_competitor_strength_absorption_report_json_path()?, + )?)?; + let markdown = + fs::read_to_string(support::p3_competitor_strength_absorption_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.p3_competitor_strength_absorption_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1072")); + assert_eq!( + report.pointer("/self_assessment/verdict").and_then(Value::as_str), + Some("pass_with_p4_queue_ready_after_main_thread_acceptance") + ); + assert_eq!( + report.pointer("/self_assessment/p4_queued_label_applied").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/self_assessment/typed_non_pass_states_are_not_wins") + .and_then(Value::as_bool), + Some(true) + ); + + let products = support::array_at(&report, "/product_strengths")?; + + for product in [ + "qmd", + "VectifyAI PageIndex", + "VectifyAI OpenKB", + "mem0/OpenMemory", + "Letta", + "Graphiti/Zep", + "OpenViking", + "RAGFlow", + "GraphRAG", + "LightRAG", + ] { + support::find_by_field(products, "/product", product)?; + } + + let qmd = support::find_by_field(products, "/product", "qmd")?; + let pageindex = support::find_by_field(products, "/product", "VectifyAI PageIndex")?; + let mem0 = support::find_by_field(products, "/product", "mem0/OpenMemory")?; + let graphiti = support::find_by_field(products, "/product", "Graphiti/Zep")?; + let lightrag = support::find_by_field(products, "/product", "LightRAG")?; + + assert_eq!(qmd.pointer("/current_status").and_then(Value::as_str), Some("mixed")); + assert!( + qmd.pointer("/remains_stronger_elsewhere") + .and_then(Value::as_str) + .is_some_and(|value| value.contains("top-k JSON")) + ); + assert_eq!(pageindex.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/current_status").and_then(Value::as_str), + Some("split_pass_and_blocked") + ); + assert_eq!(graphiti.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + lightrag.pointer("/current_status").and_then(Value::as_str), + Some("incomplete_or_not_encoded") + ); + + let queue = support::array_at(&report, "/p4_optimization_queue")?; + + for key in [ + "qmd_candidate_replay_parity", + "adapter_outcome_grammar_and_metrics", + "source_library_tree_and_wiki_adapters", + "memory_history_export_and_core_archive", + "temporal_trajectory_graph_rag_adapters", + ] { + let item = support::find_by_field(queue, "/key", key)?; + + assert_eq!( + item.pointer("/ready_after_main_thread_acceptance").and_then(Value::as_bool), + Some(true) + ); + assert_eq!(item.pointer("/queued_label_applied").and_then(Value::as_bool), Some(false)); + } + + closeout_reports_helpers::assert_product_queue_items_reference_queue(products, queue)?; + + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Typed non-pass states are not wins." + )?); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not apply decodex:queued:elf to a P4 issue until the main thread accepts the P3 closeout." + )?); + assert!(markdown.contains("P3 is decision-ready for main-thread inspection")); + assert!(markdown.contains("Typed non-pass states are not wins")); + assert!(markdown.contains("No P4 issue receives `decodex:queued:elf`")); + assert!(benchmarking_index.contains("2026-06-23-p3-competitor-strength-absorption-report.md")); + assert!(readme.contains("P3 competitor-strength absorption closeout after XY-1072")); + assert!(readme.contains("`decodex:queued:elf` label")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_graph_rag.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_graph_rag.rs new file mode 100644 index 00000000..f1b73cc6 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_graph_rag.rs @@ -0,0 +1,137 @@ +use std::fs; + +use color_eyre::Result; +use serde_json::Value; + +use crate::{closeout_reports::closeout_reports_helpers, support}; + +#[test] +fn graph_rag_citation_navigation_promotion_preserves_typed_non_passes() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::graph_rag_citation_navigation_promotion_report_json_path()?, + )?)?; + let markdown = fs::read_to_string( + support::graph_rag_citation_navigation_promotion_report_markdown_path()?, + )?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.graph_rag_citation_navigation_promotion_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-985")); + assert_eq!( + report.pointer("/command/command").and_then(Value::as_str), + Some("cargo make real-world-memory-graph-rag") + ); + assert_eq!(report.pointer("/command/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + report.pointer("/summary/overall_judgment").and_then(Value::as_str), + Some("unchanged_typed_non_pass") + ); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.25)); + assert_eq!( + report.pointer("/summary/knowledge_citation_coverage").and_then(Value::as_f64), + Some(0.667) + ); + + let scenarios = support::array_at(&report, "/scenario_outcomes")?; + let ragflow = support::find_by_field(scenarios, "/project", "RAGFlow")?; + let lightrag = support::find_by_field(scenarios, "/project", "LightRAG")?; + let graphrag = support::find_by_field(scenarios, "/project", "GraphRAG")?; + let graphiti = support::find_by_field(scenarios, "/project", "Graphiti/Zep")?; + let graphify = support::find_by_field(scenarios, "/project", "graphify")?; + let llm_wiki = support::find_by_field(scenarios, "/project", "llm-wiki")?; + let gbrain = support::find_by_field(scenarios, "/project", "gbrain")?; + + assert_eq!(ragflow.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!(lightrag.pointer("/current_status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(graphrag.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphiti.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphify.pointer("/current_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(llm_wiki.pointer("/current_status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(gbrain.pointer("/current_status").and_then(Value::as_str), Some("blocked")); + assert!(support::array_contains_str( + graphify, + "/produced_evidence", + "graphify-source-location-output" + )?); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not claim graph/RAG parity or broad graph-navigation quality." + )?); + assert!(support::array_contains_str( + &report, + "/next_optimization_direction/required_fields", + "graphrag_output_table_rows_with_generated_evidence_ids" + )?); + assert!(markdown.contains("typed non-pass, no parity claim")); + assert!( + markdown.contains("graphify produces evidence-linked output but still scores wrong_result") + ); + assert!( + benchmarking_index.contains("2026-06-19-graph-rag-citation-navigation-promotion-report.md") + ); + assert!(readme.contains("Graph/RAG Citation and Navigation Promotion Report - June 19, 2026")); + assert!(readme.contains("Graph/RAG citation/navigation promotion after XY-985")); + + Ok(()) +} + +#[test] +fn graph_rag_adapter_matrix_report_preserves_no_parity_claims() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::graph_rag_adapter_matrix_report_json_path()?, + )?)?; + let markdown = fs::read_to_string(support::graph_rag_adapter_matrix_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.graph_rag_adapter_matrix_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1071")); + assert_eq!(report.pointer("/summary/matrix_row_count").and_then(Value::as_u64), Some(18)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(6)); + assert_eq!( + report.pointer("/summary/broad_graph_rag_parity").and_then(Value::as_str), + Some("not_proven") + ); + + let rows = support::array_at(&report, "/adapter_matrix")?; + let ragflow_citation = + closeout_reports_helpers::find_matrix_row(rows, "RAGFlow", "citation_quality")?; + let lightrag_retrieval = + closeout_reports_helpers::find_matrix_row(rows, "LightRAG", "retrieval_quality")?; + let graphrag_navigation = + closeout_reports_helpers::find_matrix_row(rows, "GraphRAG", "navigation_quality")?; + let graphrag_retrieval = + closeout_reports_helpers::find_matrix_row(rows, "GraphRAG", "retrieval_quality")?; + + assert_eq!(ragflow_citation.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(lightrag_retrieval.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(graphrag_navigation.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphrag_retrieval.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not reposition ELF as a generic RAG platform from this adapter matrix." + )?); + assert!(markdown.contains("The graph/RAG comparison remains typed non-pass")); + assert!(markdown.contains("| RAGFlow | `blocked`: answer text plus selected reference chunks")); + assert!(benchmarking_index.contains("2026-06-23-graph-rag-adapter-matrix-report.md")); + assert!(readme.contains("RAGFlow/GraphRAG/LightRAG adapter matrix after XY-1071")); + assert!(readme.contains("Graph/RAG Adapter Matrix Report - June 23, 2026")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_helpers.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_helpers.rs new file mode 100644 index 00000000..289ab6c4 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_helpers.rs @@ -0,0 +1,43 @@ +use color_eyre::{Result, eyre}; +use serde_json::Value; + +pub(super) fn assert_product_queue_items_reference_queue( + products: &[Value], + queue: &[Value], +) -> Result<()> { + let queue_keys = queue + .iter() + .filter_map(|item| item.pointer("/key").and_then(Value::as_str)) + .collect::>(); + + for product in products { + let product_name = product + .pointer("/product") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("product row is missing product name"))?; + let queue_item = product + .pointer("/p4_queue_item") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("product {product_name} is missing p4_queue_item"))?; + + assert!( + queue_keys.contains(&queue_item), + "product {product_name} references missing P4 queue item {queue_item}" + ); + } + + Ok(()) +} + +pub(super) fn find_matrix_row<'a>( + rows: &'a [Value], + adapter: &str, + dimension: &str, +) -> Result<&'a Value> { + rows.iter() + .find(|row| { + row.pointer("/adapter").and_then(Value::as_str) == Some(adapter) + && row.pointer("/dimension").and_then(Value::as_str) == Some(dimension) + }) + .ok_or_else(|| eyre::eyre!("missing matrix row for {adapter} {dimension}")) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_openmemory.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_openmemory.rs new file mode 100644 index 00000000..e48f4caa --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_openmemory.rs @@ -0,0 +1,86 @@ +use std::fs; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn openmemory_ui_export_product_recheck_preserves_blocked_boundary() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::openmemory_ui_export_product_readback_report_json_path()?, + )?)?; + let markdown = + fs::read_to_string(support::openmemory_ui_export_product_readback_report_markdown_path()?)?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.openmemory_ui_export_product_recheck_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-987")); + assert_eq!( + report.pointer("/command/command").and_then(Value::as_str), + Some("cargo make openmemory-ui-export-readback") + ); + assert_eq!(report.pointer("/command/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + report.pointer("/command/probe_artifact").and_then(Value::as_str), + Some("tmp/live-baseline/mem0-openmemory-ui-export.json") + ); + assert_eq!(report.pointer("/run/sdk_check_summary/pass").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/run/ui_export_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + report.pointer("/run/ui_export_reason_code").and_then(Value::as_str), + Some("DOCKER_UNAVAILABLE_IN_BASELINE_RUNNER") + ); + assert_eq!( + report + .pointer("/same_corpus_boundary/sdk_get_all_is_ui_export_evidence") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/openmemory_product_surface/export_requires_running_container") + .and_then(Value::as_bool), + Some(true) + ); + assert!( + report + .pointer("/openmemory_probe/attempt/output_excerpt") + .and_then(Value::as_str) + .is_some_and(|excerpt| excerpt.contains("docker: command not found") + && excerpt.contains("Container 'openmemory-openmemory-mcp-1' not found/running")) + ); + assert_eq!( + report.pointer("/classification/comparison_judgment").and_then(Value::as_str), + Some("unchanged") + ); + assert_eq!( + report + .pointer("/claim_boundary/product_browser_or_dashboard_readback_reached") + .and_then(Value::as_bool), + Some(false) + ); + assert!(support::array_contains_str( + &report, + "/improvement_regression_readback/unchanged", + "OpenMemory product UI/export readback remains blocked before same-corpus product app database validation." + )?); + assert!(support::array_contains_str( + &report, + "/next_optimization_direction/required_fields", + "same_corpus_import_into_openmemory_app_database" + )?); + assert!(markdown.contains("OpenMemory UI/export product-readback status is unchanged")); + assert!(markdown.contains("Product browser/dashboard readback reached")); + assert!( + benchmarking_index.contains("2026-06-19-openmemory-ui-export-product-readback-report.md") + ); + assert!(readme.contains("OpenMemory UI/Export Product Readback Report - June 19, 2026")); + assert!(readme.contains("OpenMemory UI/export product recheck after XY-987")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_workspace.rs b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_workspace.rs new file mode 100644 index 00000000..382c8ba6 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/closeout_reports_workspace.rs @@ -0,0 +1,202 @@ +use std::fs; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn p2_knowledge_workspace_closeout_preserves_pageindex_openkb_boundaries() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::p2_knowledge_workspace_pageindex_openkb_closeout_report_json_path()?, + )?)?; + let markdown = fs::read_to_string( + support::p2_knowledge_workspace_pageindex_openkb_closeout_report_markdown_path()?, + )?; + let makefile = fs::read_to_string(support::workspace_root()?.join("Makefile.toml"))?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + let benchmark_runbook = fs::read_to_string( + support::workspace_root()? + .join("docs") + .join("runbook") + .join("benchmarking") + .join("real_world_agent_memory_benchmark.md"), + )?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.p2_knowledge_workspace_pageindex_openkb_closeout_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-1066")); + assert_eq!( + report.pointer("/self_assessment/verdict").and_then(Value::as_str), + Some("pass_with_reference_only_competitor_boundary") + ); + assert_eq!(report.pointer("/typed_state_summary/pass").and_then(Value::as_u64), Some(2)); + assert_eq!( + report.pointer("/typed_state_summary/wrong_result").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/typed_state_summary/incomplete").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/typed_state_summary/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/typed_state_summary/not_tested").and_then(Value::as_u64), Some(2)); + + let results = support::array_at(&report, "/elf_same_corpus_results")?; + let source_library = support::find_by_field(results, "/suite", "source_library")?; + let knowledge = support::find_by_field(results, "/suite", "knowledge_compilation")?; + + assert_eq!(source_library.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(source_library.pointer("/jobs").and_then(Value::as_u64), Some(2)); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge.pointer("/jobs").and_then(Value::as_u64), Some(3)); + assert!(support::array_contains_str( + knowledge, + "/coverage", + "Changed-source watch/rebuild reports changed, stale, and reviewable memory-candidate outputs without source mutation." + )?); + + let matrix = support::array_at(&report, "/comparison_matrix")?; + let pageindex = support::find_by_field(matrix, "/target", "VectifyAI PageIndex")?; + let openkb = support::find_by_field(matrix, "/target", "VectifyAI OpenKB")?; + let p3 = support::find_by_field(matrix, "/target", "P3 PageIndex/OpenKB adapter queue")?; + + assert_eq!(pageindex.pointer("/status").and_then(Value::as_str), Some("not_tested")); + assert_eq!(openkb.pointer("/status").and_then(Value::as_str), Some("not_tested")); + assert_eq!(p3.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + report + .pointer("/p3_queue_decision/ready_to_queue_after_main_thread_acceptance") + .and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report.pointer("/p3_queue_decision/queued_label_applied").and_then(Value::as_bool), + Some(false) + ); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not claim ELF beats PageIndex or OpenKB." + )?); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not queue a P3 issue in this lane." + )?); + assert!(markdown.contains("P2 Knowledge Workspace PageIndex/OpenKB Closeout Report")); + assert!(markdown.contains("VectifyAI PageIndex")); + assert!(markdown.contains("VectifyAI OpenKB")); + assert!(markdown.contains("This report does not apply `decodex:queued:elf`")); + assert!(makefile.contains("[tasks.real-world-memory-p2-knowledge-closeout]")); + assert!(makefile.contains("\"real-world-memory-source-library-report\"")); + assert!(makefile.contains("\"real-world-memory-knowledge-report\"")); + assert!( + benchmarking_index + .contains("2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md") + ); + assert!(readme.contains("P2 Knowledge Workspace PageIndex/OpenKB closeout after XY-1066")); + assert!(readme.contains("real-world-memory-p2-knowledge-closeout")); + assert!(benchmark_runbook.contains("cargo make real-world-memory-p2-knowledge-closeout")); + + Ok(()) +} + +#[test] +fn operator_approved_public_proxy_private_addendum_preserves_boundary() -> Result<()> { + let report = serde_json::from_str::(&fs::read_to_string( + support::operator_approved_public_proxy_private_addendum_report_json_path()?, + )?)?; + let markdown = fs::read_to_string( + support::operator_approved_public_proxy_private_addendum_report_markdown_path()?, + )?; + let benchmarking_index = fs::read_to_string(support::benchmarking_index_path()?)?; + let readme = fs::read_to_string(support::readme_path()?)?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.operator_approved_public_proxy_baseline_report/v1") + ); + assert_eq!(report.pointer("/authority").and_then(Value::as_str), Some("XY-930")); + assert_eq!(report.pointer("/command/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + report.pointer("/command/run_id").and_then(Value::as_str), + Some("live-baseline-20260619143959") + ); + assert_eq!( + report.pointer("/corpus/profile").and_then(Value::as_str), + Some("production-private") + ); + assert_eq!( + report.pointer("/corpus/runner_track").and_then(Value::as_str), + Some("private_production") + ); + assert_eq!( + report.pointer("/corpus/manifest_kind").and_then(Value::as_str), + Some("operator_approved_public_proxy") + ); + assert_eq!( + report.pointer("/corpus/manifest_id").and_then(Value::as_str), + Some("operator-approved-public-proxy-prod-corpus-2026-06-19") + ); + assert_eq!(report.pointer("/embedding/mode").and_then(Value::as_str), Some("local")); + assert_eq!( + report.pointer("/embedding/provider_backed_quality_proven").and_then(Value::as_bool), + Some(false) + ); + assert_eq!(report.pointer("/summary/project_status").and_then(Value::as_str), Some("pass")); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/check_summary/total").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/check_summary/pass").and_then(Value::as_u64), Some(8)); + assert_eq!( + report.pointer("/query_summary/wrong_result_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/backfill/completed_count").and_then(Value::as_u64), Some(12)); + assert_eq!(report.pointer("/backfill/duplicate_source_notes").and_then(Value::as_u64), Some(0)); + + let queries = support::array_at(&report, "/queries")?; + let provider = support::find_by_field(queries, "/id", "q-explain-provider-blocker")?; + + assert_eq!(queries.len(), 8); + assert_eq!( + provider.pointer("/top_evidence").and_then(Value::as_str), + Some("blocker-provider-missing") + ); + assert_eq!(provider.pointer("/matched").and_then(Value::as_bool), Some(true)); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not call this real private-corpus production proof." + )?); + assert!(support::array_contains_str( + &report, + "/claim_boundaries/not_allowed", + "Do not claim provider-backed production quality; embedding mode was local." + )?); + assert!(support::array_contains_str( + &report, + "/improvement_regression_readback/unchanged", + "Real private-corpus production quality is still not proven." + )?); + assert!(support::array_contains_str( + &report, + "/next_optimization_direction/when_operator_inputs_exist", + "Run provider-backed embeddings with ELF_BASELINE_ELF_EMBEDDING_MODE=provider and a routed provider setup." + )?); + assert!(markdown.contains("proxy corpus pass")); + assert!(markdown.contains("Do not call this real private-corpus production proof.")); + assert!(markdown.contains("| Embedding mode | `local` |")); + assert!( + benchmarking_index + .contains("2026-06-19-operator-approved-public-proxy-production-private-addendum.md") + ); + assert!(benchmarking_index.contains("not real private-corpus or provider-backed proof")); + assert!(readme.contains("Operator-approved public-proxy addendum after XY-930")); + assert!(readme.contains("8/8 query passes")); + assert!(readme.contains("does not prove real private-corpus production quality")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters.rs index 9bba3e95..b6feede9 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters.rs @@ -1,3 +1,9 @@ +mod external_adapters_first_generation; +mod external_adapters_fixture; +mod external_adapters_graph_gates; +mod external_adapters_letta; +mod external_adapters_live_sweep; +mod external_adapters_operator_debug; mod graph_rag; mod loss_summary; mod manifest_summary; @@ -45,7 +51,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { support::find_by_field(adapters, "/adapter_id", "openviking_deep_profile_gate")?; let letta = support::find_by_field(adapters, "/adapter_id", "letta_research_gate")?; - assert_elf_fixture_adapter_record(elf)?; + external_adapters_fixture::assert_elf_fixture_adapter_record(elf)?; assert_eq!( elf_live.pointer("/evidence_class").and_then(Value::as_str), @@ -53,13 +59,16 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { ); assert_eq!(elf_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); - assert_live_sweep_record(elf_live, "blocked")?; - assert_operator_debug_live_adapter_records(elf_operator_debug, qmd_operator_debug)?; + external_adapters_live_sweep::assert_live_sweep_record(elf_live, "blocked")?; + external_adapters_operator_debug::assert_operator_debug_live_adapter_records( + elf_operator_debug, + qmd_operator_debug, + )?; assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("pass")); assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); - assert_qmd_live_baseline_record(qmd); + external_adapters_fixture::assert_qmd_live_baseline_record(qmd); assert_eq!( qmd_live.pointer("/evidence_class").and_then(Value::as_str), @@ -67,20 +76,26 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { ); assert_eq!(qmd_live.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); - assert_live_sweep_record(qmd_live, "blocked")?; + external_adapters_live_sweep::assert_live_sweep_record(qmd_live, "blocked")?; assert_eq!( agentmemory.pointer("/capabilities/1/status").and_then(Value::as_str), Some("mocked") ); - assert_first_generation_adapter_records(agentmemory, mem0, memsearch, claude_mem); + external_adapters_first_generation::assert_first_generation_adapter_records( + agentmemory, + mem0, + memsearch, + claude_mem, + ); assert_eq!(openviking.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); - assert_graph_rag_research_gate_records(ragflow, lightrag, graphrag); - assert_graphiti_zep_adapter(graphiti_zep); - + external_adapters_graph_gates::assert_graph_rag_research_gate_records( + ragflow, lightrag, graphrag, + ); + external_adapters_graph_gates::assert_graphiti_zep_adapter(graphiti_zep); graph_rag::assert_graphify_adapter(graphify)?; graph_rag::assert_graph_rag_representative_scenarios( ragflow, @@ -89,9 +104,8 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { graphiti_zep, graphify, )?; - - assert_letta_core_archival_gate(letta)?; - assert_qmd_deep_profile_gate(qmd_deep); + external_adapters_letta::assert_letta_core_archival_gate(letta)?; + external_adapters_fixture::assert_qmd_deep_profile_gate(qmd_deep); assert_eq!( qmd_deep.pointer("/capabilities/2/status").and_then(Value::as_str), @@ -106,7 +120,7 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Some("docker_local_embed_context_trajectory_gate") ); - assert_openviking_deep_profile_gate(openviking_deep); + external_adapters_fixture::assert_openviking_deep_profile_gate(openviking_deep); assert_eq!( openviking_deep.pointer("/result/artifact").and_then(Value::as_str), @@ -115,597 +129,3 @@ fn assert_external_adapter_manifest_records(report: &Value) -> Result<()> { Ok(()) } - -fn assert_graph_rag_research_gate_records(ragflow: &Value, lightrag: &Value, graphrag: &Value) { - assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); - assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str), - Some( - "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" - ) - ); - assert_eq!( - ragflow.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make smoke-ragflow-docker") - ); - assert_eq!( - ragflow.pointer("/result/artifact").and_then(Value::as_str), - Some("tmp/real-world-memory/ragflow-smoke/ragflow-report.json") - ); - assert_eq!( - ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str), - Some("https://github.com/infiniflow/ragflow") - ); - assert_eq!(lightrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); - assert_eq!(lightrag.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - lightrag.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make smoke-lightrag-docker-context") - ); - assert_eq!( - lightrag.pointer("/run/command").and_then(Value::as_str), - Some("ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context") - ); - assert_eq!( - lightrag.pointer("/capabilities/3/status").and_then(Value::as_str), - Some("not_encoded") - ); - assert_eq!(graphrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); - assert_eq!( - graphrag.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make smoke-graphrag-docker") - ); - assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); -} - -fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); - assert!( - adapter - .pointer("/setup/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("smoke-letta-core-archive-export-readback") - && evidence.contains("Docker-only benchmark-created agent export/readback")) - ); - assert_eq!( - adapter.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make smoke-letta-core-archive-export-readback") - ); - assert_eq!( - adapter.pointer("/run/command").and_then(Value::as_str), - Some( - "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback" - ) - ); - assert!(adapter.pointer("/execution_metadata/setup_path").and_then(Value::as_str).is_some_and( - |setup| setup.contains("exports core block JSON plus archival search/readback JSON") - && setup.contains("typed artifact") - )); - - let suites = support::array_at(adapter, "/suites")?; - let core_suite = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; - - assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - adapter.pointer("/capabilities/2/capability").and_then(Value::as_str), - Some("real_world_job_adapter") - ); - assert_eq!(adapter.pointer("/capabilities/2/status").and_then(Value::as_str), Some("blocked")); - - let scenarios = support::array_at(adapter, "/scenarios")?; - let attachment = - support::find_by_field(scenarios, "/scenario_id", "core_block_attachment_readback")?; - let scope = support::find_by_field(scenarios, "/scenario_id", "core_block_scope_readback")?; - let provenance = - support::find_by_field(scenarios, "/scenario_id", "core_block_provenance_readback")?; - let stale = support::find_by_field(scenarios, "/scenario_id", "stale_core_detection")?; - let fallback = support::find_by_field(scenarios, "/scenario_id", "archival_fallback_readback")?; - let decision = support::find_by_field( - scenarios, - "/scenario_id", - "core_archival_project_decision_recovery", - )?; - - assert_eq!(scenarios.len(), 6); - - for scenario in [attachment, scope, provenance, stale, fallback, decision] { - assert_eq!(scenario.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(scenario.pointer("/elf_position").and_then(Value::as_str), Some("untested")); - assert_eq!( - scenario.pointer("/comparison_outcome").and_then(Value::as_str), - Some("blocked") - ); - assert_eq!( - scenario.pointer("/command").and_then(Value::as_str), - Some("cargo make smoke-letta-core-archive-export-readback") - ); - assert_eq!( - scenario.pointer("/artifact").and_then(Value::as_str), - Some("tmp/real-world-memory/letta-core-archive/summary.json") - ); - } - - assert_eq!(attachment.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); - assert_eq!(stale.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); - assert_eq!(fallback.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); - - Ok(()) -} - -fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { - assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); - assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("82 jobs across 19 suites") - && evidence.contains("75 pass") - && evidence.contains("7 blocked") - && evidence.contains("core_archival_memory") - && evidence.contains("memory_summary") - && evidence.contains("proactive_brief") - && evidence.contains("scheduled_memory") - && evidence.contains("context_trajectory") - })); - - let suites = support::array_at(adapter, "/suites")?; - let core_archival = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; - let scheduled = support::find_by_field(suites, "/suite_id", "scheduled_memory")?; - let context_trajectory = support::find_by_field(suites, "/suite_id", "context_trajectory")?; - - assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); - assert!(core_archival.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("core block attachment") - && evidence.contains("project-decision recovery") - && evidence.contains("archival note search") - })); - assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert!(scheduled.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("4 passing source-linked task readbacks") - && evidence.contains("private/provider scheduler blocker") - })); - assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert!( - adapter - .pointer("/notes/1") - .and_then(Value::as_str) - .is_some_and(|note| note.contains("OpenViking context-trajectory measurement gates")) - ); - - Ok(()) -} - -fn assert_qmd_deep_profile_gate(adapter: &Value) { - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("not_encoded")); -} - -fn assert_qmd_live_baseline_record(adapter: &Value) { - let result_evidence = adapter.pointer("/result/evidence").and_then(Value::as_str); - let retrieval_evidence = adapter.pointer("/suites/0/evidence").and_then(Value::as_str); - - assert!(result_evidence.is_some_and(|evidence| { - evidence.contains("This live_baseline_only record is same-corpus evidence only") - && evidence.contains("cite qmd_live_real_world for the full live real-world sweep") - && !evidence.contains("no real_world_job qmd adapter is encoded yet") - })); - assert!(retrieval_evidence.is_some_and(|evidence| { - evidence.contains("does not execute real_world_job retrieval prompts") - && evidence.contains("cite qmd_live_real_world for the live retrieval adapter run") - && !evidence.contains("no real_world_job retrieval adapter run is encoded") - })); -} - -fn assert_operator_debug_live_adapter_records(elf: &Value, qmd: &Value) -> Result<()> { - assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); - assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("pass")); - assert_eq!( - elf.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make real-world-job-operator-ux-live-adapters") - ); - assert_eq!( - elf.pointer("/suites/0/suite_id").and_then(Value::as_str), - Some("operator_debugging_ux") - ); - assert_eq!(elf.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - elf.pointer("/capabilities/1/capability").and_then(Value::as_str), - Some("trace_hydration_metadata") - ); - assert_eq!(elf.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - elf.pointer("/capabilities/2/capability").and_then(Value::as_str), - Some("replay_command_metadata") - ); - assert_eq!(elf.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - elf.pointer("/capabilities/3/capability").and_then(Value::as_str), - Some("candidate_drop_visibility") - ); - assert_eq!(elf.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - elf.pointer("/capabilities/4/capability").and_then(Value::as_str), - Some("openmemory_or_claude_mem_ui_runner") - ); - assert_eq!(elf.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); - - let elf_scenarios = support::array_at(elf, "/scenarios")?; - let elf_trace = - support::find_by_field(elf_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; - let elf_replay = - support::find_by_field(elf_scenarios, "/scenario_id", "operator_debug_replay_command")?; - let elf_candidate = support::find_by_field( - elf_scenarios, - "/scenario_id", - "operator_debug_candidate_drop_visibility", - )?; - let elf_repair = support::find_by_field( - elf_scenarios, - "/scenario_id", - "operator_debug_repair_action_clarity", - )?; - let elf_selected = support::find_by_field( - elf_scenarios, - "/scenario_id", - "operator_debug_selected_but_not_narrated", - )?; - - assert_eq!(elf_scenarios.len(), 5); - assert_eq!(elf_trace.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(elf_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); - assert_eq!(elf_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(elf_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); - assert_eq!(elf_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(elf_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); - - assert_operator_debug_qmd_adapter_record(qmd)?; - - assert!(support::array_at(elf, "/notes")?.iter().any(|note| { - note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) - })); - assert!(support::array_at(qmd, "/notes")?.iter().any(|note| { - note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) - })); - - Ok(()) -} - -fn assert_operator_debug_qmd_adapter_record(qmd: &Value) -> Result<()> { - assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); - assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - qmd.pointer("/suites/0/suite_id").and_then(Value::as_str), - Some("operator_debugging_ux") - ); - assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - qmd.pointer("/capabilities/1/capability").and_then(Value::as_str), - Some("local_replay_command_metadata") - ); - assert_eq!(qmd.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - qmd.pointer("/capabilities/2/capability").and_then(Value::as_str), - Some("trace_hydration_metadata") - ); - assert_eq!(qmd.pointer("/capabilities/2/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - qmd.pointer("/capabilities/3/capability").and_then(Value::as_str), - Some("candidate_drop_visibility") - ); - assert_eq!(qmd.pointer("/capabilities/3/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(qmd.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); - - let qmd_scenarios = support::array_at(qmd, "/scenarios")?; - let qmd_trace = - support::find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; - let qmd_replay = - support::find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_replay_command")?; - let qmd_candidate = support::find_by_field( - qmd_scenarios, - "/scenario_id", - "operator_debug_candidate_drop_visibility", - )?; - let qmd_repair = support::find_by_field( - qmd_scenarios, - "/scenario_id", - "operator_debug_repair_action_clarity", - )?; - let qmd_selected = support::find_by_field( - qmd_scenarios, - "/scenario_id", - "operator_debug_selected_but_not_narrated", - )?; - - assert_eq!(qmd_scenarios.len(), 5); - assert_eq!(qmd_trace.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(qmd_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); - assert_eq!(qmd_replay.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(qmd_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(qmd_candidate.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(qmd_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); - assert_eq!(qmd_repair.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(qmd_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); - assert_eq!(qmd_selected.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(qmd_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); - - Ok(()) -} - -fn assert_openviking_deep_profile_gate(adapter: &Value) { - let trajectory_evidence = adapter.pointer("/capabilities/1/evidence").and_then(Value::as_str); - - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); - assert!(trajectory_evidence.is_some_and(|evidence| { - evidence.contains("evidence-bearing same-corpus output") - && evidence.contains("selected hierarchy/expansion artifacts") - && !evidence.contains("setup reaches runnable OpenViking APIs") - })); -} - -fn assert_first_generation_adapter_records( - agentmemory: &Value, - mem0: &Value, - memsearch: &Value, - claude_mem: &Value, -) { - assert_agentmemory_first_generation_records(agentmemory); - assert_mem0_first_generation_records(mem0); - assert_memsearch_first_generation_records(memsearch); - assert_claude_mem_first_generation_records(claude_mem); -} - -fn assert_agentmemory_first_generation_records(agentmemory: &Value) { - assert_eq!( - agentmemory.pointer("/scenarios/1/status").and_then(Value::as_str), - Some("lifecycle_fail") - ); - assert_eq!( - agentmemory.pointer("/scenarios/1/elf_position").and_then(Value::as_str), - Some("wins") - ); - assert_eq!(agentmemory.pointer("/scenarios/2/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - agentmemory.pointer("/scenarios/2/comparison_outcome").and_then(Value::as_str), - Some("blocked") - ); -} - -fn assert_mem0_first_generation_records(mem0: &Value) { - assert_eq!( - mem0.pointer("/capabilities/2/capability").and_then(Value::as_str), - Some("local_lifecycle_update_delete_reload") - ); - assert_eq!(mem0.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - mem0.pointer("/capabilities/3/capability").and_then(Value::as_str), - Some("preference_correction_history") - ); - assert_eq!(mem0.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - mem0.pointer("/capabilities/7/capability").and_then(Value::as_str), - Some("openmemory_ui_readback") - ); - assert_eq!(mem0.pointer("/capabilities/7/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - mem0.pointer("/capabilities/8/capability").and_then(Value::as_str), - Some("hosted_managed_memory_claims") - ); - assert_eq!(mem0.pointer("/capabilities/8/status").and_then(Value::as_str), Some("unsupported")); - assert_eq!(mem0.pointer("/scenarios/0/status").and_then(Value::as_str), Some("pass")); - assert_eq!(mem0.pointer("/scenarios/0/elf_position").and_then(Value::as_str), Some("ties")); - assert_eq!( - mem0.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), - Some("preference_correction_history") - ); - assert_eq!(mem0.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - mem0.pointer("/scenarios/1/comparison_outcome").and_then(Value::as_str), - Some("loss") - ); - assert_eq!( - mem0.pointer("/scenarios/5/scenario_id").and_then(Value::as_str), - Some("openmemory_ui_export_readback") - ); - assert_eq!(mem0.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - mem0.pointer("/scenarios/5/command").and_then(Value::as_str), - Some("cargo make openmemory-ui-export-readback") - ); - assert_eq!( - mem0.pointer("/scenarios/5/artifact").and_then(Value::as_str), - Some("tmp/live-baseline/mem0-openmemory-ui-export.json") - ); - assert!( - mem0.pointer("/capabilities/7/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("export-helper setup probe") - && evidence.contains("requires Docker access")) - ); - assert_eq!( - mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str), - Some("non_goal") - ); -} - -fn assert_memsearch_first_generation_records(memsearch: &Value) { - assert_eq!( - memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str), - Some("reindex_update_delete_reload") - ); - assert_eq!(memsearch.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - memsearch.pointer("/scenarios/0/scenario_id").and_then(Value::as_str), - Some("canonical_markdown_reindex_reload") - ); - assert_eq!( - memsearch.pointer("/scenarios/0/elf_position").and_then(Value::as_str), - Some("untested") - ); - assert_eq!(memsearch.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); - assert!(memsearch.pointer("/suites/0/evidence").and_then(Value::as_str).is_some_and( - |evidence| evidence.contains("fixture-backed source-of-truth prompt coverage") - && evidence.contains("No live memsearch runtime adapter executes prompt scoring yet") - && evidence.contains("not a suite pass") - )); - assert_eq!(memsearch.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); - assert!(memsearch.pointer("/suites/1/evidence").and_then(Value::as_str).is_some_and( - |evidence| evidence.contains("fixture-backed retrieval-debug prompt coverage") - && evidence.contains( - "No live memsearch runtime adapter executes retrieval prompt scoring yet" - ) && evidence.contains("not a suite pass") - )); - assert_eq!(memsearch.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - memsearch.pointer("/scenarios/1/elf_position").and_then(Value::as_str), - Some("untested") - ); - assert_eq!( - memsearch.pointer("/scenarios/3/status").and_then(Value::as_str), - Some("unsupported") - ); - assert_eq!( - memsearch.pointer("/capabilities/4/capability").and_then(Value::as_str), - Some("markdown_source_store_prompt_jobs") - ); - assert_eq!(memsearch.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); -} - -fn assert_claude_mem_first_generation_records(claude_mem: &Value) { - assert_eq!(claude_mem.pointer("/capabilities/1/status").and_then(Value::as_str), Some("real")); - assert_eq!( - claude_mem.pointer("/capabilities/3/capability").and_then(Value::as_str), - Some("repository_progressive_disclosure") - ); - assert_eq!(claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - claude_mem.pointer("/capabilities/6/status").and_then(Value::as_str), - Some("blocked") - ); - assert_eq!(claude_mem.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(claude_mem.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); - assert!( - claude_mem - .pointer("/suites/1/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("fixture-backed progressive-disclosure") - && evidence.contains("viewer/operator workflow remains blocked")) - ); - assert_eq!(claude_mem.pointer("/suites/2/status").and_then(Value::as_str), Some("blocked")); - assert!( - claude_mem - .pointer("/suites/2/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("hook capture remains blocked")) - ); - assert_eq!( - claude_mem.pointer("/scenarios/0/status").and_then(Value::as_str), - Some("wrong_result") - ); - assert_eq!( - claude_mem.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), - Some("retrieval_repair_artifact_path") - ); - assert_eq!( - claude_mem.pointer("/scenarios/1/status").and_then(Value::as_str), - Some("wrong_result") - ); - assert!( - claude_mem - .pointer("/scenarios/1/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("rerun/inspection targets") - && evidence.contains("tmp/live-baseline/claude-mem-checks.json")) - ); - assert_eq!(claude_mem.pointer("/scenarios/2/status").and_then(Value::as_str), Some("pass")); - assert_eq!(claude_mem.pointer("/scenarios/4/status").and_then(Value::as_str), Some("pass")); - assert_eq!(claude_mem.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); -} - -fn assert_graphiti_zep_adapter(adapter: &Value) { - assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - adapter.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make smoke-graphiti-zep-docker-temporal") - ); - assert_eq!( - adapter.pointer("/run/command").and_then(Value::as_str), - Some( - "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal" - ) - ); - assert_eq!( - adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), - Some("memory_evolution") - ); - assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), - Some( - "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" - ) - ); -} -fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { - let suites = support::array_at(adapter, "/suites")?; - let capabilities = support::array_at(adapter, "/capabilities")?; - let adapter_id = adapter.pointer("/adapter_id").and_then(Value::as_str).unwrap_or_default(); - let targeted = support::find_by_field(capabilities, "/capability", "targeted_live_pass")?; - let full_pass = support::find_by_field(capabilities, "/capability", "full_suite_live_pass")?; - let work_resume = support::find_by_field(suites, "/suite_id", "work_resume")?; - let memory_evolution = support::find_by_field(suites, "/suite_id", "memory_evolution")?; - let production_ops = support::find_by_field(suites, "/suite_id", "production_ops")?; - let consolidation = support::find_by_field(suites, "/suite_id", "consolidation")?; - let knowledge = support::find_by_field(suites, "/suite_id", "knowledge_compilation")?; - let operator_debug = support::find_by_field(suites, "/suite_id", "operator_debugging_ux")?; - let capture = support::find_by_field(suites, "/suite_id", "capture_integration")?; - let personalization = support::find_by_field(suites, "/suite_id", "personalization")?; - let core_archival = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; - let context_trajectory = support::find_by_field(suites, "/suite_id", "context_trajectory")?; - let trust_sot = support::find_by_field(suites, "/suite_id", "trust_source_of_truth")?; - let retrieval = support::find_by_field(suites, "/suite_id", "retrieval")?; - let project_decisions = support::find_by_field(suites, "/suite_id", "project_decisions")?; - - assert_eq!(suites.len(), 13); - assert_eq!(targeted.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(full_pass.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert!( - adapter - .pointer("/result/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("55 jobs across all 13 checked-in suites")) - ); - assert_eq!(trust_sot.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(work_resume.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - production_ops.pointer("/status").and_then(Value::as_str), - Some(production_ops_status) - ); - - if adapter_id == "elf_live_real_world" { - assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); - assert!( - capture - .pointer("/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("4/4 capture_integration jobs")) - ); - } else { - assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - } - - assert_eq!(personalization.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); - - Ok(()) -} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_first_generation.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_first_generation.rs new file mode 100644 index 00000000..53582147 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_first_generation.rs @@ -0,0 +1,180 @@ +use serde_json::Value; + +pub(super) fn assert_first_generation_adapter_records( + agentmemory: &Value, + mem0: &Value, + memsearch: &Value, + claude_mem: &Value, +) { + assert_agentmemory_first_generation_records(agentmemory); + assert_mem0_first_generation_records(mem0); + assert_memsearch_first_generation_records(memsearch); + assert_claude_mem_first_generation_records(claude_mem); +} + +pub(super) fn assert_agentmemory_first_generation_records(agentmemory: &Value) { + assert_eq!( + agentmemory.pointer("/scenarios/1/status").and_then(Value::as_str), + Some("lifecycle_fail") + ); + assert_eq!( + agentmemory.pointer("/scenarios/1/elf_position").and_then(Value::as_str), + Some("wins") + ); + assert_eq!(agentmemory.pointer("/scenarios/2/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + agentmemory.pointer("/scenarios/2/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); +} + +pub(super) fn assert_mem0_first_generation_records(mem0: &Value) { + assert_eq!( + mem0.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("local_lifecycle_update_delete_reload") + ); + assert_eq!(mem0.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("preference_correction_history") + ); + assert_eq!(mem0.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/capabilities/7/capability").and_then(Value::as_str), + Some("openmemory_ui_readback") + ); + assert_eq!(mem0.pointer("/capabilities/7/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/capabilities/8/capability").and_then(Value::as_str), + Some("hosted_managed_memory_claims") + ); + assert_eq!(mem0.pointer("/capabilities/8/status").and_then(Value::as_str), Some("unsupported")); + assert_eq!(mem0.pointer("/scenarios/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!(mem0.pointer("/scenarios/0/elf_position").and_then(Value::as_str), Some("ties")); + assert_eq!( + mem0.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), + Some("preference_correction_history") + ); + assert_eq!(mem0.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + mem0.pointer("/scenarios/1/comparison_outcome").and_then(Value::as_str), + Some("loss") + ); + assert_eq!( + mem0.pointer("/scenarios/5/scenario_id").and_then(Value::as_str), + Some("openmemory_ui_export_readback") + ); + assert_eq!(mem0.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + mem0.pointer("/scenarios/5/command").and_then(Value::as_str), + Some("cargo make openmemory-ui-export-readback") + ); + assert_eq!( + mem0.pointer("/scenarios/5/artifact").and_then(Value::as_str), + Some("tmp/live-baseline/mem0-openmemory-ui-export.json") + ); + assert!( + mem0.pointer("/capabilities/7/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("export-helper setup probe") + && evidence.contains("requires Docker access")) + ); + assert_eq!( + mem0.pointer("/scenarios/6/comparison_outcome").and_then(Value::as_str), + Some("non_goal") + ); +} + +pub(super) fn assert_memsearch_first_generation_records(memsearch: &Value) { + assert_eq!( + memsearch.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("reindex_update_delete_reload") + ); + assert_eq!(memsearch.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memsearch.pointer("/scenarios/0/scenario_id").and_then(Value::as_str), + Some("canonical_markdown_reindex_reload") + ); + assert_eq!( + memsearch.pointer("/scenarios/0/elf_position").and_then(Value::as_str), + Some("untested") + ); + assert_eq!(memsearch.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert!(memsearch.pointer("/suites/0/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("fixture-backed source-of-truth prompt coverage") + && evidence.contains("No live memsearch runtime adapter executes prompt scoring yet") + && evidence.contains("not a suite pass") + )); + assert_eq!(memsearch.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); + assert!(memsearch.pointer("/suites/1/evidence").and_then(Value::as_str).is_some_and( + |evidence| evidence.contains("fixture-backed retrieval-debug prompt coverage") + && evidence.contains( + "No live memsearch runtime adapter executes retrieval prompt scoring yet" + ) && evidence.contains("not a suite pass") + )); + assert_eq!(memsearch.pointer("/scenarios/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + memsearch.pointer("/scenarios/1/elf_position").and_then(Value::as_str), + Some("untested") + ); + assert_eq!( + memsearch.pointer("/scenarios/3/status").and_then(Value::as_str), + Some("unsupported") + ); + assert_eq!( + memsearch.pointer("/capabilities/4/capability").and_then(Value::as_str), + Some("markdown_source_store_prompt_jobs") + ); + assert_eq!(memsearch.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); +} + +pub(super) fn assert_claude_mem_first_generation_records(claude_mem: &Value) { + assert_eq!(claude_mem.pointer("/capabilities/1/status").and_then(Value::as_str), Some("real")); + assert_eq!( + claude_mem.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("repository_progressive_disclosure") + ); + assert_eq!(claude_mem.pointer("/capabilities/4/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + claude_mem.pointer("/capabilities/6/status").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(claude_mem.pointer("/suites/0/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(claude_mem.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/suites/1/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("fixture-backed progressive-disclosure") + && evidence.contains("viewer/operator workflow remains blocked")) + ); + assert_eq!(claude_mem.pointer("/suites/2/status").and_then(Value::as_str), Some("blocked")); + assert!( + claude_mem + .pointer("/suites/2/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("hook capture remains blocked")) + ); + assert_eq!( + claude_mem.pointer("/scenarios/0/status").and_then(Value::as_str), + Some("wrong_result") + ); + assert_eq!( + claude_mem.pointer("/scenarios/1/scenario_id").and_then(Value::as_str), + Some("retrieval_repair_artifact_path") + ); + assert_eq!( + claude_mem.pointer("/scenarios/1/status").and_then(Value::as_str), + Some("wrong_result") + ); + assert!( + claude_mem + .pointer("/scenarios/1/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("rerun/inspection targets") + && evidence.contains("tmp/live-baseline/claude-mem-checks.json")) + ); + assert_eq!(claude_mem.pointer("/scenarios/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!(claude_mem.pointer("/scenarios/4/status").and_then(Value::as_str), Some("pass")); + assert_eq!(claude_mem.pointer("/scenarios/5/status").and_then(Value::as_str), Some("blocked")); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_fixture.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_fixture.rs new file mode 100644 index 00000000..da46deb9 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_fixture.rs @@ -0,0 +1,77 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("82 jobs across 19 suites") + && evidence.contains("75 pass") + && evidence.contains("7 blocked") + && evidence.contains("core_archival_memory") + && evidence.contains("memory_summary") + && evidence.contains("proactive_brief") + && evidence.contains("scheduled_memory") + && evidence.contains("context_trajectory") + })); + + let suites = support::array_at(adapter, "/suites")?; + let core_archival = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; + let scheduled = support::find_by_field(suites, "/suite_id", "scheduled_memory")?; + let context_trajectory = support::find_by_field(suites, "/suite_id", "context_trajectory")?; + + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!(core_archival.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("core block attachment") + && evidence.contains("project-decision recovery") + && evidence.contains("archival note search") + })); + assert_eq!(scheduled.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!(scheduled.pointer("/evidence").and_then(Value::as_str).is_some_and(|evidence| { + evidence.contains("4 passing source-linked task readbacks") + && evidence.contains("private/provider scheduler blocker") + })); + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/notes/1") + .and_then(Value::as_str) + .is_some_and(|note| note.contains("OpenViking context-trajectory measurement gates")) + ); + + Ok(()) +} + +pub(super) fn assert_qmd_deep_profile_gate(adapter: &Value) { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("not_encoded")); +} + +pub(super) fn assert_qmd_live_baseline_record(adapter: &Value) { + let result_evidence = adapter.pointer("/result/evidence").and_then(Value::as_str); + let retrieval_evidence = adapter.pointer("/suites/0/evidence").and_then(Value::as_str); + + assert!(result_evidence.is_some_and(|evidence| { + evidence.contains("This live_baseline_only record is same-corpus evidence only") + && evidence.contains("cite qmd_live_real_world for the full live real-world sweep") + && !evidence.contains("no real_world_job qmd adapter is encoded yet") + })); + assert!(retrieval_evidence.is_some_and(|evidence| { + evidence.contains("does not execute real_world_job retrieval prompts") + && evidence.contains("cite qmd_live_real_world for the live retrieval adapter run") + && !evidence.contains("no real_world_job retrieval adapter run is encoded") + })); +} +pub(super) fn assert_openviking_deep_profile_gate(adapter: &Value) { + let trajectory_evidence = adapter.pointer("/capabilities/1/evidence").and_then(Value::as_str); + + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!(trajectory_evidence.is_some_and(|evidence| { + evidence.contains("evidence-bearing same-corpus output") + && evidence.contains("selected hierarchy/expansion artifacts") + && !evidence.contains("setup reaches runnable OpenViking APIs") + })); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_graph_gates.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_graph_gates.rs new file mode 100644 index 00000000..0fb014dc --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_graph_gates.rs @@ -0,0 +1,73 @@ +use serde_json::Value; + +pub(super) fn assert_graph_rag_research_gate_records( + ragflow: &Value, + lightrag: &Value, + graphrag: &Value, +) { + assert_eq!(ragflow.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(ragflow.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + ragflow.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some( + "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + ) + ); + assert_eq!( + ragflow.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-ragflow-docker") + ); + assert_eq!( + ragflow.pointer("/result/artifact").and_then(Value::as_str), + Some("tmp/real-world-memory/ragflow-smoke/ragflow-report.json") + ); + assert_eq!( + ragflow.pointer("/execution_metadata/sources/0/url").and_then(Value::as_str), + Some("https://github.com/infiniflow/ragflow") + ); + assert_eq!(lightrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(lightrag.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + lightrag.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-lightrag-docker-context") + ); + assert_eq!( + lightrag.pointer("/run/command").and_then(Value::as_str), + Some("ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context") + ); + assert_eq!( + lightrag.pointer("/capabilities/3/status").and_then(Value::as_str), + Some("not_encoded") + ); + assert_eq!(graphrag.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!( + graphrag.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-graphrag-docker") + ); + assert_eq!(graphrag.pointer("/suites/1/status").and_then(Value::as_str), Some("not_encoded")); +} +pub(super) fn assert_graphiti_zep_adapter(adapter: &Value) { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-graphiti-zep-docker-temporal") + ); + assert_eq!( + adapter.pointer("/run/command").and_then(Value::as_str), + Some( + "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal" + ) + ); + assert_eq!( + adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("memory_evolution") + ); + assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some( + "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + ) + ); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_letta.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_letta.rs new file mode 100644 index 00000000..40aed576 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_letta.rs @@ -0,0 +1,78 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_letta_core_archival_gate(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); + assert!( + adapter + .pointer("/setup/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("smoke-letta-core-archive-export-readback") + && evidence.contains("Docker-only benchmark-created agent export/readback")) + ); + assert_eq!( + adapter.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-letta-core-archive-export-readback") + ); + assert_eq!( + adapter.pointer("/run/command").and_then(Value::as_str), + Some( + "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback" + ) + ); + assert!(adapter.pointer("/execution_metadata/setup_path").and_then(Value::as_str).is_some_and( + |setup| setup.contains("exports core block JSON plus archival search/readback JSON") + && setup.contains("typed artifact") + )); + + let suites = support::array_at(adapter, "/suites")?; + let core_suite = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; + + assert_eq!(core_suite.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("real_world_job_adapter") + ); + assert_eq!(adapter.pointer("/capabilities/2/status").and_then(Value::as_str), Some("blocked")); + + let scenarios = support::array_at(adapter, "/scenarios")?; + let attachment = + support::find_by_field(scenarios, "/scenario_id", "core_block_attachment_readback")?; + let scope = support::find_by_field(scenarios, "/scenario_id", "core_block_scope_readback")?; + let provenance = + support::find_by_field(scenarios, "/scenario_id", "core_block_provenance_readback")?; + let stale = support::find_by_field(scenarios, "/scenario_id", "stale_core_detection")?; + let fallback = support::find_by_field(scenarios, "/scenario_id", "archival_fallback_readback")?; + let decision = support::find_by_field( + scenarios, + "/scenario_id", + "core_archival_project_decision_recovery", + )?; + + assert_eq!(scenarios.len(), 6); + + for scenario in [attachment, scope, provenance, stale, fallback, decision] { + assert_eq!(scenario.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(scenario.pointer("/elf_position").and_then(Value::as_str), Some("untested")); + assert_eq!( + scenario.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + scenario.pointer("/command").and_then(Value::as_str), + Some("cargo make smoke-letta-core-archive-export-readback") + ); + assert_eq!( + scenario.pointer("/artifact").and_then(Value::as_str), + Some("tmp/real-world-memory/letta-core-archive/summary.json") + ); + } + + assert_eq!(attachment.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + assert_eq!(stale.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + assert_eq!(fallback.pointer("/comparison_outcome").and_then(Value::as_str), Some("blocked")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_live_sweep.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_live_sweep.rs new file mode 100644 index 00000000..a3b0640c --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_live_sweep.rs @@ -0,0 +1,68 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Result<()> { + let suites = support::array_at(adapter, "/suites")?; + let capabilities = support::array_at(adapter, "/capabilities")?; + let adapter_id = adapter.pointer("/adapter_id").and_then(Value::as_str).unwrap_or_default(); + let targeted = support::find_by_field(capabilities, "/capability", "targeted_live_pass")?; + let full_pass = support::find_by_field(capabilities, "/capability", "full_suite_live_pass")?; + let work_resume = support::find_by_field(suites, "/suite_id", "work_resume")?; + let memory_evolution = support::find_by_field(suites, "/suite_id", "memory_evolution")?; + let production_ops = support::find_by_field(suites, "/suite_id", "production_ops")?; + let consolidation = support::find_by_field(suites, "/suite_id", "consolidation")?; + let knowledge = support::find_by_field(suites, "/suite_id", "knowledge_compilation")?; + let operator_debug = support::find_by_field(suites, "/suite_id", "operator_debugging_ux")?; + let capture = support::find_by_field(suites, "/suite_id", "capture_integration")?; + let personalization = support::find_by_field(suites, "/suite_id", "personalization")?; + let core_archival = support::find_by_field(suites, "/suite_id", "core_archival_memory")?; + let context_trajectory = support::find_by_field(suites, "/suite_id", "context_trajectory")?; + let trust_sot = support::find_by_field(suites, "/suite_id", "trust_source_of_truth")?; + let retrieval = support::find_by_field(suites, "/suite_id", "retrieval")?; + let project_decisions = support::find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(suites.len(), 13); + assert_eq!(targeted.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(full_pass.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert!( + adapter + .pointer("/result/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("55 jobs across all 13 checked-in suites")) + ); + assert_eq!(trust_sot.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(work_resume.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + production_ops.pointer("/status").and_then(Value::as_str), + Some(production_ops_status) + ); + + if adapter_id == "elf_live_real_world" { + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert!( + capture + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("4/4 capture_integration jobs")) + ); + } else { + assert_eq!(consolidation.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(knowledge.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(operator_debug.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + } + + assert_eq!(personalization.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(core_archival.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_operator_debug.rs b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_operator_debug.rs new file mode 100644 index 00000000..6540c12e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/external_adapters_operator_debug.rs @@ -0,0 +1,139 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_operator_debug_live_adapter_records(elf: &Value, qmd: &Value) -> Result<()> { + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(elf.pointer("/overall_status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make real-world-job-operator-ux-live-adapters") + ); + assert_eq!( + elf.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("operator_debugging_ux") + ); + assert_eq!(elf.pointer("/suites/0/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/1/capability").and_then(Value::as_str), + Some("trace_hydration_metadata") + ); + assert_eq!(elf.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("replay_command_metadata") + ); + assert_eq!(elf.pointer("/capabilities/2/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("candidate_drop_visibility") + ); + assert_eq!(elf.pointer("/capabilities/3/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + elf.pointer("/capabilities/4/capability").and_then(Value::as_str), + Some("openmemory_or_claude_mem_ui_runner") + ); + assert_eq!(elf.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + + let elf_scenarios = support::array_at(elf, "/scenarios")?; + let elf_trace = + support::find_by_field(elf_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let elf_replay = + support::find_by_field(elf_scenarios, "/scenario_id", "operator_debug_replay_command")?; + let elf_candidate = support::find_by_field( + elf_scenarios, + "/scenario_id", + "operator_debug_candidate_drop_visibility", + )?; + let elf_repair = support::find_by_field( + elf_scenarios, + "/scenario_id", + "operator_debug_repair_action_clarity", + )?; + let elf_selected = support::find_by_field( + elf_scenarios, + "/scenario_id", + "operator_debug_selected_but_not_narrated", + )?; + + assert_eq!(elf_scenarios.len(), 5); + assert_eq!(elf_trace.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(elf_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(elf_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(elf_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(elf_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(elf_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + + assert_operator_debug_qmd_adapter_record(qmd)?; + + assert!(support::array_at(elf, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) + })); + assert!(support::array_at(qmd, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("narrow operator-debug live slice")) + })); + + Ok(()) +} + +pub(super) fn assert_operator_debug_qmd_adapter_record(qmd: &Value) -> Result<()> { + assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(qmd.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("operator_debugging_ux") + ); + assert_eq!(qmd.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/capabilities/1/capability").and_then(Value::as_str), + Some("local_replay_command_metadata") + ); + assert_eq!(qmd.pointer("/capabilities/1/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + qmd.pointer("/capabilities/2/capability").and_then(Value::as_str), + Some("trace_hydration_metadata") + ); + assert_eq!(qmd.pointer("/capabilities/2/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + qmd.pointer("/capabilities/3/capability").and_then(Value::as_str), + Some("candidate_drop_visibility") + ); + assert_eq!(qmd.pointer("/capabilities/3/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd.pointer("/capabilities/4/status").and_then(Value::as_str), Some("not_encoded")); + + let qmd_scenarios = support::array_at(qmd, "/scenarios")?; + let qmd_trace = + support::find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_trace_hydration")?; + let qmd_replay = + support::find_by_field(qmd_scenarios, "/scenario_id", "operator_debug_replay_command")?; + let qmd_candidate = support::find_by_field( + qmd_scenarios, + "/scenario_id", + "operator_debug_candidate_drop_visibility", + )?; + let qmd_repair = support::find_by_field( + qmd_scenarios, + "/scenario_id", + "operator_debug_repair_action_clarity", + )?; + let qmd_selected = support::find_by_field( + qmd_scenarios, + "/scenario_id", + "operator_debug_selected_but_not_narrated", + )?; + + assert_eq!(qmd_scenarios.len(), 5); + assert_eq!(qmd_trace.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_trace.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd_replay.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_replay.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(qmd_candidate.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_candidate.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + assert_eq!(qmd_repair.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(qmd_repair.pointer("/comparison_outcome").and_then(Value::as_str), Some("tie")); + assert_eq!(qmd_selected.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(qmd_selected.pointer("/comparison_outcome").and_then(Value::as_str), Some("win")); + + Ok(()) +}