diff --git a/Makefile.toml b/Makefile.toml index f00c200a..e5a2ca8d 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -71,6 +71,9 @@ # | real-world-memory-project-decisions | composite | | # | real-world-memory-project-decisions-json | command | | # | real-world-memory-project-decisions-report | command | | +# | real-world-memory-quantitative-scoreboard | composite | | +# | real-world-memory-quantitative-scoreboard-json | command | | +# | real-world-memory-quantitative-scoreboard-report | command | | # | real-world-memory-report | command | | # | real-world-memory-retrieval | composite | | # | real-world-memory-retrieval-json | command | | @@ -1110,6 +1113,55 @@ args = [ "tmp/real-world-memory/project-decisions/report.md", ] +[tasks.real-world-memory-quantitative-scoreboard] +workspace = false +dependencies = [ + "real-world-memory-quantitative-scoreboard-report", +] + +[tasks.real-world-memory-quantitative-scoreboard-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory", + "--out", + "tmp/real-world-memory/quantitative-scoreboard/report.json", + "--run-id", + "public-quantitative-competitor-scoreboard", + "--adapter-id", + "elf_real_world_memory_fixture", + "--adapter-name", + "ELF real-world memory fixture", +] + +[tasks.real-world-memory-quantitative-scoreboard-report] +workspace = false +dependencies = [ + "real-world-memory-quantitative-scoreboard-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/quantitative-scoreboard/report.json", + "--out", + "tmp/real-world-memory/quantitative-scoreboard/report.md", +] + [tasks.real-world-memory-report] workspace = false dependencies = [ diff --git a/README.md b/README.md index 3bb37719..f79564b3 100644 --- a/README.md +++ b/README.md @@ -328,6 +328,20 @@ 
provider-backed ELF evidence was required. evidence/source-ref/quote coverage and 0.000 irrelevant context ratio. P5 productization is narrowed to proven local/public workflows and remains unqueued until main-thread acceptance. +- Public quantitative competitor scoreboard after XY-1120: the June 27 report adds + `cargo make real-world-memory-quantitative-scoreboard` and + `elf.quality_scoreboard/v1` rows for 20 tracked products. Rows expose recall@5, + precision@5, MRR, nDCG, stale suppression, update/delete correctness, source-ref + coverage, and latency where measured, and typed blockers plus source provenance + and next-evidence metadata where comparable metrics are not yet available. + VectifyAI PageIndex, VectifyAI OpenKB, and plastic-labs Honcho are explicit typed + non-pass rows; PageIndex/OpenKB use existing fixture/source provenance, while + Honcho remains a source-provenance research gate until product-runtime evidence is + checked in. + The full ELF fixture run scores 75 pass, 0 wrong_result, 0 unsupported claims, and + 7 blocked jobs with aggregate evidence/source-ref/quote coverage at 1.000, while + every public product row remains non-comparable until held-out, leakage-audited, + digest-identified product-runtime evidence exists. - Operator-approved public-proxy addendum after XY-930: the June 19 follow-up runs `cargo make baseline-production-private-addendum` with a simulated/public-proxy production corpus manifest approved for this stage. 
The run records 12 documents, @@ -473,6 +487,7 @@ Detailed evidence and interpretation: - [P3 Competitor-Strength Absorption Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md) - [P4 Production-Readiness Evidence Gates Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md) - [P4 Quality Hardening and Productization Readiness Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-quality-hardening-productization-readiness-report.md) +- [Public Quantitative Competitor Scoreboard Report - June 27, 2026](docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Single-User Production Runbook](docs/runbook/single_user_production.md) - Benchmark contract: @@ -571,6 +586,7 @@ Detailed comparison, mechanism-level analysis, and source map: - [P3 Competitor-Strength Absorption Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md) - [P4 Production-Readiness Evidence Gates Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-production-readiness-evidence-gates-report.md) - [P4 Quality Hardening and Productization Readiness Report - June 23, 2026](docs/evidence/benchmarking/2026-06-23-p4-quality-hardening-productization-readiness-report.md) +- [Public Quantitative Competitor Scoreboard Report - June 27, 2026](docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md) - [Live Baseline Benchmark Runbook](docs/runbook/benchmarking/live_baseline_benchmark.md) - [Real-World Agent Memory Benchmark](docs/runbook/benchmarking/real_world_agent_memory_benchmark.md) - [External Memory Improvement Plan](docs/evidence/external_memory/external_memory_improvement_plan.md) @@ -582,7 +598,7 @@ Detailed comparison, mechanism-level analysis, and source map: 
- [Derived Knowledge Page Follow-Up Research](docs/research/derived_knowledge_page_followup.md) - [Dreaming Product Surface Follow-Up Research](docs/research/dreaming_product_surface_followup.md) -Latest real-world benchmark report: June 23, 2026. Latest external research refresh: +Latest real-world benchmark report: June 27, 2026. Latest external research refresh: June 11, 2026; June 20 adds the Agent Knowledge OS Closeout Benchmark Report, the Graph Topic-Map Report - June 20, 2026, Knowledge Workspace Version-Diff Report - June 20, 2026, and the Live Knowledge-Page Rebuild/Lint Report - June 20, @@ -597,6 +613,8 @@ June 19 XY-930 operator-approved public-proxy production addendum and service-na Dreaming readback, the qmd debug-ergonomics Dreaming retest, the June 17 competitor-strength closeout, and the June 16 temporal reconciliation, live consolidation self-check, proactive-brief, and scheduled-memory scoring evidence. +June 27 adds the public quantitative competitor scoreboard report with row-level +comparability gates and no universal leaderboard claim. ## Documentation diff --git a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json index 578fe7fe..57778e96 100644 --- a/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json +++ b/apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json @@ -29,7 +29,7 @@ }, "run": { "status": "blocked", - "evidence": "The current fixture set reports 60 jobs across 16 suites: 53 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. 
The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization.", + "evidence": "The current fixture set reports 82 jobs across 19 suites: 75 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization, and Work Continuity plus authority-recovery rows remain fixture-backed rather than live product-runtime superiority evidence.", "command": "cargo make real-world-memory", "artifact": "tmp/real-world-memory/real-world-memory-report.json" }, @@ -3171,6 +3171,330 @@ "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", "reason": "Created as XY-889. 
XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." } + }, + { + "adapter_id": "vectifyai_pageindex_same_corpus_blocker", + "project": "VectifyAI PageIndex", + "adapter_kind": "typed_same_corpus_setup_blocker", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-1068 records a same-corpus PageIndex blocker: no contained PageIndex product installation, PageIndex MCP readback, or emitted tree artifact maps node paths back to ELF Source Library source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json" + }, + "run": { + "status": "blocked", + "evidence": "The checked-in blocker uses the same source-library long-document corpus ids as ELF but does not run PageIndex product runtime or emit PageIndex tree_nodes.json, cited node paths, traversal output, or MCP readback.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "tmp/real-world-memory/pageindex-openkb/report.json" + }, + "result": { + "status": "blocked", + "evidence": "No PageIndex parity, win, tie, loss, or comparable pass claim is allowed until a contained PageIndex run emits source-id-mapped tree artifacts and setup/runtime metadata.", + "artifact": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md" + }, + "capabilities": [ + { + "capability": "long_document_tree_retrieval", + "status": "blocked", + "evidence": "PageIndex remains the vectorless long-document tree retrieval reference, but no contained tree retrieval product output is checked in." + }, + { + "capability": "pageindex_mcp_readback", + "status": "blocked", + "evidence": "No PageIndex MCP readback artifact maps to ELF source ids." 
+ }, + { + "capability": "source_id_mapping", + "status": "blocked", + "evidence": "A runnable adapter must emit tree nodes, cited node paths, traversal output, source ids, and setup/runtime metadata before scoring." + } + ], + "suites": [ + { + "suite_id": "source_library", + "status": "blocked", + "evidence": "The PageIndex blocker compares against the same ELF Source Library long-document corpus ids, but PageIndex product artifacts are missing." + } + ], + "scenarios": [ + { + "scenario_id": "pageindex_long_document_tree_blocked", + "suite_id": "source_library", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "PageIndex remains blocked until tree artifacts and MCP readback map back to same-corpus source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/VectifyAI/PageIndex", + "status": "real" + }, + { + "kind": "fixture", + "ref": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json", + "status": "blocked" + }, + { + "kind": "report", + "ref": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "PageIndex repository", + "url": "https://github.com/VectifyAI/PageIndex", + "evidence": "Repository provenance for the tracked PageIndex product row." 
+ } + ], + "setup_path": "Resolve the XY-1068 setup blocker by running PageIndex in a contained runtime over the same Source Library corpus.", + "runtime_boundary": "Future evidence must run in Docker or another contained product-runtime boundary and emit PageIndex tree artifacts, MCP readback, source ids, and runtime metadata.", + "resource_expectation": "Unknown until PageIndex product runtime is materialized for the same corpus.", + "retry_guidance": [ + "Run cargo make real-world-memory-pageindex-openkb to regenerate the typed blocker report.", + "Do not claim PageIndex parity, win, tie, or loss from ELF Source Library fixture evidence.", + "Score PageIndex only after emitted tree artifacts and MCP readback map to benchmark source ids." + ], + "research_depth": "Typed same-corpus setup blocker from XY-1068" + }, + "notes": [ + "This row is intentionally non-comparable and preserves PageIndex as a blocked tracked product row." + ], + "follow_up": { + "title": "Run PageIndex same-corpus long-document tree adapter", + "reason": "The fair comparison needs PageIndex tree nodes and traversal output over the source-library long-document corpus with source ids mapped to benchmark evidence ids." 
+ } + }, + { + "adapter_id": "vectifyai_openkb_same_corpus_blocker", + "project": "VectifyAI OpenKB", + "adapter_kind": "typed_same_corpus_setup_blocker", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-1068 records a same-corpus OpenKB blocker: no contained OpenKB product run, generated wiki page export, entity/concept index export, saved exploration state, lint output, or watch/recompile trace maps back to ELF Knowledge Workspace source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json" + }, + "run": { + "status": "blocked", + "evidence": "The checked-in blocker uses the same knowledge corpus ids as ELF but does not run OpenKB product runtime or emit wiki, lint, saved exploration, or watch/recompile artifacts.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "tmp/real-world-memory/pageindex-openkb/report.json" + }, + "result": { + "status": "blocked", + "evidence": "No OpenKB parity, win, tie, loss, or comparable pass claim is allowed until a contained OpenKB run emits source-id-mapped wiki/index/lint/watch artifacts and setup/runtime metadata.", + "artifact": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md" + }, + "capabilities": [ + { + "capability": "compiled_wiki_export", + "status": "blocked", + "evidence": "OpenKB remains the compiled wiki/export reference, but no contained wiki export is checked in." + }, + { + "capability": "concept_entity_index_lint_watch", + "status": "blocked", + "evidence": "No entity/concept index export, lint output, saved exploration state, or watch/recompile trace maps to ELF Knowledge Workspace source ids." 
+ }, + { + "capability": "source_id_mapping", + "status": "blocked", + "evidence": "A runnable adapter must emit wiki pages, concept/entity indexes, lint output, saved exploration state, watch/recompile trace, source ids, and setup/runtime metadata before scoring." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The OpenKB blocker compares against the same ELF Knowledge Workspace corpus ids, but OpenKB product artifacts are missing." + } + ], + "scenarios": [ + { + "scenario_id": "openkb_wiki_recompile_blocked", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "OpenKB remains blocked until wiki, lint, saved exploration, and watch/recompile artifacts map back to same-corpus source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/VectifyAI/OpenKB", + "status": "real" + }, + { + "kind": "fixture", + "ref": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json", + "status": "blocked" + }, + { + "kind": "report", + "ref": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenKB repository", + "url": "https://github.com/VectifyAI/OpenKB", + "evidence": "Repository provenance for the tracked OpenKB product row." 
+ } + ], + "setup_path": "Resolve the XY-1068 setup blocker by running OpenKB in a contained runtime over the same Knowledge Workspace corpus.", + "runtime_boundary": "Future evidence must run in Docker or another contained product-runtime boundary and emit OpenKB wiki, index, lint, watch/recompile, source id, and runtime metadata artifacts.", + "resource_expectation": "Unknown until OpenKB product runtime is materialized for the same corpus.", + "retry_guidance": [ + "Run cargo make real-world-memory-pageindex-openkb to regenerate the typed blocker report.", + "Do not claim OpenKB parity, win, tie, or loss from ELF Knowledge Workspace fixture evidence.", + "Score OpenKB only after emitted wiki/index/lint/watch artifacts map to benchmark source ids." + ], + "research_depth": "Typed same-corpus setup blocker from XY-1068" + }, + "notes": [ + "This row is intentionally non-comparable and preserves OpenKB as a blocked tracked product row." + ], + "follow_up": { + "title": "Run OpenKB same-corpus wiki and watch/recompile adapter", + "reason": "The fair comparison needs OpenKB wiki/entity/concept outputs and watch/recompile artifacts over the same knowledge corpus with source ids mapped to benchmark evidence ids." 
+ } + }, + { + "adapter_id": "plastic_labs_honcho_research_gate", + "project": "plastic-labs Honcho", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "No Docker-contained Honcho product-runtime adapter, benchmark fixture, source-id mapping, held-out split, leakage audit, or container digest is checked in for this scoreboard.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "run": { + "status": "blocked", + "evidence": "Honcho is tracked as a requested public comparison row, but no same-task product surface has been materialized against the ELF real_world_job corpus.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "result": { + "status": "blocked", + "evidence": "No Honcho parity, win, tie, loss, or comparable pass claim is allowed until product-runtime evidence maps returned context to benchmark source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "capabilities": [ + { + "capability": "stateful_agent_memory_runtime", + "status": "blocked", + "evidence": "Honcho is a requested memory/runtime comparison target, but no contained benchmark runtime evidence is checked in." + }, + { + "capability": "source_id_mapping", + "status": "blocked", + "evidence": "No Honcho output maps returned context to ELF benchmark source ids." + }, + { + "capability": "container_digest_evidence", + "status": "blocked", + "evidence": "No Honcho container image digest or runtime metadata is recorded for the public scoreboard." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Honcho memory-quality behavior is not scored until benchmark runtime output exists." 
+ }, + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "Honcho work-continuity or resume behavior is not scored until benchmark runtime output exists." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Honcho retrieval/context output is not scored until returned context maps to expected evidence ids." + } + ], + "scenarios": [ + { + "scenario_id": "honcho_product_runtime_blocked", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "Honcho remains blocked until a contained product-runtime adapter emits benchmark retrieved context, source ids, latency/cost/resource metadata, and container digest evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/plastic-labs/honcho", + "status": "real" + }, + { + "kind": "source", + "ref": "https://honcho.dev/docs/v3/documentation/introduction/vibecoding", + "status": "real" + }, + { + "kind": "manifest", + "ref": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Honcho repository", + "url": "https://github.com/plastic-labs/honcho", + "evidence": "Repository provenance for the tracked Honcho product row." + }, + { + "label": "Honcho documentation", + "url": "https://honcho.dev/docs/v3/documentation/introduction/vibecoding", + "evidence": "Documentation provenance for Honcho as a stateful agent-memory/runtime comparison target." 
+ } + ], + "setup_path": "Create a Docker-contained Honcho adapter over the real_world_job corpus before scoring.", + "runtime_boundary": "Future evidence must run in Docker or another contained product-runtime boundary and emit Honcho context/memory output, source ids, latency/cost/resource metadata, and container digest evidence.", + "resource_expectation": "Unknown until Honcho product runtime is materialized for the benchmark corpus.", + "retry_guidance": [ + "Do not claim Honcho parity, win, tie, or loss from the research-gate row.", + "Score Honcho only after product-runtime output maps to benchmark source ids.", + "Record held-out, leakage-audit, and container digest metadata before comparability." + ], + "research_depth": "D0 tracked product row with typed runtime blocker" + }, + "notes": [ + "This row is intentionally non-comparable and records the user-requested Honcho comparison target as a typed blocker." + ], + "follow_up": { + "title": "Run Honcho product-runtime adapter", + "reason": "The fair comparison needs Honcho runtime context output, source-id mapping, held-out/leakage evidence, and container digest metadata over the ELF real_world_job corpus." 
+ } } ] } diff --git a/apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json b/apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json new file mode 100644 index 00000000..00cb9467 --- /dev/null +++ b/apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json @@ -0,0 +1,16271 @@ +{ + "schema": "elf.real_world_job_report/v1", + "run_id": "public-quantitative-competitor-scoreboard", + "generated_at": "2026-06-27T15:52:54.359477Z", + "runner_version": "0.2.0-bc98c2f3ff7e9bbb40e4478cb11b594d32128f44-aarch64-apple-darwin", + "corpus_profile": "mixed", + "adapter": { + "adapter_id": "elf_real_world_memory_fixture", + "name": "ELF real-world memory fixture", + "behavior": "offline_fixture_response", + "storage": "not_encoded", + "runtime": "not_encoded", + "notes": "Offline runner scores checked-in fixture responses; it does not exercise a live external adapter." 
+ }, + "scoreboard": { + "schema": "elf.quality_scoreboard/v1", + "result_states": [ + "pass", + "wrong_result", + "incomplete", + "blocked", + "not_tested", + "not_encoded", + "not_comparable", + "unsupported_claim" + ], + "evidence_classes": [ + "fixture_backed", + "live_baseline", + "live_real_world", + "research_gate" + ], + "metric_basis": "produced_evidence_order", + "retrieval_k": 5, + "job_typed_non_pass_count": 7, + "job_typed_non_pass_states_present": [ + "blocked" + ], + "job_summary_claim": "typed_non_pass_present", + "external_adapter_typed_non_pass_count": 240, + "external_adapter_typed_non_pass_states_present": [ + "blocked", + "incomplete", + "not_encoded", + "not_tested", + "wrong_result" + ], + "typed_non_pass_count": 247, + "typed_non_pass_states_present": [ + "blocked", + "incomplete", + "not_encoded", + "not_tested", + "wrong_result" + ], + "evidence_class_counts": { + "fixture_backed": 1, + "live_baseline": 6, + "live_real_world": 5, + "research_gate": 14 + }, + "summary_claim": "typed_non_pass_present", + "unqualified_win_claim_allowed": false, + "claim_boundary": "Typed non-pass states and non-live evidence classes must remain visible; reports must not collapse them into unqualified wins.", + "rows": [ + { + "product_id": "elf_current_report", + "product_name": "ELF", + "row_source": "current_real_world_job_report", + "evidence_class": "fixture_backed", + "result_state": "blocked", + "comparable": false, + "same_corpus": true, + "source_id_mapped": true, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "produced_evidence_order", + "recall_at_k": 0.988, + "precision_at_k": 0.415, + "mrr": 0.988, + "ndcg": 0.985, + "expected_evidence_recall": 1.0, + "citation_source_ref_coverage": 1.0, + "expected_evidence_matched": 172, + "expected_evidence_total": 172, + "produced_evidence_total": 187 + }, + "lifecycle": { + 
"stale_suppression": 1.0, + "stale_suppressed_count": 23, + "stale_check_count": 23, + "update_correctness": 1.0, + "update_correct_count": 3, + "update_check_count": 3, + "delete_correctness": 1.0, + "delete_correct_count": 1, + "delete_check_count": 1, + "rollback_history_readback_rate": 1.0, + "rollback_history_readback_count": 3, + "rollback_history_check_count": 3 + }, + "answer_safety": { + "unsupported_claim_rate": 0.0, + "unsupported_claim_count": 0, + "stale_answer_rate": 0.0, + "stale_answer_count": 0, + "hallucinated_evidence_rate": 0.0, + "redaction_leak_count": 0, + "irrelevant_context_ratio": 0.0 + }, + "operations": { + "mean_latency_ms": 2.885, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "resource_envelope_status": "pass", + "resource_envelope_job_count": 2, + "resource_envelope_pass_count": 2 + }, + "coverage": { + "job_count": 82, + "encoded_suite_count": 19, + "pass_count": 75, + "typed_non_pass_count": 7, + "source_ref_coverage": 1.0, + "evidence_coverage": 1.0, + "evidence_class": "fixture_backed" + } + }, + "strengths": [ + "Expected evidence recall is complete for encoded jobs.", + "Source-ref coverage is complete for encoded required evidence.", + "Encoded stale-answer and stale-retrieval counters are zero.", + "Encoded redaction leak count is zero.", + "Work Continuity readback metrics are encoded in the report." + ], + "weaknesses": [ + "7 encoded job row(s) are typed non-pass.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "apps/elf-eval/fixtures/real_world_memory/", + "apps/elf-eval/src/bin/real_world_job_benchmark.rs" + ] + }, + { + "product_id": "graphrag", + "product_name": "GraphRAG", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 4, + "pass_count": 0, + "typed_non_pass_count": 18, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter graphrag_research_gate overall 
status is blocked.", + "Suite knowledge_compilation is blocked.", + "Suite memory_evolution is not_encoded.", + "Suite production_ops is not_encoded.", + "Suite retrieval is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "cargo make smoke-graphrag-docker", + "https://github.com/microsoft/graphrag", + "https://microsoft.github.io/graphrag/", + "tmp/real-world-memory/graphrag-smoke/graphrag-report.json", + "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "tmp/real-world-memory/graphrag-smoke/summary.json" + ] + }, + { + "product_id": "graphiti_zep", + "product_name": "Graphiti/Zep", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + 
"delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 13, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter graphiti_zep_research_gate overall status is blocked.", + "Suite memory_evolution is blocked.", + "Suite production_ops is not_encoded.", + "Suite retrieval is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "cargo make smoke-graphiti-zep-docker-temporal", + "https://github.com/getzep/graphiti", + "https://www.getzep.com/platform/graphiti/", + "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json", + "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + ] + }, + { + "product_id": "langgraph", + "product_name": "LangGraph", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "not_encoded", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + 
"resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 2, + "pass_count": 0, + "typed_non_pass_count": 9, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter langgraph_research_gate overall status is not_encoded.", + "Suite production_ops is not_encoded.", + "Suite work_resume is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "https://docs.langchain.com/oss/python/langgraph/persistence" + ] + }, + { + "product_id": "letta", + "product_name": "Letta", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 
0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 4, + "pass_count": 0, + "typed_non_pass_count": 18, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter letta_research_gate overall status is blocked.", + "Suite core_archival_memory is blocked.", + "Suite personalization is not_encoded.", + "Suite project_decisions is blocked.", + "Suite work_resume is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "https://docs.letta.com/api/python", + "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "https://docs.letta.com/guides/docker", + "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json", + "tmp/real-world-memory/letta-core-archive/report.json", + "tmp/real-world-memory/letta-core-archive/summary.json" + ] + }, + { + "product_id": "lightrag", + "product_name": "LightRAG", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + 
"encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 19, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter lightrag_research_gate overall status is blocked.", + "Suite memory_evolution is not_encoded.", + "Suite operator_debugging_ux is not_encoded.", + "Suite retrieval is blocked.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "cargo make smoke-lightrag-docker-context", + "https://github.com/HKUDS/LightRAG", + "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "tmp/real-world-memory/lightrag-context/lightrag-report.json", + "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "tmp/real-world-memory/lightrag-context/summary.json" + ] + }, + { + "product_id": "openviking", + "product_name": "OpenViking", + "row_source": "external_adapter_manifest", + "evidence_class": "live_baseline", + "result_state": "wrong_result", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + 
"expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 6, + "pass_count": 0, + "typed_non_pass_count": 17, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "live_baseline" + } + }, + "strengths": [ + "docker_local_embed_setup capability is pass.", + "local_embed_setup capability is pass." + ], + "weaknesses": [ + "Adapter openviking_deep_profile_gate overall status is blocked.", + "Adapter openviking_live_baseline overall status is wrong_result.", + "Suite context_trajectory is blocked.", + "Suite operator_debugging_ux is not_encoded.", + "Suite retrieval is wrong_result.", + "Suite work_resume is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/runbook/benchmarking/live_baseline_benchmark.md", + "https://github.com/volcengine/OpenViking/", + "scripts/live-baseline-benchmark.sh", + "tmp/live-baseline/OpenViking.log", + "tmp/live-baseline/live-baseline-report.json" + ] + }, + { + "product_id": "ragflow", + "product_name": "RAGFlow", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + 
"typed_non_pass_count": 18, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter ragflow_research_gate overall status is blocked.", + "Suite knowledge_compilation is not_encoded.", + "Suite production_ops is blocked.", + "Suite retrieval is blocked.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "https://github.com/infiniflow/ragflow", + "https://ragflow.io/docs/", + "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json", + "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + ] + }, + { + "product_id": "vectifyai_openkb", + "product_name": "VectifyAI OpenKB", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + 
"stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 1, + "pass_count": 0, + "typed_non_pass_count": 9, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter vectifyai_openkb_same_corpus_blocker overall status is blocked.", + "Suite knowledge_compilation is blocked.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json", + "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md", + "https://github.com/VectifyAI/OpenKB", + "tmp/real-world-memory/pageindex-openkb/report.json" + ] + }, + { + "product_id": "vectifyai_pageindex", + "product_name": "VectifyAI PageIndex", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 1, + "pass_count": 0, + 
"typed_non_pass_count": 9, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter vectifyai_pageindex_same_corpus_blocker overall status is blocked.", + "Suite source_library is blocked.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json", + "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md", + "https://github.com/VectifyAI/PageIndex", + "tmp/real-world-memory/pageindex-openkb/report.json" + ] + }, + { + "product_id": "agentmemory", + "product_name": "agentmemory", + "row_source": "external_adapter_manifest", + "evidence_class": "live_baseline", + "result_state": "blocked", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": 
null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 14, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "live_baseline" + } + }, + "strengths": [ + "same_corpus_retrieval capability is pass." + ], + "weaknesses": [ + "Adapter agentmemory_live_baseline overall status is lifecycle_fail.", + "Suite capture_integration is blocked.", + "Suite memory_evolution is blocked.", + "Suite work_resume is blocked.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "docs/evidence/external_memory/agentmemory_adapter.md", + "scripts/live-baseline-benchmark.sh", + "tmp/live-baseline/agentmemory.log", + "tmp/live-baseline/live-baseline-report.json" + ] + }, + { + "product_id": "claude_mem", + "product_name": "claude-mem", + "row_source": "external_adapter_manifest", + "evidence_class": "live_baseline", + "result_state": "wrong_result", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 13, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "live_baseline" + 
} + }, + "strengths": [ + "durable_storage capability is real.", + "progressive_disclosure_real_world_job capability is pass.", + "repository_lifecycle capability is real.", + "repository_progressive_disclosure capability is real." + ], + "weaknesses": [ + "Adapter claude_mem_live_baseline overall status is wrong_result.", + "Suite capture_integration is blocked.", + "Suite operator_debugging_ux is blocked.", + "Suite work_resume is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "scripts/live-baseline-benchmark.sh", + "tmp/live-baseline/claude-mem.log", + "tmp/live-baseline/live-baseline-report.json" + ] + }, + { + "product_id": "gbrain", + "product_name": "gbrain", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + 
"delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 2, + "pass_count": 0, + "typed_non_pass_count": 10, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter gbrain_research_gate overall status is not_encoded.", + "Suite knowledge_compilation is not_encoded.", + "Suite operator_debugging_ux is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "https://github.com/garrytan/gbrain", + "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md" + ] + }, + { + "product_id": "graphify", + "product_name": "graphify", + "row_source": "external_adapter_manifest", + "evidence_class": "live_real_world", + "result_state": "wrong_result", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": true, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 10, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "live_real_world" + } + }, + "strengths": [ + "docker_cli_boundary capability is 
pass.", + "graph_report_generation capability is pass." + ], + "weaknesses": [ + "Adapter graphify_docker_smoke overall status is wrong_result.", + "Suite knowledge_compilation is wrong_result.", + "Suite retrieval is blocked.", + "Suite work_resume is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "cargo make smoke-graphify-docker-graph-report", + "https://github.com/safishamsi/graphify", + "tmp/real-world-memory/graphify-smoke/graphify-report.json", + "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "tmp/real-world-memory/graphify-smoke/summary.json" + ] + }, + { + "product_id": "llm_wiki", + "product_name": "llm-wiki", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "not_encoded", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + 
"delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 2, + "pass_count": 0, + "typed_non_pass_count": 10, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter llm_wiki_research_gate overall status is not_encoded.", + "Suite knowledge_compilation is not_encoded.", + "Suite work_resume is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "https://github.com/nvk/llm-wiki" + ] + }, + { + "product_id": "mem0_openmemory", + "product_name": "mem0/OpenMemory", + "row_source": "external_adapter_manifest", + "evidence_class": "live_baseline", + "result_state": "blocked", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 10, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "live_baseline" + } + }, + "strengths": [ + "Scenario preference_correction_history is recorded as a competitor strength.", + "deletion_audit_history capability 
is pass.", + "entity_scoped_personalization capability is pass.", + "local_get_all_export_readback capability is pass.", + "local_lifecycle_update_delete_reload capability is pass.", + "local_storage capability is real." + ], + "weaknesses": [ + "Suite memory_evolution is not_encoded.", + "Suite operator_debugging_ux is blocked.", + "Suite personalization is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "scripts/live-baseline-benchmark.sh", + "tmp/live-baseline/live-baseline-report.json", + "tmp/live-baseline/mem0.log" + ] + }, + { + "product_id": "memsearch", + "product_name": "memsearch", + "row_source": "external_adapter_manifest", + "evidence_class": "live_baseline", + "result_state": "not_encoded", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": 
null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 6, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "live_baseline" + } + }, + "strengths": [ + "canonical_markdown_store capability is real.", + "markdown_source_store_prompt_jobs capability is pass.", + "reindex_update_delete_reload capability is pass.", + "same_corpus_retrieval capability is pass." + ], + "weaknesses": [ + "Suite memory_evolution is not_encoded.", + "Suite retrieval is not_encoded.", + "Suite trust_source_of_truth is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "scripts/live-baseline-benchmark.sh", + "tmp/live-baseline/live-baseline-report.json", + "tmp/live-baseline/memsearch.log" + ] + }, + { + "product_id": "nanograph", + "product_name": "nanograph", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "not_encoded", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 2, + "pass_count": 0, + "typed_non_pass_count": 9, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter 
nanograph_research_gate overall status is not_encoded.", + "Suite memory_evolution is not_encoded.", + "Suite retrieval is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." + ], + "source_provenance": [ + "https://github.com/nanograph/nanograph" + ] + }, + { + "product_id": "plastic_labs_honcho", + "product_name": "plastic-labs Honcho", + "row_source": "external_adapter_manifest", + "evidence_class": "research_gate", + "result_state": "blocked", + "comparable": false, + "same_corpus": false, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": false, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + 
"hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 3, + "pass_count": 0, + "typed_non_pass_count": 11, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": "research_gate" + } + }, + "strengths": [], + "weaknesses": [ + "Adapter plastic_labs_honcho_research_gate overall status is blocked.", + "Suite memory_evolution is blocked.", + "Suite retrieval is blocked.", + "Suite work_resume is blocked.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map this product to the same corpus.", + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Run a Docker-contained product-runtime adapter for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "https://github.com/plastic-labs/honcho", + "https://honcho.dev/docs/v3/documentation/introduction/vibecoding" + ] + }, + { + "product_id": "qmd", + "product_name": "qmd", + "row_source": "external_adapter_manifest", + "evidence_class": "live_real_world", + "result_state": "wrong_result", + "comparable": false, + "same_corpus": true, + "source_id_mapped": false, + "held_out": false, + "leakage_audited": false, + "product_runtime": true, + "container_digest_identified": false, + "metrics": { + "retrieval": { + "k": 5, + "metric_basis": "external_adapter_manifest_no_ordered_evidence", + "recall_at_k": null, + "precision_at_k": null, + "mrr": null, + "ndcg": null, + "expected_evidence_recall": null, + "citation_source_ref_coverage": null, + "expected_evidence_matched": 0, + "expected_evidence_total": 0, + "produced_evidence_total": 0 + }, + "lifecycle": { + "stale_suppression": null, + "stale_suppressed_count": 0, + "stale_check_count": 0, + "update_correctness": null, + "update_correct_count": 0, + "update_check_count": 0, + "delete_correctness": null, + "delete_correct_count": 0, + "delete_check_count": 0, + "rollback_history_readback_rate": null, + "rollback_history_readback_count": 0, + "rollback_history_check_count": 0 + }, + "answer_safety": { + "unsupported_claim_rate": null, + "unsupported_claim_count": 0, + "stale_answer_rate": null, + "stale_answer_count": 0, + "hallucinated_evidence_rate": null, + "redaction_leak_count": 0, + "irrelevant_context_ratio": null + }, + "operations": { + "mean_latency_ms": null, + "total_cost": null, + "resource_envelope_status": "", + "resource_envelope_job_count": 0, + "resource_envelope_pass_count": 0 + }, + "coverage": { + "job_count": 0, + "encoded_suite_count": 19, + "pass_count": 5, + "typed_non_pass_count": 35, + "source_ref_coverage": null, + "evidence_coverage": null, + "evidence_class": 
"live_real_world" + } + }, + "strengths": [ + "local_cli_retrieval capability is real.", + "local_replay_command_metadata capability is pass.", + "operator_debug_real_world_job_adapter capability is pass.", + "real_world_job_adapter capability is pass.", + "same_corpus_retrieval capability is pass.", + "targeted_live_pass capability is pass." + ], + "weaknesses": [ + "Adapter qmd_deep_profile_gate overall status is not_encoded.", + "Adapter qmd_live_real_world overall status is wrong_result.", + "Adapter qmd_operator_debug_live overall status is wrong_result.", + "Suite capture_integration is not_encoded.", + "Suite consolidation is not_encoded.", + "Suite context_trajectory is blocked.", + "Suite core_archival_memory is not_encoded.", + "Suite knowledge_compilation is not_encoded.", + "This row is not a comparable product-runtime scoreboard pass." + ], + "next_evidence": [ + "Map returned evidence to stable source ids.", + "Publish a held-out split for this row.", + "Publish leakage-audit evidence for this row.", + "Record container image digest evidence.", + "Resolve typed non-pass state before claiming a comparable pass." 
+ ], + "source_provenance": [ + "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "apps/elf-eval/fixtures/real_world_memory/", + "cargo make real-world-job-operator-ux-live-adapters", + "cargo make real-world-memory-live-adapters", + "docker-compose.baseline.yml", + "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md", + "docs/runbook/benchmarking/live_baseline_benchmark.md", + "https://github.com/tobi/qmd", + "scripts/live-baseline-benchmark.sh", + "tmp/live-baseline/live-baseline-report.json", + "tmp/live-baseline/qmd.log", + "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + ] + } + ], + "optimization_roadmap": [ + "Capture Docker image digests and runtime metadata for product-runtime rows.", + "Add held-out and leakage-audit manifests before broad competitor comparisons.", + "Promote external adapters from typed blockers to same-corpus source-id-mapped runtime rows only after they emit comparable evidence.", + "Use row-level metrics for optimization direction; do not claim a universal leaderboard." 
+ ] + }, + "operational_evidence": { + "schema": "elf.operational_evidence_gates/v1", + "tiers": [ + { + "tier": "local_fixture", + "status": "blocked", + "job_count": 77, + "pass": 74, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 3, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 2.813, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "resource_evidence_count": 1, + "cold_start_evidence_count": 2, + "restore_evidence_count": 2, + "qdrant_rebuild_evidence_count": 3, + "pass_claim_allowed": false, + "blocker_reasons": [ + "OpenViking hierarchy selection is encoded as a benchmark job, but scoring is blocked until the adapter emits selected hierarchy nodes with evidence ids after the same-corpus precondition passes.", + "OpenViking recursive/context expansion is encoded as a benchmark job, but scoring is blocked until the adapter materializes expansion paths and same-corpus evidence ids are correct.", + "OpenViking staged retrieval trajectory is encoded as a benchmark job, but scoring is blocked until same-corpus output returns expected evidence ids and comparable staged artifacts exist." 
+ ], + "job_ids": [ + "adversarial-quality-conflicting-source-authority-001", + "adversarial-quality-correction-persistence-001", + "adversarial-quality-private-excluded-span-001", + "adversarial-quality-stale-fact-current-answer-001", + "adversarial-quality-unsupported-claim-refusal-001", + "capture-redaction-exclusion-001", + "capture-source-id-binding-001", + "capture-write-policy-redaction-001", + "consolidation-contradiction-report-discard-001", + "consolidation-preference-candidate-defer-001", + "consolidation-project-summary-apply-001", + "consolidation-weekly-decision-summary-apply-001", + "context-trajectory-openviking-hierarchy-selection-001", + "context-trajectory-openviking-recursive-expansion-001", + "context-trajectory-openviking-staged-retrieval-001", + "core-archival-archival-fallback-001", + "core-archival-core-block-attachment-001", + "core-archival-core-block-provenance-001", + "core-archival-core-block-scope-001", + "core-archival-project-decision-recovery-001", + "core-archival-stale-core-detection-001", + "memory-evolution-benchmark-verdict-001", + "memory-evolution-deploy-method-001", + "memory-evolution-issue-state-001", + "memory-evolution-preference-001", + "memory-evolution-relation-temporal-001", + "knowledge-watch-rebuild-003", + "knowledge-entity-concept-002", + "knowledge-project-page-001", + "memory-evolution-delete-ttl-001", + "memory-summary-source-trace-001", + "p1-closeout-correction-persistence-rollback-001", + "p1-closeout-source-candidate-approval-recall-001", + "p1-closeout-stale-decision-suppression-001", + "p1-closeout-unsupported-claim-refusal-work-resume-001", + "personalization-scoped-preference-001", + "proactive-daily-project-brief-001", + "proactive-resume-work-brief-001", + "proactive-stale-decision-audit-001", + "proactive-stale-plan-preference-warning-001", + "production-ops-authority-plane-recovery-001", + "production-ops-restore-cold-start-001", + "production-ops-cold-start-dependency-001", + 
"production-ops-backfill-resume-001", + "production-ops-resource-envelope-001", + "project-decision-accepted-typed-failures-001", + "project-decision-current-validation-gate-001", + "project-decision-private-manifest-caveat-001", + "project-decision-reversal-live-baseline-001", + "project-decision-tradeoff-fixture-backed-001", + "retrieval-alt-phrasing-001", + "retrieval-current-vs-obsolete-001", + "retrieval-distractor-heavy-001", + "retrieval-minimal-context-001", + "retrieval-multi-hop-routing-001", + "operator-debug-stage-attribution-001", + "scheduled-knowledge-page-refresh-suggestion-001", + "scheduled-stale-decision-audit-001", + "scheduled-stale-preference-plan-audit-001", + "scheduled-weekly-project-status-summary-001", + "source-library-long-doc-001", + "source-library-social-thread-001", + "trust-sot-rebuild-001", + "work-continuity-decision-rationale-001", + "work-continuity-explicit-next-step-001", + "work-continuity-handoff-source-ref-001", + "work-continuity-inferred-next-step-001", + "work-continuity-janitor-false-promotion-001", + "work-continuity-redaction-001", + "work-continuity-rejected-option-001", + "work-continuity-reset-resume-001", + "capture-integration-boundaries-001", + "work-resume-decodex-linear-status-001", + "work-resume-failed-command-recovery-001", + "work-resume-next-action-extraction-001", + "work-resume-pr-review-blocker-001", + "work-resume-stale-worktree-001" + ] + }, + { + "tier": "public_proxy", + "status": "pass", + "job_count": 1, + "pass": 1, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 0, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 10.843, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "resource_evidence_count": 1, + "cold_start_evidence_count": 0, + "restore_evidence_count": 0, + "qdrant_rebuild_evidence_count": 0, + "pass_claim_allowed": true, + "blocker_reasons": [], + "job_ids": [ + 
"production-ops-public-proxy-addendum-001" + ] + }, + { + "tier": "private_corpus", + "status": "blocked", + "job_count": 3, + "pass": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 3, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 1.6, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "resource_evidence_count": 0, + "cold_start_evidence_count": 0, + "restore_evidence_count": 0, + "qdrant_rebuild_evidence_count": 0, + "pass_claim_allowed": false, + "blocker_reasons": [ + "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930.", + "No operator-owned private production corpus manifest is checked in or available to this fixture; no private-corpus pass can be claimed.", + "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930." + ], + "job_ids": [ + "proactive-private-corpus-refresh-blocked-001", + "production-ops-private-manifest-blocked-001", + "scheduled-private-provider-scheduler-blocked-001" + ] + }, + { + "tier": "provider_backed", + "status": "blocked", + "job_count": 1, + "pass": 0, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 1, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_latency_ms": 1.7, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "resource_evidence_count": 0, + "cold_start_evidence_count": 0, + "restore_evidence_count": 0, + "qdrant_rebuild_evidence_count": 0, + "pass_claim_allowed": false, + "blocker_reasons": [ + "Provider-backed production operations require operator-owned credentials; checked-in fixtures must not include or require secrets." 
+ ], + "job_ids": [ + "production-ops-credential-boundary-001" + ] + } + ], + "latency": { + "measured_job_count": 79, + "missing_latency_job_count": 3, + "mean_ms": 2.885, + "max_ms": 31.5 + }, + "cost": { + "jobs_with_cost_report": 79, + "missing_cost_job_count": 3, + "zero_cost_job_count": 79, + "total": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "claim_boundary": "Fixture and local-provider zero-cost reports are execution-accounting evidence only; they do not prove hosted provider spend." + }, + "resource": { + "resource_envelope_job_count": 2, + "resource_envelope_pass_count": 2, + "latency_resource_dimension_job_count": 10, + "job_ids": [ + "production-ops-public-proxy-addendum-001", + "production-ops-resource-envelope-001" + ] + }, + "cold_start_restore_rebuild": { + "cold_start_job_count": 2, + "cold_start_pass_count": 2, + "restore_job_count": 2, + "restore_pass_count": 2, + "qdrant_rebuild_job_count": 3, + "qdrant_rebuild_pass_count": 3, + "job_ids": [ + "production-ops-authority-plane-recovery-001", + "production-ops-cold-start-dependency-001", + "production-ops-restore-cold-start-001", + "trust-sot-rebuild-001" + ] + }, + "authority_recovery": { + "drill_count": 1, + "drill_pass_count": 1, + "topology_reported_count": 1, + "failure_injection_count": 2, + "degraded_read_labeled_count": 1, + "source_of_truth_visible_count": 1, + "backup_pitr_restored_count": 1, + "rpo_target_count": 1, + "rpo_met_count": 1, + "rto_target_count": 1, + "rto_met_count": 1, + "authority_plane_count": 7, + "record_count_preserved_count": 7, + "source_ref_preserved_count": 7, + "lifecycle_history_preserved_count": 7, + "idempotent_outbox_replay_count": 1, + "qdrant_rebuild_complete_count": 1, + "migration_repair_count": 1, + "dead_letter_handled_count": 1, + "job_ids": [ + "production-ops-authority-plane-recovery-001" + ] + }, + "missing_private_provider_inputs_are_typed_blockers": true, + "private_corpus_pass_claim_allowed": 
false, + "provider_backed_pass_claim_allowed": false, + "claim_boundary": "Operational evidence tiers are separate: local fixture and public-proxy passes do not prove private-corpus or provider-backed production quality." + }, + "external_adapters": { + "schema": "elf.real_world_external_adapter_report/v1", + "manifest_id": "real-world-memory-project-adapters-2026-06-11-first-generation-continuity-source-store", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/live-baseline-benchmark.sh", + "artifact_dir": "tmp/live-baseline/", + "host_global_installs_required": false, + "notes": [ + "External project runs default to Docker Compose and Docker-managed caches.", + "Real-world job fixture reports and live baseline reports use separate schemas and claim boundaries." + ] + }, + "summary": { + "adapter_count": 26, + "external_project_count": 19, + "docker_default_count": 26, + "host_global_install_required_count": 0, + "fixture_backed_count": 1, + "live_baseline_only_count": 6, + "live_real_world_count": 5, + "research_gate_count": 14, + "overall_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 10, + "incomplete": 0, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 4, + "not_encoded": 5 + }, + "capability_status_counts": { + "real": 8, + "mocked": 1, + "unsupported": 6, + "blocked": 32, + "incomplete": 0, + "wrong_result": 10, + "lifecycle_fail": 0, + "pass": 30, + "not_encoded": 26 + }, + "suite_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 0, + "blocked": 29, + "incomplete": 0, + "wrong_result": 7, + "lifecycle_fail": 0, + "pass": 27, + "not_encoded": 37 + }, + "scenario_status_counts": { + "real": 0, + "mocked": 0, + "unsupported": 3, + "blocked": 24, + "incomplete": 5, + "wrong_result": 6, + "lifecycle_fail": 1, + "pass": 23, + "not_encoded": 13 + }, + "scenario_position_counts": { + "wins": 10, + "ties": 11, + "loses": 1, + "untested": 53 + }, + 
"scenario_outcome_counts": { + "win": 10, + "tie": 11, + "loss": 1, + "not_tested": 19, + "blocked": 29, + "non_goal": 5 + } + }, + "adapters": [ + { + "adapter_id": "elf_real_world_memory_fixture", + "project": "ELF", + "adapter_kind": "offline_fixture_response", + "evidence_class": "fixture_backed", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The checked-in real_world_memory fixtures parse and score through the ELF fixture runner.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "run": { + "status": "blocked", + "evidence": "The current fixture set reports 82 jobs across 19 suites: 75 pass, 0 incomplete, 7 blocked, 0 wrong_result, 0 not_encoded, and 0 unsupported_claim. The six core_archival_memory jobs pass as ELF fixture evidence, not as live Letta comparison evidence; the one memory_summary job passes as fixture-backed source-trace evidence, not as managed-memory parity evidence; the proactive_brief suite scores 4 passing evidence-linked suggestions plus one blocked private-corpus refresh case tied to XY-930, not Pulse or hosted managed-memory parity; the scheduled_memory suite scores 4 passing scheduled readback tasks plus one blocked private/provider scheduler case tied to XY-930, not hosted scheduler, ChatGPT Tasks, Pulse, or provider-backed private-corpus parity; context_trajectory remains blocked behind OpenViking staged-artifact materialization, and Work Continuity plus authority-recovery rows remain fixture-backed rather than live product-runtime superiority evidence.", + "command": "cargo make real-world-memory", + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + "result": { + "status": "blocked", + "evidence": "This is fixture-backed ELF scoring, not a live external adapter result.", + "artifact": "tmp/real-world-memory/real-world-memory-report.md" + }, + 
"capabilities": [ + { + "capability": "real_world_job_fixture_scoring", + "status": "real", + "evidence": "The runner scores checked-in real_world_job records with expected evidence, traps, and typed status output." + }, + { + "capability": "live_external_adapter_execution", + "status": "not_encoded", + "evidence": "The ELF fixture response path does not exercise an external memory project runtime." + }, + { + "capability": "docker_isolated_baseline", + "status": "pass", + "evidence": "ELF live baseline runs execute through docker-compose.baseline.yml for retrieval and lifecycle evidence." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "Checked-in source-of-truth rebuild fixture is encoded and passing." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "Checked-in work-resume fixtures are encoded and passing." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "Checked-in project-decision fixtures cover accepted decisions, reversals, current validation gates, rationale, and bounded caveats." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "Checked-in retrieval fixtures cover alternate phrasing, distractors, multi-hop routing, current-versus-obsolete selection, and minimal context." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "evidence": "Checked-in memory-evolution fixtures cover current-versus-historical facts and the relation temporal-validity case is encoded." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "Proposal-only consolidation fixtures are encoded and passing without source mutation." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "evidence": "The source-trace memory summary fixture is encoded and passing with freshness, rationale, tombstone, and unsupported-claim guards." 
+ }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "evidence": "The proactive brief suite scores 4 passing source-linked suggestions and 1 typed private-corpus refresh blocker tied to XY-930." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "evidence": "The scheduled memory suite scores 4 passing source-linked task readbacks with execution trace coverage and 1 typed private/provider scheduler blocker tied to XY-930." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "Knowledge page fixtures are encoded and passing with citation and rebuild metrics." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "Operator-debugging fixtures now expose stage attribution and dropped-candidate evidence without raw SQL." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "Four redaction, exclusion, source-id, evidence-binding, and capture-boundary fixtures are encoded and passing." + }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "evidence": "Six fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Production-ops fixtures encode restore, Qdrant rebuild, backfill resume, resource-envelope interpretation, OpenViking wrong-result classification, plus typed blocked operator boundaries." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The scoped preference fixture is encoded and passing." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked until same-corpus evidence ids and staged artifacts are materialized." 
+ } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory", + "status": "pass" + } + ], + "notes": [ + "This adapter record exists to keep ELF fixture results separate from live external adapter results.", + "The remaining non-pass ELF fixture states are the proactive-brief and scheduled-memory blockers tied to XY-930, production-ops operator boundaries, and OpenViking context-trajectory measurement gates.", + "Use elf_live_real_world for service-runtime real_world_job evidence; this fixture-backed record must not imply live-service behavior." + ] + }, + { + "adapter_id": "elf_live_real_world", + "project": "ELF", + "adapter_kind": "docker_service_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "ELF materializes 55 real_world_job adapter_response objects through ElfService, worker indexing, search_raw, live capture/write-policy ingestion, live consolidation proposal review, live knowledge-page rebuild/lint, and operator-debug trace metadata before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full live sweep scores 55 jobs across all 13 checked-in suites, including live-scored consolidation, knowledge-page, capture/write-policy, and operator-debug
suites. This is not a full-suite live pass because memory-evolution, production-ops, core-archival, and context-trajectory gaps remain typed non-pass records.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes real_world_job prompts after runtime ingestion and writes generated answer artifacts before scoring." + }, + { + "capability": "service_runtime_execution", + "status": "real", + "evidence": "The materializer uses ElfService, Postgres, Qdrant, deterministic providers, worker indexing, and search_raw in Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution is wrong_result and production/core/context boundaries remain typed non-pass." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "Adapter setup/runtime limitations are materialized as typed jobs with evidence JSON instead of silent claim upgrades." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "The live adapter retrieved the restore/Qdrant rebuild proof evidence through the service runtime." 
+ }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "The live adapter passed 5/5 work_resume jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "The live adapter passed 5/5 retrieval jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "The live adapter passed 5/5 project_decisions jobs through service-runtime evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "The live adapter passed the delete/TTL case but failed five current-versus-historical conflict jobs because retrieval-backed answers did not provide the required historical conflict evidence links." + }, + { + "suite_id": "consolidation", + "status": "pass", + "evidence": "The live adapter creates consolidation runs, materializes proposal jobs through the worker, preserves source lineage and unsupported-claim flags, and applies/defers/discards proposals through review audit transitions." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "evidence": "The live adapter rebuilds derived knowledge pages through ElfService, searches page sections, lints stale source refs after runtime source updates, and emits citation/backlink/unsupported-section page artifacts." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The full live sweep includes operator_debugging_ux fixtures and emits trace ids, viewer/admin trace-bundle links, replay commands, dropped-candidate visibility, repair-action clarity, and raw_sql_needed=false."
+ }, + { + "suite_id": "capture_integration", + "status": "pass", + "evidence": "The live adapter passes 4/4 capture_integration jobs through Docker-local ELF ingestion, including capture-boundary classification, excluded evidence ids, source ids in source_ref, write_policy redaction audit counts, evidence binding, and zero secret leakage." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "The live adapter retrieved the scoped preference evidence and passed the personalization job." + }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The full live adapter sweep preserves the core/archival fixture gap as typed not_encoded; this issue does not add live core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [ + { + "scenario_id": "live_capture_write_policy", + "suite_id": "capture_integration", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live capture/write-policy jobs pass for redaction, exclusions, source ids, evidence binding, and no secret leakage. 
This is an ELF self-check, not a win over external hook systems.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_consolidation_proposal_review", + "suite_id": "consolidation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live consolidation jobs now exercise source lineage, unsupported-claim flags, and apply/defer/discard review audit transitions. This is an ELF service self-check, not a broad competitor win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "live_knowledge_page_rebuild_lint", + "suite_id": "knowledge_compilation", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF live knowledge jobs now exercise page rebuild, search, stale-source lint, citations, backlinks, and unsupported-section handling. 
This is an ELF service self-check, not a broad knowledge-product win.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + }, + { + "scenario_id": "full_sweep_operator_debug", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF full live sweep now includes the operator-debug fixture tree with hydrated trace ids, trace-bundle replay commands, dropped-candidate visibility, repair guidance, and no raw SQL requirement.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/elf-materialization.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This Docker-isolated live real_world_job record now covers the full encoded fixture corpus, not only the original three-suite representative slice.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove private-corpus production quality or provider-backed production operations." 
+ ] + }, + { + "adapter_id": "qmd_live_baseline", + "project": "qmd", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs qmd inside the baseline container.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "pass", + "evidence": "qmd same-corpus retrieval, update, delete, and cold-start checks are encoded in the live baseline runner.", + "command": "ELF_BASELINE_PROJECTS=qmd cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "This live_baseline_only record is same-corpus evidence only; cite qmd_live_real_world for the full live real-world sweep.", + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "qmd has an encoded Docker same-corpus retrieval adapter." + }, + { + "capability": "update_delete_cold_start", + "status": "pass", + "evidence": "qmd lifecycle smoke checks are encoded in the live-baseline runner." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job prompts; cite qmd_live_real_world for the full live real-world sweep." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "This live_baseline_only record does not execute real_world_job retrieval prompts; cite qmd_live_real_world for the live retrieval adapter run." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Live-baseline lifecycle checks exist, but no real_world_job memory_evolution run is encoded." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd debug ergonomics are a reference dimension; no operator_debugging_ux fixture is executed against qmd." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + }, + { + "kind": "compose", + "ref": "docker-compose.baseline.yml", + "status": "real" + } + ], + "notes": [ + "This same-corpus record remains separate from qmd_live_real_world, which records real_world_job prompt execution and scoring evidence." + ] + }, + { + "adapter_id": "qmd_live_real_world", + "project": "qmd", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live adapter task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes 55 real_world_job adapter_response objects through collection add, update, embed, and query --json before scoring; the full sweep includes typed wrong_result, blocked, and not_encoded job records, with operator-debug fixtures scored through qmd replay metadata rather than ELF trace hydration.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The fresh full qmd live sweep scores 55 jobs across all 13 checked-in suites, preserving consolidation, knowledge-page, capture, production-ops, core-archival, and context-trajectory gaps as typed non-pass records. 
This is not a full-suite live pass.", + "command": "cargo make real-world-memory-live-adapters", + "artifact": "tmp/real-world-memory/live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "real_world_job_adapter", + "status": "pass", + "evidence": "qmd executes real_world_job prompts through its local CLI retrieval/query workflow and records generated answer artifacts." + }, + { + "capability": "local_cli_retrieval", + "status": "real", + "evidence": "The adapter uses qmd collection add, update, embed -f, and query --json inside Docker." + }, + { + "capability": "targeted_live_pass", + "status": "pass", + "evidence": "The answer-retrieval suites from the original representative slice still pass: work_resume, retrieval, and project_decisions." + }, + { + "capability": "full_suite_live_sweep", + "status": "wrong_result", + "evidence": "The runner now emits per-job and per-suite live records for all 55 checked-in jobs, including the operator-debug fixture tree, but memory_evolution and operator_debugging_ux are wrong_result while non-qmd product surfaces remain typed not_encoded or blocked." + }, + { + "capability": "full_suite_live_pass", + "status": "wrong_result", + "evidence": "No full-suite live pass is claimed; generated reports preserve wrong_result, blocked, and not_encoded job outcomes." + }, + { + "capability": "typed_failure_reporting", + "status": "pass", + "evidence": "qmd setup/runtime limitations are materialized as typed jobs with command evidence and retry artifacts." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "evidence": "qmd retrieved the restore/Qdrant rebuild proof evidence through the local CLI workflow." + }, + { + "suite_id": "work_resume", + "status": "pass", + "evidence": "qmd passed 5/5 work_resume jobs through CLI evidence retrieval." + }, + { + "suite_id": "retrieval", + "status": "pass", + "evidence": "qmd passed 5/5 retrieval jobs through CLI evidence retrieval." 
+ }, + { + "suite_id": "project_decisions", + "status": "pass", + "evidence": "qmd passed 5/5 project_decisions jobs through CLI evidence retrieval." + }, + { + "suite_id": "memory_evolution", + "status": "wrong_result", + "evidence": "qmd failed all six memory-evolution jobs in the fresh June 11 diagnostic, including the delete/TTL tombstone job where qmd retrieved only the current plan and missed the tombstone evidence." + }, + { + "suite_id": "consolidation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate or review consolidation proposals." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep retrieves evidence-linked answers but does not generate derived knowledge pages." + }, + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The full qmd live sweep includes operator_debugging_ux fixtures and records replay-command metadata, but it lacks ELF trace hydration, viewer links, and intermediate candidate-drop stages, so the suite remains wrong_result." + }, + { + "suite_id": "capture_integration", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep does not exercise capture integrations or write-policy redaction boundaries; all capture_integration jobs remain typed not_encoded for qmd." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "The qmd live adapter sweep does not run backup/restore, private corpus, provider credential, or backfill operations; existing production-ops credential and private-manifest boundaries remain blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "evidence": "qmd retrieved the scoped preference evidence and passed the personalization job." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "not_encoded", + "evidence": "The qmd live adapter sweep preserves the core/archival fixture gap as typed not_encoded; qmd does not expose ELF core-block attachment/readback materialization." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The OpenViking-style context trajectory fixtures remain blocked by live staged-trajectory and recursive-expansion measurement gaps." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_memory/", + "status": "real" + }, + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-memory-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/live-adapters/qmd-report.json", + "status": "pass" + } + ], + "notes": [ + "This qmd record is real-world job evidence and must not be conflated with the same-corpus qmd_live_baseline record.", + "The record is a full-suite sweep, not a full-suite pass; wrong_result, blocked, and not_encoded states remain visible.", + "This record does not prove broad RAG/graph adapter parity or private-corpus production quality." 
+ ] + }, + { + "adapter_id": "elf_operator_debug_live", + "project": "ELF", + "adapter_kind": "docker_service_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task runs inside docker-compose.baseline.yml with Docker-owned Postgres, Qdrant, Cargo, npm, qmd, and cache volumes.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + "run": { + "status": "pass", + "evidence": "ELF materializes operator_debugging_ux adapter_response objects through ElfService, worker indexing, search_raw trace ids, and generated operator_debug metadata.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + "result": { + "status": "pass", + "evidence": "The narrow live slice scores operator-debugging jobs with trace availability, replay command availability, candidate-drop visibility, repair-action clarity, and raw-SQL avoidance separated in job-level evidence.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": "pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through the live service materializer and generated scoring fixtures." + }, + { + "capability": "trace_hydration_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include service trace ids, viewer links, admin trace-bundle URLs, and trace_available=true." 
+ }, + { + "capability": "replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include admin trace-bundle curl replay commands; no raw SQL path is required." + }, + { + "capability": "candidate_drop_visibility", + "status": "pass", + "evidence": "The operator-debug jobs keep dropped-candidate visibility as explicit job-level evidence instead of relying on direct database inspection." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This ELF live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "evidence": "The narrow live operator-debug slice scores trace hydration, stage attribution, candidate-drop visibility, selected-but-not-narrated diagnosis, and repair-action clarity through generated ELF live artifacts." + } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated trace_available=true, service trace ids, viewer URLs, and admin trace-bundle replay URLs for the operator-debug jobs; qmd has replay rows but no ELF trace hydration surface.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF generated admin trace-bundle replay commands; qmd generated local CLI query replay commands. 
These are comparable replay-command availability artifacts, not equivalent UI quality claims.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "ELF generated operator_debug candidate-drop visibility from trace and replay-candidate metadata without direct SQL assumptions; qmd keeps only top-k replay rows and lacks intermediate candidate-drop stages.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "ELF and qmd generated clear repair/replay steps for the narrow operator-debug jobs; OpenMemory UI/export remains blocked, and claude-mem UI repair paths remain blocked until Docker-contained hook/viewer evidence exists.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "The new selected-but-not-narrated job scores whether selected trace evidence is available for answer-composition repair without direct database inspection.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + 
"status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/elf-report.json", + "status": "pass" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "The record does not implement product UI improvements and does not claim broad qmd/OpenMemory/claude-mem superiority." + ] + }, + { + "adapter_id": "qmd_operator_debug_live", + "project": "qmd", + "adapter_kind": "docker_cli_operator_debug_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The narrow operator-debug live task clones and installs qmd inside the baseline Docker container when the checkout is absent.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + "run": { + "status": "wrong_result", + "evidence": "qmd materializes operator_debugging_ux adapter_response objects through collection add, update, embed, and query --json, then records local replay-command metadata but no service trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The narrow live slice gives qmd explicit replay-command evidence, but operator-debug jobs remain wrong_result where trace availability, trace completeness, or candidate-drop stage visibility is required.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.md" + }, + "capabilities": [ + { + "capability": "operator_debug_real_world_job_adapter", + "status": 
"pass", + "evidence": "The adapter executes the checked-in operator_debugging_ux jobs through qmd local CLI materialization and generated scoring fixtures." + }, + { + "capability": "local_replay_command_metadata", + "status": "pass", + "evidence": "Generated operator_debug records include qmd query replay commands tied to per-job collections." + }, + { + "capability": "trace_hydration_metadata", + "status": "wrong_result", + "evidence": "Generated qmd operator_debug records have trace_available=false and no ELF viewer/admin trace bundle because qmd exposes local replay rows rather than service trace hydration." + }, + { + "capability": "candidate_drop_visibility", + "status": "wrong_result", + "evidence": "qmd top-k replay output is available, but intermediate candidate-drop stages are not exposed in the generated artifact." + }, + { + "capability": "openmemory_or_claude_mem_ui_runner", + "status": "not_encoded", + "evidence": "This qmd live slice does not launch OpenMemory or claude-mem UI flows." + } + ], + "suites": [ + { + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "evidence": "The narrow qmd operator-debug slice scores local replay commands but remains wrong_result for trace hydration and candidate-drop stage visibility." 
+ } + ], + "scenarios": [ + { + "scenario_id": "operator_debug_trace_hydration", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated replay-command metadata but trace_available=false, so ELF wins only this trace-hydration dimension; this is not a broad qmd loss.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_replay_command", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated local CLI query replay commands for the same operator-debugging scenarios; ELF generated admin trace-bundle curl commands.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/summary.json" + }, + { + "scenario_id": "operator_debug_candidate_drop_visibility", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd generated top-k replay output but not intermediate retrieved-but-dropped stage visibility, so candidate-drop diagnosis remains a qmd wrong_result in this narrow slice.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-materialization.json" + }, + { + "scenario_id": "operator_debug_repair_action_clarity", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "qmd generated clear local replay steps for repair investigation, matching ELF on repair-action clarity while differing on trace hydration.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": 
"tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + }, + { + "scenario_id": "operator_debug_selected_but_not_narrated", + "suite_id": "operator_debugging_ux", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "qmd can replay top-k rows, but the generated artifact does not expose service trace narration stages for the selected-but-not-narrated diagnosis.", + "command": "cargo make real-world-job-operator-ux-live-adapters", + "artifact": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json" + } + ], + "evidence": [ + { + "kind": "fixture_dir", + "ref": "apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make real-world-job-operator-ux-live-adapters", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-job/operator-ux-live-adapters/qmd-report.json", + "status": "wrong_result" + } + ], + "notes": [ + "This is a narrow operator-debug live slice, not a full-suite live pass.", + "qmd's replay-command availability remains useful; the wrong_result status is limited to trace hydration and candidate-drop stage visibility." 
+ ] + }, + { + "adapter_id": "agentmemory_live_baseline", + "project": "agentmemory", + "adapter_kind": "docker_sdk_mock_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "lifecycle_fail", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner installs and exercises agentmemory package APIs.", + "command": "ELF_BASELINE_PROJECTS=agentmemory cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/agentmemory.log" + }, + "run": { + "status": "lifecycle_fail", + "evidence": "Same-corpus retrieval can run, but durable lifecycle behavior is not proven because the adapter uses an in-memory SDK/KV mock.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "lifecycle_fail", + "evidence": "agentmemory remains a reference for capture and continuity UX, but current Docker evidence is not a durable lifecycle pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "The current adapter can run mem::remember and mem::search against the shared corpus." + }, + { + "capability": "adapter_storage", + "status": "mocked", + "evidence": "The current adapter uses a process-local StateKV Map and in-memory index." + }, + { + "capability": "durable_cold_start", + "status": "blocked", + "evidence": "A persistent upstream KV/index path or hosted runtime is needed before cold-start recovery can be fairly scored." + }, + { + "capability": "durable_work_resume_capture_path", + "status": "blocked", + "evidence": "XY-925 selects the next local path as a Docker-contained agentmemory session directory with persisted SDK KV store, observation log, and searchable index across a fresh process; the current StateKV Map and in-memory index still block scoring." 
+ }, + { + "capability": "write_policy_hook_capture", + "status": "blocked", + "evidence": "Capture/write-policy jobs require live agentmemory hook observations plus persisted write-policy audit evidence. The current adapter does not execute those hooks." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed blocked prompt coverage for the required durable path, but no live agentmemory real_world_job adapter executes prompts until the persistent local store exists." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "A durable upstream agentmemory session/capture path is required before work-resume jobs can be compared fairly." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "The current fixture import boundary is offline and does not run live agentmemory hooks." + }, + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Durable update/supersede/delete history is not proven by the in-memory adapter." + } + ], + "scenarios": [ + { + "scenario_id": "basic_same_corpus_retrieval", + "suite_id": "retrieval", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports agentmemory retrieval_pass with 3/3 same-corpus retrieval checks through mem::remember and mem::search. 
This is live-baseline-only evidence through an in-memory mock, not a real_world_job suite pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "durable_update_reload_lifecycle", + "suite_id": "memory_evolution", + "status": "lifecycle_fail", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks, while agentmemory update_replaces_note_text is lifecycle_fail and cold_start_recovery_search is blocked because the harness uses an in-memory SDK/KV mock. This is an ELF baseline win only at the local lifecycle-smoke evidence class.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "work_resume_capture_continuity", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory's relevant strength is durable coding-agent continuity and capture, but the Docker harness has not proven a persistent session/capture path. 
XY-925 selects the durable local path as a Docker-contained session directory that persists the SDK KV store and searchable index across a fresh process; keep work_resume and capture claims blocked until that path exists.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "tmp/real-world-memory/first-generation-oss/report.json" + }, + { + "scenario_id": "durable_work_resume_local_path", + "suite_id": "work_resume", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The selected comparable path is explicit: capture into a Docker-local agentmemory session directory, persist the SDK KV/index and observation log, restart a fresh process, then score work_resume prompts. The checked-in fixture records this as blocked rather than scoring the current mock.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + }, + { + "scenario_id": "capture_write_policy_hooks", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "agentmemory capture/write-policy comparison needs live hook observations and write-policy audit evidence persisted through the selected local store. 
The fixture preserves this as a typed blocker and does not convert the mem::remember smoke into capture proof.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/agentmemory_durable_capture_path_blocked.json" + } + ], + "evidence": [ + { + "kind": "evidence", + "ref": "docs/evidence/external_memory/agentmemory_adapter.md", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "mocked" + } + ], + "notes": [ + "The offline agentmemory fixture adapter is an import/comparison boundary and must not be treated as live benchmark proof." + ], + "follow_up": { + "title": "[ELF benchmark P0] Make agentmemory adapter lifecycle-durable and fail-typed", + "reason": "A durable upstream agentmemory storage path is required before lifecycle and real-world job suites can be fairly scored." + } + }, + { + "adapter_id": "mem0_openmemory_live_baseline", + "project": "mem0/OpenMemory", + "adapter_kind": "docker_sdk_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install mem0 and configure local FastEmbed/Qdrant paths.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 exercises local OSS mem0 with FastEmbed, Qdrant path storage, Memory.update, Memory.delete, Memory.history, Memory.get_all, entity filters, and cold-start reload; mem0 passed 8/8 encoded SDK checks. 
XY-931 adds a separate OpenMemory export-helper setup probe artifact and keeps that blocked UI/export result out of the SDK check summary.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "The local OSS mem0 baseline now passes same-corpus retrieval, update/delete/reload, preference correction history, entity-scoped personalization, local get_all export-style readback, and deletion audit history. The separate OpenMemory export-helper setup probe is blocked because Docker is unavailable inside the baseline-runner container before any product app database readback can run. It still does not claim hosted Platform export, optional graph memory, or a real_world_job prompt adapter.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "local_storage", + "status": "real", + "evidence": "The adapter targets local FastEmbed, Qdrant path storage, and local history DB paths in Docker." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "local_lifecycle_update_delete_reload", + "status": "pass", + "evidence": "The Docker runner exercises public Memory.update, Memory.delete, and a new Memory.from_config over the same local Qdrant/history paths; the fresh scoped run reports those lifecycle checks passing." + }, + { + "capability": "preference_correction_history", + "status": "pass", + "evidence": "The fresh scoped run reports preference_correction_history as pass: Memory.history preserved explicit ADD and UPDATE records with old and current preference text, and search returned only the current correction." 
+ }, + { + "capability": "entity_scoped_personalization", + "status": "pass", + "evidence": "The fresh scoped run reports entity_scoped_personalization as pass: user_id, agent_id, and run_id filters returned the ELF scoped preference and omitted a PubFi scoped preference." + }, + { + "capability": "local_get_all_export_readback", + "status": "pass", + "evidence": "The fresh scoped run reports local_get_all_export_readback as pass: Memory.get_all returned the current scoped preference and omitted the other scope." + }, + { + "capability": "deletion_audit_history", + "status": "pass", + "evidence": "The fresh scoped run reports delete_history_audit_readback as pass: Memory.history exposed a DELETE event and search suppressed the deleted memory." + }, + { + "capability": "openmemory_ui_readback", + "status": "blocked", + "evidence": "XY-931 runs a bounded OpenMemory export-helper setup probe after the mem0 SDK corpus checks. The probe finds the OpenMemory tree, UI package, compose file, and export helper, then records a setup blocker because the export helper requires Docker access to a running OpenMemory container. Local SDK get_all readback is measured separately and must not be reused as UI evidence." + }, + { + "capability": "hosted_managed_memory_claims", + "status": "unsupported", + "evidence": "Hosted mem0 Platform behavior and Platform UI export are outside the local OSS Docker adapter and are non-goals for this local evidence record." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No mem0/OpenMemory adapter currently executes real_world_job prompts and answer scoring." + }, + { + "capability": "optional_graph_memory", + "status": "not_encoded", + "evidence": "Optional graph memory is not enabled in the default local OSS path and remains an opt-in scenario gate rather than a default pass/fail claim." 
+ } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure preference correction history and deletion audit readback, but no mem0 real_world_job memory_evolution prompt adapter is encoded." + }, + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Scenario-level local OSS checks now measure entity-scoped personalization, but no mem0 real_world_job personalization prompt adapter is encoded." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "Local SDK get_all inspection is measured, but OpenMemory UI/export readback is blocked by the XY-931 export-helper setup probe until a dedicated OpenMemory compose/import path can load the same corpus into the OpenMemory app database." + } + ], + "scenarios": [ + { + "scenario_id": "basic_local_lifecycle", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Prior comparable baseline run live-baseline-20260611061612 reports ELF passing 8/8 local lifecycle checks and mem0 passing basic same-corpus retrieval, update, delete, and cold-start reload checks. This remains a basic local lifecycle tie at the encoded smoke surface and is not reused as history/UI evidence.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "preference_correction_history", + "suite_id": "personalization", + "status": "pass", + "elf_position": "loses", + "comparison_outcome": "loss", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 preference_correction_history as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF live memory-evolution preference as wrong_result. The current measured comparison is therefore an ELF loss on this history dimension until ELF temporal reconciliation is fixed.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "entity_scoped_personalization", + "suite_id": "personalization", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 entity_scoped_personalization as pass. ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md, which records ELF and qmd passing the encoded personalization slice. This is a measured tie on the current scoped-preference surface.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-competitor-strength-adoption-report.md" + }, + { + "scenario_id": "delete_audit_readback", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 delete_history_audit_readback as pass. 
ELF-side evidence comes from cargo make real-world-memory-live-adapters as summarized in docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md, which records ELF passing the delete/TTL tombstone job. The current measured delete-audit comparison is a tie.", + "command": "mem0: ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker; ELF: cargo make real-world-memory-live-adapters", + "artifact": "mem0: tmp/live-baseline/mem0-checks.json; ELF: tmp/real-world-memory/live-adapters/ and docs/evidence/benchmarking/2026-06-11-temporal-history-competitor-gap-report.md" + }, + { + "scenario_id": "local_get_all_export_readback", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh scoped baseline run live-baseline-20260611122416 reports mem0 local_get_all_export_readback as pass. This is local SDK inspection/export-style readback, not OpenMemory UI evidence; ELF has no directly comparable live UI/export scoring row in this run.", + "command": "ELF_BASELINE_PROJECTS=mem0 cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/mem0-checks.json" + }, + { + "scenario_id": "openmemory_ui_export_readback", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The XY-931 OpenMemory export-helper setup probe is Docker-contained in the mem0 baseline run. It detects the OpenMemory product tree, UI package, compose file, and export helper, but Docker is unavailable inside the baseline-runner container before the helper can reach a running OpenMemory product container or app database. 
Basic lifecycle and local SDK get_all readback are not reused as UI/export proof.", + "command": "cargo make openmemory-ui-export-readback", + "artifact": "tmp/live-baseline/mem0-openmemory-ui-export.json" + }, + { + "scenario_id": "hosted_platform_export", + "suite_id": "operator_debugging_ux", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted mem0 Platform export is explicitly outside the local OSS Docker comparison and is not counted as a local pass, loss, or blocker.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "optional_graph_memory", + "suite_id": "memory_evolution", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Optional graph memory is kept as an opt-in scenario gate. It is not enabled in the default mem0 local OSS run and is not part of the default pass/fail comparison.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Separate local OSS mem0 SDK evidence from OpenMemory product UI/export claims.", + "A blocked OpenMemory export-helper setup probe is not an ELF win or loss until the product app can import and export the same local corpus." 
+ ] + }, + { + "adapter_id": "memsearch_live_baseline", + "project": "memsearch", + "adapter_kind": "docker_cli_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "pass", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install memsearch and run its CLI path.", + "command": "ELF_BASELINE_PROJECTS=memsearch cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/memsearch.log" + }, + "run": { + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 indexes a per-adapter corpus copy, rewrites and deletes files, reruns memsearch index, and reports memsearch 4/4 encoded checks passing.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "pass", + "evidence": "memsearch now passes the local same-corpus/reindex/update/delete/reload smoke. No real_world_job memsearch prompt adapter is encoded, so Markdown-first behavior remains baseline scenario evidence rather than suite pass evidence.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "canonical_markdown_store", + "status": "real", + "evidence": "memsearch is tracked as a Markdown-first source-of-truth reference." + }, + { + "capability": "same_corpus_retrieval", + "status": "pass", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch retrieval_pass with 3/3 same-corpus retrieval checks." + }, + { + "capability": "reindex_update_delete_reload", + "status": "pass", + "evidence": "The runner rewrites auth-memory.md, deletes a second corpus file, reruns memsearch index, and starts fresh memsearch search processes; the fresh scoped run reports update, delete, and cold-start reload passing." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Markdown source-store and retrieval-debug jobs, but no live memsearch runtime adapter executes real_world_job prompts and answer scoring." + }, + { + "capability": "markdown_source_store_prompt_jobs", + "status": "pass", + "evidence": "The first-generation OSS fixture slice encodes source-of-truth rebuild/reload and retrieval-debug prompts over the canonical Markdown store while preserving the live-baseline-only evidence boundary." + } + ], + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "not_encoded", + "evidence": "The Markdown-first source model passed the local reindex/reload smoke, and XY-925 adds fixture-backed source-of-truth prompt coverage over the canonical Markdown store. No live memsearch runtime adapter executes prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The Docker same-corpus check passes, and XY-925 adds fixture-backed retrieval-debug prompt coverage over memsearch CLI replay and Markdown source inspection. No live memsearch runtime adapter executes retrieval prompt scoring yet, so this is not a suite pass." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Update/delete reindex semantics pass in Docker, but memory_evolution real_world_job prompts are not encoded for memsearch." + } + ], + "scenarios": [ + { + "scenario_id": "canonical_markdown_reindex_reload", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports memsearch passed same-corpus retrieval, update reindex, delete suppression, and cold-start reload over a canonical Markdown corpus. 
ELF has no directly comparable canonical Markdown source-store scenario in this baseline, so the ELF position remains untested.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "markdown_source_store_rebuild_reload_prompt", + "suite_id": "trust_source_of_truth", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in real_world_job prompt fixture that asks for the memsearch source-of-truth path and rebuild/reload boundary: canonical Markdown files are authoritative, while the index is derived by rerunning memsearch index. This is fixture-backed scenario coverage plus baseline artifact evidence, not a memsearch live real_world_job suite pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_markdown_rebuild_reload.json" + }, + { + "scenario_id": "markdown_retrieval_debug_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds a checked-in retrieval-debug prompt over memsearch's canonical Markdown store. 
The expected debug surface is CLI replay plus Markdown source inspection and reindexing; staged expansion/fusion/rerank/candidate-drop trace bundles remain not encoded for memsearch.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/memsearch_retrieval_debug_prompt.json" + }, + { + "scenario_id": "ttl_expiry_lifecycle", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "The encoded memsearch CLI path supports reindex/delete but no TTL or expiry behavior. Unsupported TTL behavior is preserved as unsupported competitor evidence and does not create an ELF win/loss claim without a directly comparable scenario artifact.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "real_world_prompt_adapter", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "No live memsearch runtime adapter currently executes real_world_job prompts and answer scoring. XY-925 fixture-backed prompt jobs document the source-store and retrieval-debug shape, while baseline retrieval/reindex evidence remains separate from suite pass claims.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "Do not mark memsearch worse solely because setup or local indexing is heavier; preserve the typed incomplete/wrong-result boundary." 
+ ] + }, + { + "adapter_id": "openviking_live_baseline", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "OpenViking local-embed setup installed and imported pinned llama-cpp-python==0.3.28 from the CPU wheel index in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The adapter reached same-corpus add_resource/find and now exposes expected/matched/missing evidence ids, but returned 0 of 3 expected evidence-term matches in the smoke run.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The current OpenViking Docker evidence is a behavioral wrong_result, not a local embedding setup blocker and not a real_world_job pass.", + "artifact": "docs/runbook/benchmarking/live_baseline_benchmark.md" + }, + "capabilities": [ + { + "capability": "local_embed_setup", + "status": "pass", + "evidence": "Docker local embedding dependency setup is pinned to llama-cpp-python==0.3.28 from https://abetlen.github.io/llama-cpp-python/whl/cpu and reached import/runtime in the smoke run." + }, + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "OpenViking add_resource/find returned resources but missed expected evidence-term matches for every smoke query." + }, + { + "capability": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged/hierarchical retrieval is now encoded as blocked context_trajectory fixtures until same-corpus expected evidence ids match and staged artifacts are materialized." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No OpenViking adapter currently executes real_world_job prompts and answer scoring." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "The Docker-local setup reached add_resource/find, but the retrieval check returned 0/3 expected evidence-term matches." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Hierarchical context resume scenarios are not encoded for OpenViking." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "The staged retrieval, hierarchy selection, and recursive/context expansion fixtures are encoded as blocked behind same-corpus evidence output and staged artifact readback." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + }, + { + "label": "llama-cpp-python CPU wheel index", + "url": "https://abetlen.github.io/llama-cpp-python/whl/cpu", + "evidence": "Official prebuilt CPU wheel index used by the Docker-local embedding pin." + } + ], + "setup_path": "Run ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker. 
The runner installs llama-cpp-python==0.3.28 with --only-binary llama-cpp-python from the CPU wheel index before OpenViking add_resource/find.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host-global OpenViking, llama-cpp-python, or model service install is required.", + "resource_expectation": "Local embedding setup may download a CPU wheel and model assets; record OpenViking.log, elapsed time, and cache size before claiming adapter quality.", + "retry_guidance": [ + "Use the default pinned CPU wheel path first.", + "Override ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_VERSION or ELF_BASELINE_OPENVIKING_LLAMA_CPP_PYTHON_INDEX only when the default wheel is unavailable for the Docker platform.", + "Treat install/import failure as incomplete, not wrong_result; treat add_resource/find evidence misses as wrong_result." + ] + }, + "notes": [ + "Record OpenViking as wrong_result now that the pinned Docker local embedding path reaches add_resource/find but misses expected evidence; keep context_trajectory as blocked until staged artifacts exist." + ], + "follow_up": { + "title": "Fix OpenViking evidence-bearing same-corpus retrieval output and materialize staged artifacts", + "reason": "The current adapter reaches add_resource/find and exposes expected evidence ids, but must match evidence ids and return stage/hierarchy/recursive artifacts before trajectory quality can be scored." 
+ } + }, + { + "adapter_id": "claude_mem_live_baseline", + "project": "claude-mem", + "adapter_kind": "docker_repository_same_corpus", + "evidence_class": "live_baseline_only", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "The live-baseline Docker runner can install and build claude-mem.", + "command": "ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/claude-mem.log" + }, + "run": { + "status": "wrong_result", + "evidence": "The Docker runner now uses a durable SQLite file, exercises repository update/delete/reopen checks, and reports missed same-corpus or lifecycle evidence as typed non-pass.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "result": { + "status": "wrong_result", + "evidence": "No real_world_job claude-mem adapter is encoded; progressive disclosure remains a design reference.", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + "capabilities": [ + { + "capability": "same_corpus_retrieval", + "status": "wrong_result", + "evidence": "The current Docker adapter did not prove correct same-corpus retrieval." + }, + { + "capability": "durable_storage", + "status": "real", + "evidence": "The runner writes to a Docker-local SQLite file and constructs a new Database plus repository instances for cold-start recovery search." + }, + { + "capability": "repository_lifecycle", + "status": "real", + "evidence": "The runner uses MemoryItemsRepository.update, deletes from the repository-owned memory_items table, and relies on repository FTS triggers for update/delete checks." + }, + { + "capability": "repository_progressive_disclosure", + "status": "real", + "evidence": "The runner verifies search result to getById detail hydration and listSources source evidence on the durable repository path." 
+ }, + { + "capability": "progressive_disclosure_real_world_job", + "status": "pass", + "evidence": "XY-925 adds fixture-backed prompt coverage for the Docker-contained repository progressive-disclosure path: search result to getById detail hydration and listSources evidence on durable SQLite. Hook, timeline, and viewer workflows remain blocked separately." + }, + { + "capability": "retrieval_repair_artifact", + "status": "wrong_result", + "evidence": "The same-corpus retrieval smoke remains wrong_result, and XY-925 records a repair prompt that tells operators to rerun ELF_BASELINE_PROJECTS=claude-mem cargo make baseline-live-docker before inspecting tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json." + }, + { + "capability": "hook_capture_viewer_workflow", + "status": "blocked", + "evidence": "The current Docker runner does not launch claude-mem hooks, timeline capture, local viewer readback, or an operator workflow over the same corpus." + } + ], + "suites": [ + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "The durable repository run is encoded, but hook-driven capture and real_world_job work-resume prompts are not proven by that local repository check." + }, + { + "suite_id": "operator_debugging_ux", + "status": "blocked", + "evidence": "XY-925 adds fixture-backed progressive-disclosure and retrieval-repair prompt coverage, but local viewer/operator workflow remains blocked until a Docker-contained viewer or equivalent readback runner exists." + }, + { + "suite_id": "capture_integration", + "status": "blocked", + "evidence": "claude-mem hook capture remains blocked because hooks, timeline capture, and observation workflows are not executed by this runner." 
+ } + ], + "scenarios": [ + { + "scenario_id": "same_corpus_retrieval", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF retrieval_pass and claude-mem same_corpus_retrieval as wrong_result with 0/3 expected query checks passing, while its durable repository setup completed. This is an ELF baseline win for the narrow retrieval smoke scenario.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "retrieval_repair_artifact_path", + "suite_id": "retrieval", + "status": "wrong_result", + "elf_position": "wins", + "comparison_outcome": "win", + "evidence": "XY-925 adds a checked-in repair prompt that preserves the claude-mem wrong_result and names rerun/inspection targets from the reproducible Docker baseline: tmp/live-baseline/claude-mem.log and tmp/live-baseline/claude-mem-checks.json. This is repair evidence for a miss, not a retrieval pass.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_retrieval_repair.json" + }, + { + "scenario_id": "repository_lifecycle_reload", + "suite_id": "memory_evolution", + "status": "pass", + "elf_position": "ties", + "comparison_outcome": "tie", + "evidence": "Fresh comparable baseline run live-baseline-20260611061612 reports ELF passing local lifecycle checks and claude-mem update, delete, and cold-start reload checks passing over a durable Docker-local SQLite repository. 
This is a local lifecycle-smoke tie, not a hook-driven work-resume or full progressive-disclosure job pass.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_detail_hydration", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "claude-mem passed the repository-level search-to-detail/source hydration check, which is a useful progressive-disclosure signal. ELF does not have a directly comparable claude-mem-style progressive-disclosure scenario in this baseline, so the ELF position remains untested rather than a loss claim.", + "command": "ELF_BASELINE_PROJECTS=ELF,agentmemory,mem0,memsearch,claude-mem cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/live-baseline-report.json" + }, + { + "scenario_id": "progressive_disclosure_prompt", + "suite_id": "operator_debugging_ux", + "status": "pass", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-925 adds fixture-backed prompt coverage that asks for the measured claude-mem progressive-disclosure boundary: repository search results hydrate through getById and listSources on durable SQLite, but hooks, timeline, viewer, and live prompt scoring are not executed.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_progressive_disclosure.json" + }, + { + "scenario_id": "hook_capture_viewer_workflow", + "suite_id": "capture_integration", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "The Docker baseline uses repository classes only. 
claude-mem hooks, viewer, timeline, and observation workflows are not executed by the runner, so XY-925 preserves this as a typed blocker rather than not_encoded prose.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + }, + { + "scenario_id": "viewer_operator_workflow", + "suite_id": "operator_debugging_ux", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "A fair claude-mem viewer/operator comparison needs a Docker-contained run that opens the local viewer or equivalent readback over the same durable SQLite corpus and emits timeline, detail hydration, and repair-command artifacts. That path is not available in the current runner.", + "command": "cargo make real-world-first-generation-oss", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/first_generation_oss/claude_mem_hook_viewer_blocked.json" + } + ], + "evidence": [ + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "notes": [ + "claude-mem remains a UX reference; durable repository checks do not prove hook, viewer, or full real-world progressive-disclosure behavior." 
+ ] + }, + { + "adapter_id": "qmd_deep_profile_gate", + "project": "qmd", + "adapter_kind": "docker_cli_deep_profile_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "pass", + "evidence": "qmd already has a Docker CLI live-baseline adapter; this gate records the deeper profile extension before a separate scaled run is claimed.", + "command": "ELF_BASELINE_PROJECTS=qmd ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/qmd.log" + }, + "run": { + "status": "not_encoded", + "evidence": "The XY-899 strength-profile report is checked in, but no new live qmd deep-profile adapter artifact is claimed from it." + }, + "result": { + "status": "not_encoded", + "evidence": "The XY-899 report records qmd scenario-level retrieval/debug/replay outcomes and wrong-result diagnosis taxonomy, while expansion/fusion/rerank scoring remains not_encoded.", + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + }, + "capabilities": [ + { + "capability": "stress_profile_retrieval_debug", + "status": "not_encoded", + "evidence": "The stress command path exists, but this adapter-pack gate has not published a deep qmd profile result." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "The qmd live real-world sweep covers the current encoded fixture corpus; expanded retrieval-debug strength suites still need their own materialized adapter run." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "Repository-supported qmd benchmark runs must stay inside docker-compose.baseline.yml and must not require host-global installs." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "A deeper stress retrieval-debug report is not checked in for this gate." 
+ }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "qmd query planning and score readback are not yet scored as operator-debugging real_world_job outputs." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/tobi/qmd", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "qmd repository", + "url": "https://github.com/tobi/qmd", + "evidence": "Official qmd source for local hybrid search, CLI setup, and query behavior." + } + ], + "setup_path": "Use the existing Docker baseline qmd install, collection add, update, embed, and query flow with scale or stress profiles.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container with project files and caches inside Docker volumes.", + "resource_expectation": "CPU local embedding and rerank cost scale with corpus size; record elapsed time and qmd log artifacts before claims.", + "retry_guidance": [ + "Run qmd stress profile in Docker and publish the artifact path.", + "Map qmd JSON output to retrieval-debug real_world_job scoring before suite claims." + ], + "research_depth": "D2 reviewed; deep profile not encoded" + }, + "notes": [ + "This gate deepens qmd planning without changing the existing qmd pass evidence from the smoke live baseline." 
+ ] + }, + { + "adapter_id": "openviking_deep_profile_gate", + "project": "OpenViking", + "adapter_kind": "docker_local_embed_context_trajectory_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "pass", + "evidence": "The default pinned OpenViking local embedding dependency path reaches runtime in Docker.", + "command": "ELF_BASELINE_PROJECTS=OpenViking cargo make baseline-live-docker", + "artifact": "tmp/live-baseline/OpenViking.log" + }, + "run": { + "status": "blocked", + "evidence": "The XY-928 context_trajectory fixtures encode staged retrieval, hierarchy selection, and recursive/context expansion as blocked; no live trajectory adapter artifact is claimed." + }, + "result": { + "status": "blocked", + "evidence": "No OpenViking deep context-trajectory result is claimed from the current wrong-result smoke run; the XY-928 fixtures preserve trajectory surfaces as blocked/not_tested.", + "artifact": "docs/evidence/benchmarking/2026-06-11-qmd-openviking-strength-profile-report.md" + }, + "capabilities": [ + { + "capability": "docker_local_embed_setup", + "status": "pass", + "evidence": "The local embedding setup is pinned and reaches import/runtime in Docker." + }, + { + "capability": "hierarchical_context_trajectory", + "status": "blocked", + "evidence": "Stage trajectory scoring is encoded as blocked until the smoke adapter returns evidence-bearing same-corpus output and selected hierarchy/expansion artifacts." + }, + { + "capability": "host_global_install_boundary", + "status": "unsupported", + "evidence": "The adapter pack must not ask operators to install OpenViking dependencies globally on the host." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "wrong_result", + "evidence": "Same-corpus retrieval is still the precondition and remains wrong_result in the live baseline." 
+ }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "evidence": "OpenViking staged retrieval, hierarchy selection, and recursive/context expansion jobs are encoded as blocked fixtures." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Trajectory readback is a reference feature but not a scored adapter output." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/volcengine/OpenViking/", + "status": "real" + }, + { + "kind": "runner", + "ref": "scripts/live-baseline-benchmark.sh", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenViking repository", + "url": "https://github.com/volcengine/OpenViking/", + "evidence": "Official source for OpenViking local context database, resource, and retrieval APIs." + } + ], + "setup_path": "Use the pinned Docker local embedding path from scripts/live-baseline-benchmark.sh, then run OpenViking add_resource/find before any deep profile scoring.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner container; no host model or compiler setup outside Docker.", + "resource_expectation": "Local embedding setup can download CPU wheels and model assets; record build/import logs, model cache size, and elapsed time.", + "retry_guidance": [ + "Run the default pinned llama-cpp-python==0.3.28 CPU wheel path first.", + "Override the OpenViking llama-cpp-python version or index only when the default wheel is unavailable for the Docker platform.", + "Fix evidence-bearing same-corpus output and materialize selected hierarchy/expansion artifacts before converting blocked context_trajectory fixtures into scored jobs." + ], + "research_depth": "D2 reviewed; local embedding setup pinned; blocked fixtures encoded" + }, + "notes": [ + "OpenViking remains a context-trajectory reference, but this gate prevents a smoke wrong_result or blocked fixture from becoming a deep-profile win claim." 
+ ] + }, + { + "adapter_id": "ragflow_research_gate", + "project": "RAGFlow", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe tiny-corpus evidence smoke into a generated real_world_job report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The live path requires explicit resource-envelope opt-in and a local self-hosted RAGFlow API key; setup failures stay typed in the generated smoke artifact.", + "command": "ELF_RAGFLOW_SMOKE_START=1 ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1 cargo make smoke-ragflow-docker", + "artifact": "tmp/real-world-memory/ragflow-smoke/memory_projects_manifest.ragflow-smoke.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits ragflow-report.json and ragflow-report.md from one generated retrieval job. Pass or wrong_result is allowed only when returned reference chunks map to generated evidence ids; resource, setup, and API-key limits remain typed blockers.", + "artifact": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json" + }, + "capabilities": [ + { + "capability": "adapter_candidate_verdict", + "status": "not_encoded", + "evidence": "XY-882 completed D1/D2 feasibility research and marks RAGFlow adapter_candidate; no adapter run is encoded." + }, + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The smoke records official Docker setup, image/disk/startup envelope, CPU/GPU mode, vm.max_map_count handling, provider boundaries, and retry behavior." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "One generated retrieval job is scored from the smoke artifact or typed blocked when resource, service, or local API-key boundaries stop execution." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The scored smoke does not claim broad RAGFlow quality, private corpus behavior, scale, or comparative ranking." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated retrieval smoke is scored as pass, wrong_result, blocked, or incomplete by ragflow-report.json; the checked-in row remains blocked until live reference chunks map to evidence ids." + }, + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "RAGFlow knowledge output is not mapped to real_world_job page or citation scoring." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "evidence": "Resource envelope and service startup retry guidance must be documented first." + } + ], + "scenarios": [ + { + "scenario_id": "reference_chunk_citation_mapping", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for RAGFlow reference-chunk citation scoring. 
The job must remain blocked until returned reference chunks include generated document ids, chunk ids, content, and document metadata mapped to benchmark evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "retrieval_quality_reference_recall", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-1071 keeps RAGFlow retrieval quality blocked until the same generated corpus returns answer text and selected reference chunks whose document ids, chunk ids, content, and metadata map to expected evidence ids; setup or API reachability alone is not retrieval quality evidence.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "navigation_quality_document_chunks", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "RAGFlow document/chunk navigation remains blocked until returned references expose stable document metadata plus chunk identifiers that can be followed back to same-corpus source evidence.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "answer_faithfulness_reference_chunks", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "RAGFlow answer faithfulness is blocked until generated answers can be checked against returned reference chunk content and decoy/stale chunks are absent from cited support.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": 
"apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/ragflow_reference_chunks_blocked.json" + }, + { + "scenario_id": "stale_source_behavior", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "RAGFlow stale-source replacement, invalidation, or lint behavior is not encoded by the current same-corpus reference-chunk blocker; no stale-source quality claim is made.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "knowledge_compilation_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "RAGFlow knowledge compilation quality is not scored because no checked-in same-corpus RAGFlow page, section, citation, or stale-source lint artifact exists.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "private_or_large_corpus_ragflow_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Private corpus, large-corpus, and hosted RAGFlow quality are outside the generated-public Docker representative lane and must not be inferred from smoke reports.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/infiniflow/ragflow", + "status": "real" + }, + { + "kind": "source", + "ref": "https://ragflow.io/docs/", + "status": "real" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/ragflow-smoke/ragflow-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": 
"RAGFlow repository", + "url": "https://github.com/infiniflow/ragflow", + "evidence": "Official source for RAGFlow service code and Docker Compose setup." + }, + { + "label": "RAGFlow docs", + "url": "https://ragflow.io/docs/", + "evidence": "Official deployment and setup documentation." + }, + { + "label": "RAGFlow HTTP API reference", + "url": "https://raw.githubusercontent.com/infiniflow/ragflow/main/docs/references/http_api_reference.md", + "evidence": "Official reference for OpenAI-compatible responses with reference chunks and document metadata." + } + ], + "setup_path": "Implement a tiny Docker evidence-smoke runner using the official Docker deployment, dataset ingest API, and OpenAI-compatible query API.", + "runtime_boundary": "Run scripts/ragflow-docker-evidence-smoke.sh through cargo make; the live path uses the official RAGFlow Docker Compose service boundary without host-global RAGFlow installs.", + "resource_expectation": "Large multi-service RAG stack; generated artifacts record CPU/GPU mode, memory, disk, image size, expanded disk notes, startup time, vm.max_map_count handling, and provider boundaries before scoring.", + "retry_guidance": [ + "Run cargo make smoke-ragflow-docker first to produce a typed preflight artifact.", + "Start the live path only with ELF_RAGFLOW_SMOKE_START=1 and ELF_RAGFLOW_SMOKE_ACCEPT_RESOURCE_ENVELOPE=1.", + "Keep private corpora and operator-owned provider credentials out of this smoke; map only generated public corpus reference chunks to evidence ids." + ], + "research_depth": "D2 feasibility verdict plus XY-885 evidence-smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed resource/setup/API-key blockers.", + "Do not interpret ragflow-report.json as broad RAGFlow quality evidence unless reference chunks map to generated evidence ids." 
+ ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement RAGFlow Docker evidence-smoke adapter", + "reason": "Created as XY-885. XY-882 found a Docker boundary and reference-chunk output contract; implementation must prove a tiny ingest/query run before any quality claim." + } + }, + { + "adapter_id": "lightrag_research_gate", + "project": "LightRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-886 adds a Docker-profile context-export smoke command, and XY-900 keeps its generated retrieval fixtures scored through real_world_job_benchmark. The checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if the LightRAG API is unavailable; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in Docker service profile.", + "command": "ELF_LIGHTRAG_CONTEXT_START=1 cargo make smoke-lightrag-docker-context", + "artifact": "tmp/real-world-memory/lightrag-context/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke emits lightrag-report.json and lightrag-report.md over generated retrieval jobs. Pass or wrong_result is allowed only when returned context, references, or file paths map to generated evidence ids.", + "artifact": "tmp/real-world-memory/lightrag-context/lightrag-report.json" + }, + "capabilities": [ + { + "capability": "docker_service_setup", + "status": "blocked", + "evidence": "The opt-in compose profile records explicit LightRAG image, LLM, embedding, rerank, workspace, and Docker volume configuration without host-global installs." 
+ }, + { + "capability": "retrieved_context_export", + "status": "blocked", + "evidence": "The materializer calls /documents/texts, waits on /documents/track_status, and queries /query with only_need_context plus chunk references when the service is reachable." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The LightRAG materializer rewrites generated retrieval fixtures with adapter_response evidence only when source paths or context map to required evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not score broad graph-RAG quality, private corpora, scale, or comparative ranking claims." + } + ], + "suites": [ + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The generated smoke can exercise retrieval context/source mapping for retrieval fixtures, but the checked-in record stays blocked until a live artifact reaches query output." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "LightRAG update/delete/current-versus-historical behavior is not encoded by the context-export smoke." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "The smoke records context/source mappings, but full trace or viewer diagnostics are not mapped to benchmark scoring." + } + ], + "scenarios": [ + { + "scenario_id": "context_source_reference_mapping", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative incomplete fixture for LightRAG context/source-reference scoring. 
The job cannot score until the opt-in Docker API exports generated source file paths, snippets, or reference content.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "retrieval_quality_context_recall", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-1071 keeps LightRAG retrieval quality incomplete until the opt-in Docker API exports same-corpus context or references that can be joined to expected evidence ids; service startup alone is not a retrieval-quality result.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "citation_quality_context_references", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "LightRAG citation quality is incomplete until returned context, references.file_path, references.content, or equivalent source snippets map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "navigation_quality_graph_context", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "LightRAG graph/context navigation remains incomplete until exported context exposes source paths or graph-derived source snippets that can be followed back to same-corpus evidence.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + 
"scenario_id": "answer_faithfulness_context_refs", + "suite_id": "retrieval", + "status": "incomplete", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "LightRAG answer faithfulness is incomplete until generated answers and only_need_context output can be checked for required evidence, decoy exclusion, and source-reference alignment.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/lightrag_context_sources_incomplete.json" + }, + { + "scenario_id": "stale_source_behavior", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG stale-source replacement, invalidation, or lint behavior is not encoded by the current context-source blocker; no stale-source quality claim is made.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "knowledge_compilation_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG knowledge compilation quality is not scored because no checked-in same-corpus page, section, citation, or stale-source lint artifact exists.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "graph_rag_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "LightRAG graph-RAG navigation quality remains not_tested beyond the context-source output contract; no ELF win, tie, or loss is claimed.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG", + "status": "real" + }, 
+ { + "kind": "source", + "ref": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-lightrag-docker-context", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-materialization.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/lightrag-context/lightrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LightRAG repository", + "url": "https://github.com/HKUDS/LightRAG", + "evidence": "Official source for LightRAG server, Docker, and retrieval modes." + }, + { + "label": "LightRAG Docker docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/DockerDeployment.md", + "evidence": "Official Docker deployment reference." + }, + { + "label": "LightRAG API server docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/LightRAG-API-Server.md", + "evidence": "Official query-mode and context-output reference." + }, + { + "label": "LightRAG core programming docs", + "url": "https://github.com/HKUDS/LightRAG/blob/main/docs/ProgramingWithCore.md", + "evidence": "Official source-id and file-path citation reference." 
+ } + ], + "setup_path": "Run cargo make smoke-lightrag-docker-context for a typed preflight artifact; set ELF_LIGHTRAG_CONTEXT_START=1 to start the opt-in LightRAG Docker profile and attempt live context export.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus opt-in lightrag and lightrag-mock-provider services; generated source files and LightRAG data stay in Docker-mounted artifact paths and Docker volumes.", + "resource_expectation": "The default profile uses the official LightRAG image, a local OpenAI-compatible mock provider, 64-dimensional embeddings, rerank disabled for context queries, cargo/pip/Hugging Face caches, and Docker volumes for rag_storage, inputs, and prompts.", + "retry_guidance": [ + "Run cargo make smoke-lightrag-docker-context first; a missing API must remain a typed incomplete artifact, not a pass claim.", + "Set ELF_LIGHTRAG_CONTEXT_START=1 only when Docker may pull/start the LightRAG service profile.", + "Score retrieval only when returned context, references.file_path, or references.content map to required evidence ids." + ], + "research_depth": "D2 feasibility plus XY-886 context-export implementation and XY-900 scored smoke aggregation; checked-in record remains research_gate unless a generated artifact reaches query output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed service/setup blockers.", + "Do not interpret lightrag-report.json as broad graph-RAG quality evidence unless generated source/context mappings score as pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement LightRAG Docker context-export adapter", + "reason": "Created as XY-886. XY-882 found a Docker service path and context/source mapping contract; implementation must prove evidence export before scoring." 
+ } + }, + { + "adapter_id": "graphrag_research_gate", + "project": "GraphRAG", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-safe generated-corpus GraphRAG smoke into a scored knowledge_compilation report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed blocked artifact without model calls; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration to attempt live GraphRAG index/query.", + "command": "ELF_GRAPHRAG_SMOKE_RUN=1 cargo make smoke-graphrag-docker", + "artifact": "tmp/real-world-memory/graphrag-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphrag-report.json and graphrag-report.md from one generated knowledge_compilation job. Pass or wrong_result is allowed only when GraphRAG output tables map to generated evidence ids.", + "artifact": "tmp/real-world-memory/graphrag-smoke/graphrag-report.json" + }, + "capabilities": [ + { + "capability": "indexing_resource_envelope", + "status": "blocked", + "evidence": "The smoke bounds the generated public corpus, timeout, GraphRAG package, model configuration, cache size, output size, elapsed time, and observed cache entries." + }, + { + "capability": "source_citation_mapping", + "status": "blocked", + "evidence": "The generated artifact maps GraphRAG documents, text_units, communities, community_reports, entities, and relationships parquet rows back to real_world_job evidence ids when available." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; provider/setup limits remain blocked until live GraphRAG output maps to expected evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-navigation quality, knowledge-synthesis quality, private corpora, or large-corpus indexing." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The generated smoke can exercise parquet table source coverage for one tiny knowledge-compilation fixture, but the checked-in record stays blocked until live output exists." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "The smoke may run local search for reachability, but retrieval quality scoring is not encoded." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Resource bounds are recorded, but no production-ops suite scoring is encoded." + }, + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "GraphRAG update/delete/current-versus-historical behavior is not encoded by the smoke." + } + ], + "scenarios": [ + { + "scenario_id": "output_table_citation_mapping", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for GraphRAG output-table citation scoring. 
The job requires provider-backed Docker output tables whose document, text-unit, community, report, entity, and relationship identifiers map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "retrieval_quality_local_search", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-1071 keeps GraphRAG retrieval quality not tested because the current smoke records output-table and local-search reachability contracts but does not score same-corpus retrieval answers beyond mapped output prerequisites.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "navigation_quality_community_graph", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "GraphRAG community/entity/relationship navigation remains blocked until provider-backed output tables expose community, entity, relationship, text-unit, and document identifiers that map to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "answer_faithfulness_output_tables", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "GraphRAG answer faithfulness is blocked until summaries or local-search answers can be checked against mapped documents, text units, and community report rows while excluding unsupported or stale claims.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": 
"apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + }, + { + "scenario_id": "stale_source_behavior", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG stale-source replacement, invalidation, or lint behavior is not encoded by the current output-table blocker; no stale-source quality claim is made.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + { + "scenario_id": "graph_summary_synthesis_quality", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "GraphRAG graph-summary synthesis quality remains not_tested until provider-backed output tables and local-search context are scored beyond the smoke contract.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/microsoft/graphrag", + "status": "real" + }, + { + "kind": "source", + "ref": "https://microsoft.github.io/graphrag/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphrag-docker", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphrag-smoke/graphrag-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "GraphRAG repository", + "url": "https://github.com/microsoft/graphrag", + "evidence": "Official Microsoft GraphRAG source and setup reference." + }, + { + "label": "GraphRAG docs", + "url": "https://microsoft.github.io/graphrag/", + "evidence": "Official documentation for indexing and querying." 
+ }, + { + "label": "GraphRAG input docs", + "url": "https://microsoft.github.io/graphrag/index/inputs/", + "evidence": "Official input format and document metadata reference." + }, + { + "label": "GraphRAG output tables", + "url": "https://microsoft.github.io/graphrag/index/outputs/", + "evidence": "Official output schema with document, text unit, community, and relationship identifiers." + }, + { + "label": "GraphRAG local search docs", + "url": "https://microsoft.github.io/graphrag/query/local_search/", + "evidence": "Official local-search context and graph traversal reference." + } + ], + "setup_path": "Run cargo make smoke-graphrag-docker for a typed preflight artifact; set ELF_GRAPHRAG_SMOKE_RUN=1 with explicit provider configuration for a live GraphRAG index/query attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, generated public corpus, and report artifacts under tmp/real-world-memory/graphrag-smoke.", + "resource_expectation": "The default profile uses a generated public corpus capped by ELF_GRAPHRAG_MAX_DOCS and ELF_GRAPHRAG_MAX_INPUT_CHARS, pins GraphRAG through ELF_GRAPHRAG_PACKAGE, and records elapsed time, cache size, output size, and observed cache entries.", + "retry_guidance": [ + "Run cargo make smoke-graphrag-docker first; missing provider configuration must remain a typed blocked artifact, not a pass claim.", + "Enable ELF_GRAPHRAG_SMOKE_RUN=1 only for generated public corpus indexing with explicit provider configuration.", + "Fail typed if source document or text_unit identifiers cannot be mapped to expected evidence IDs." 
+ ], + "research_depth": "D2 feasibility plus XY-887 Docker smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches GraphRAG output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed provider/setup blockers.", + "Do not interpret graphrag-report.json as broad graph-navigation or knowledge-synthesis quality evidence unless output tables map to generated evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement GraphRAG cost-bounded Docker adapter", + "reason": "Created as XY-887. XY-882 found a Docker-bounded CLI/API path and output-table evidence handles; implementation must stay tiny and cost-recorded." + } + }, + { + "adapter_id": "graphiti_zep_research_gate", + "project": "Graphiti/Zep", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-900 promotes the Docker-contained Graphiti/Zep temporal smoke into a scored memory_evolution report while the checked-in row remains smoke-only research_gate evidence.", + "command": "cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json" + }, + "run": { + "status": "blocked", + "evidence": "The default smoke records a typed setup/runtime failure if live execution is not explicitly enabled. 
Set ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration to start Docker-local FalkorDB and run Graphiti.", + "command": "ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 cargo make smoke-graphiti-zep-docker-temporal", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "The smoke now emits graphiti-zep-report.json and graphiti-zep-report.md from one generated memory_evolution job. The default blocker is live-run opt-in disabled; when ELF_GRAPHITI_ZEP_SMOKE_START=1 and ELF_GRAPHITI_ZEP_SMOKE_RUN=1 are set without provider credentials, the blocker is provider_api_key_missing. No hosted Zep service or unrecorded credentials are used.", + "artifact": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.json" + }, + "capabilities": [ + { + "capability": "temporal_graph_memory", + "status": "blocked", + "evidence": "The smoke materializes generated current, historical, and rationale facts with validity windows, but the checked-in record stays blocked until a live artifact maps search output." + }, + { + "capability": "docker_graph_store_setup", + "status": "blocked", + "evidence": "The task uses a Docker Compose graphiti-zep profile for FalkorDB and a container-local Python venv; no host-global graph database or hosted Zep service is used." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "The generated temporal-validity fixture is scored or typed blocked; live quality evidence requires Graphiti/Zep search output mapped to current and historical evidence ids." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph-memory quality, managed Zep service behavior, private-corpus behavior, or large-corpus performance." 
+ } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Generated current/historical relation facts are encoded, but the checked-in manifest stays blocked until the Docker smoke returns validity-window search output." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Hybrid graph retrieval reachability is not scored beyond the temporal search smoke." + }, + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "The smoke records setup and provider boundaries but does not encode backup, restore, private corpus, or hosted-service operations." + } + ], + "scenarios": [ + { + "scenario_id": "temporal_validity_window_mapping", + "suite_id": "memory_evolution", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "XY-929 adds a representative blocked fixture for Graphiti/Zep temporal-validity scoring. The job remains blocked until provider-backed Docker output maps current and historical validity-window facts to generated evidence ids.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphiti_temporal_validity_blocked.json" + }, + { + "scenario_id": "hosted_zep_temporal_memory", + "suite_id": "memory_evolution", + "status": "unsupported", + "elf_position": "untested", + "comparison_outcome": "non_goal", + "evidence": "Hosted Zep service behavior is outside the Docker-local representative lane; no hosted-service result is used as ELF win/loss evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/getzep/graphiti", + "status": "real" + }, + { + "kind": "source", + "ref": "https://www.getzep.com/platform/graphiti/", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphiti-zep-docker-temporal", 
+ "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-smoke.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphiti-zep-smoke/graphiti-zep-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Graphiti repository", + "url": "https://github.com/getzep/graphiti", + "evidence": "Official open-source temporal context graph engine." + }, + { + "label": "Zep Graphiti overview", + "url": "https://www.getzep.com/platform/graphiti/", + "evidence": "Official product documentation for temporal context graph behavior." + }, + { + "label": "Graphiti quick start", + "url": "https://help.getzep.com/graphiti/getting-started/quick-start", + "evidence": "Official setup, episode ingest, and search output reference." + }, + { + "label": "Graphiti FalkorDB configuration", + "url": "https://help.getzep.com/graphiti/configuration/falkor-db-configuration", + "evidence": "Official Docker-local FalkorDB setup reference." + }, + { + "label": "Graphiti fact triples", + "url": "https://help.getzep.com/graphiti/working-with-data/adding-fact-triples", + "evidence": "Official manual fact-triple ingest contract." 
+ } + ], + "setup_path": "Run cargo make smoke-graphiti-zep-docker-temporal for a typed artifact; set ELF_GRAPHITI_ZEP_SMOKE_START=1 ELF_GRAPHITI_ZEP_SMOKE_RUN=1 with explicit provider configuration for a live attempt.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus graphiti-zep FalkorDB profile, container-local Python venv, generated public temporal facts, and report artifacts under tmp/real-world-memory/graphiti-zep-smoke.", + "resource_expectation": "Requires Docker-local FalkorDB plus LLM/embedding configuration; generated artifacts record service startup, storage size, provider boundaries, fact count, and timeout before scoring.", + "retry_guidance": [ + "Run cargo make smoke-graphiti-zep-docker-temporal first to produce a typed blocked artifact.", + "Start the live path only with ELF_GRAPHITI_ZEP_SMOKE_START=1, ELF_GRAPHITI_ZEP_SMOKE_RUN=1, and explicit provider configuration.", + "Treat missing validity windows or unmapped current/historical facts as wrong_result, not pass." + ], + "research_depth": "D2 feasibility plus XY-888 Docker temporal smoke implementation and XY-900 scored smoke promotion; checked-in record remains research_gate unless a generated artifact reaches Graphiti search output" + }, + "notes": [ + "Status class: smoke-only scored adapter path with typed live-run opt-in, provider, and setup blockers.", + "Graphiti/Zep remains the temporal-validity reference; do not claim ELF-over-Graphiti/Zep until provider-backed temporal output maps to scored evidence ids." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement Graphiti/Zep temporal graph adapter", + "reason": "Created as XY-888. XY-882 found a Docker-local graph-store path and fact/validity-window output contract for memory_evolution scoring." 
+ } + }, + { + "adapter_id": "letta_research_gate", + "project": "Letta", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "Letta is D1 reviewed as a core/archival memory reference. The contained comparison contract now has cargo make smoke-letta-core-archive-export-readback, a Docker-only benchmark-created agent export/readback materializer that must return core block JSON, archival search/readback JSON, and source ids before any scenario claim is scored.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json" + }, + "run": { + "status": "blocked", + "evidence": "The default materializer emits a typed blocked report unless a Docker-local Letta server and explicit model/provider configuration produce benchmark-owned core block export and archival readback/search output.", + "command": "ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + "result": { + "status": "blocked", + "evidence": "No Letta core block, archival fallback, stale-core, scope, provenance, or project-decision pass/win/tie/loss is claimed until the generated export/readback artifact maps required source ids.", + "artifact": "tmp/real-world-memory/letta-core-archive/report.json" + }, + "capabilities": [ + { + "capability": "core_archival_memory", + "status": "blocked", + "evidence": "ELF fixture jobs score core block attachment, scope, provenance, stale-core detection, archival fallback, and project-decision recovery separately from archival note search; Letta remains blocked until its export maps equivalent source ids." 
+ }, + { + "capability": "docker_embedding_configuration", + "status": "blocked", + "evidence": "Official Docker setup requires explicit embedding configuration before archival retrieval can be tested." + }, + { + "capability": "real_world_job_adapter", + "status": "blocked", + "evidence": "A Docker-contained materializer now exists and emits typed blocked evidence by default; live scoring still requires exported Letta core blocks, archival list/search JSON, and source-id mappings." + }, + { + "capability": "broad_letta_quality_claim", + "status": "not_encoded", + "evidence": "The materializer does not score broad Letta product quality, hosted/private state, personalization breadth, or production durability." + } + ], + "suites": [ + { + "suite_id": "personalization", + "status": "not_encoded", + "evidence": "Core memory preference application is not encoded for Letta." + }, + { + "suite_id": "project_decisions", + "status": "blocked", + "evidence": "The project-decision recovery row is represented only through the core_archival_memory export/readback materializer and remains blocked without mapped source ids." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Agent resumption through Letta memory blocks is not encoded." + }, + { + "suite_id": "core_archival_memory", + "status": "blocked", + "evidence": "A Docker-contained materializer now emits the core_archival_memory scenarios as typed blocked unless live Letta export/readback maps core block JSON, archival search/readback JSON, and source ids." + } + ], + "scenarios": [ + { + "scenario_id": "core_block_attachment_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-attachment-001 scores exact core block attachment and keeps core readback out of Qdrant-backed archival search. 
Letta remains blocked until the generated export/readback artifact maps this core block attachment source id.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_block_scope_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-scope-001 scores read_profile, shared scope, and private-owner boundaries. Letta scope behavior remains blocked until the generated export includes agent, block, visibility metadata, and source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_block_provenance_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-core-block-provenance-001 scores source_ref and audit_history readback. Letta provenance remains blocked until exported core memory includes stable source ids and audit-equivalent events.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "stale_core_detection", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-stale-core-detection-001 scores archival evidence superseding a stale core block. 
Letta stale-core comparison is blocked until core export and archival readback can be joined by source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "archival_fallback_readback", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-archival-fallback-001 scores fallback from insufficient core memory to archival note search. Letta fallback comparison is blocked until archival search output can be exported with source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + }, + { + "scenario_id": "core_archival_project_decision_recovery", + "suite_id": "core_archival_memory", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "ELF fixture core-archival-project-decision-recovery-001 scores core routing plus archival decision rationale. 
Letta project-decision recovery remains blocked until the generated export/readback artifact maps core routing plus archival rationale source ids.", + "command": "cargo make smoke-letta-core-archive-export-readback", + "artifact": "tmp/real-world-memory/letta-core-archive/summary.json" + } + ], + "evidence": [ + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/letta-core-archive-export.json", + "status": "blocked" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/letta-core-archive/summary.json", + "status": "blocked" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/guides/docker", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/api/python", + "status": "real" + }, + { + "kind": "source", + "ref": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Letta Docker docs", + "url": "https://docs.letta.com/guides/docker", + "evidence": "Official Docker setup and explicit embedding configuration boundary." + }, + { + "label": "Letta Python API", + "url": "https://docs.letta.com/api/python", + "evidence": "Official Python SDK memory block creation and retrieval examples." + }, + { + "label": "Letta archival search API", + "url": "https://docs.letta.com/api/resources/agents/subresources/passages/methods/search", + "evidence": "Official archival-memory search endpoint contract." + } + ], + "setup_path": "Run cargo make smoke-letta-core-archive-export-readback for a typed artifact; set ELF_LETTA_SMOKE_START=1 ELF_LETTA_SMOKE_RUN=1 with explicit model/provider configuration for a live export attempt. 
The smoke exports core block JSON plus archival search/readback JSON when Letta setup succeeds.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner plus optional Letta server profile, benchmark-created agent, benchmark-owned fixture corpus, no hosted/private state, and artifacts under tmp/real-world-memory/letta-core-archive.", + "resource_expectation": "Letta Docker server, Python SDK client, explicit model and embedding configuration, exported core memory, archival search output, and provider boundaries must be explicit in the artifact.", + "retry_guidance": [ + "Default command records a typed blocked artifact without model calls.", + "Enable the live path only with Docker-local Letta and explicit provider or local model configuration.", + "Score core-versus-archival scenarios only after core block export and archival list/search output map to fixture evidence ids." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); XY-927 selected the contained export/readback contract; XY-984 adds the Docker-contained materializer and keeps the comparison blocked until live export evidence maps source ids." + }, + "notes": [] + }, + { + "adapter_id": "langgraph_research_gate", + "project": "LangGraph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "LangGraph is D1 reviewed as a replay/checkpoint reference, not a direct memory backend adapter." + }, + "run": { + "status": "not_encoded", + "evidence": "No checkpoint replay real_world_job harness is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No production-ops or resume suite result is claimed." + }, + "capabilities": [ + { + "capability": "checkpoint_replay_regression", + "status": "not_encoded", + "evidence": "Replay/fork behavior needs an agent graph harness before scoring." 
+ }, + { + "capability": "standalone_memory_backend", + "status": "unsupported", + "evidence": "LangGraph persistence is an agent-state/checkpoint layer, not a drop-in memory retrieval backend." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No LangGraph benchmark materializer exists." + } + ], + "suites": [ + { + "suite_id": "production_ops", + "status": "not_encoded", + "evidence": "Checkpoint recovery and replay regression are not encoded." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume from checkpoint with memory reads is not encoded." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://docs.langchain.com/oss/python/langgraph/persistence", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "LangGraph persistence docs", + "url": "https://docs.langchain.com/oss/python/langgraph/persistence", + "evidence": "Official documentation for checkpoints, replay, fork, and persistence behavior." + } + ], + "setup_path": "Build a tiny LangGraph agent with a checkpointer and explicit memory read/write steps before scoring.", + "runtime_boundary": "Docker-only Python harness with checkpoint store under the artifact directory.", + "resource_expectation": "Small runtime expected, but LLM calls and side effects must be stubbed or deterministic before replay claims.", + "retry_guidance": [ + "Encode one replay/fork failure recovery job.", + "Keep LangGraph classified as replay reference unless memory retrieval is actually exercised." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); replay/checkpoint reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "nanograph_research_gate", + "project": "nanograph", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "nanograph is D1 reviewed as typed graph DX, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No typed graph schema/query real_world_job run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No graph temporal or retrieval-debug result is claimed." + }, + "capabilities": [ + { + "capability": "typed_graph_schema", + "status": "not_encoded", + "evidence": "Schema-as-code and typed query ergonomics need a benchmark harness." + }, + { + "capability": "memory_backend_comparison", + "status": "unsupported", + "evidence": "nanograph is a graph database reference, not a complete agent memory service." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No nanograph materializer exists." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "not_encoded", + "evidence": "Typed current/historical fact jobs are not encoded." + }, + { + "suite_id": "retrieval", + "status": "not_encoded", + "evidence": "Typed query explainability is not scored." + } + ], + "scenarios": [], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nanograph/nanograph", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "nanograph repository", + "url": "https://github.com/nanograph/nanograph", + "evidence": "Official source for on-device typed property graph behavior." 
+ } + ], + "setup_path": "Build or install nanograph inside Docker and load a typed graph fixture from generated corpus facts.", + "runtime_boundary": "Docker-only CLI run with graph folder under benchmark artifacts.", + "resource_expectation": "Light local graph runtime expected; record binary build/install time and graph artifact size.", + "retry_guidance": [ + "Define a minimal schema for memory_evolution facts.", + "Score typed query output only if it cites fixture evidence IDs." + ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); typed graph DX reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "llm_wiki_research_gate", + "project": "llm-wiki", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "llm-wiki is D1 reviewed as a knowledge-compilation reference, but no plugin or generated-page adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No llm-wiki corpus-to-page run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge page citation or lint result is claimed." + }, + "capabilities": [ + { + "capability": "knowledge_page_compilation", + "status": "not_encoded", + "evidence": "Wiki generation and citation lint are not executed by the runner." + }, + { + "capability": "live_service_runtime", + "status": "unsupported", + "evidence": "llm-wiki is a plugin/workflow reference rather than a service adapter." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No page materializer or scorer mapping exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Corpus-to-wiki output is not encoded." 
+ }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from wiki pages are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "wiki_page_citation_lint", + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "llm-wiki remains a knowledge-workflow reference. No Docker-contained plugin or file-based page materializer emits cited wiki sections for scoring.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/nvk/llm-wiki", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "llm-wiki repository", + "url": "https://github.com/nvk/llm-wiki", + "evidence": "Official source for the LLM Wiki plugin and knowledge-base workflow." + } + ], + "setup_path": "Research plugin bootstrap inside a Docker-contained Codex or file-based harness, then materialize page artifacts.", + "runtime_boundary": "Docker-only plugin or fixture materializer; no user-global Codex plugin install.", + "resource_expectation": "LLM generation cost depends on page build; record provider boundary and generated artifact size.", + "retry_guidance": [ + "Prototype a fixture-only page build with explicit citations.", + "Do not score until generated sections can be mapped to evidence IDs." 
+ ], + "research_depth": "D1 feasibility verdict: research_only (XY-882); derived wiki workflow reference, adapter not encoded" + }, + "notes": [] + }, + { + "adapter_id": "gbrain_research_gate", + "project": "gbrain", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "not_encoded", + "setup": { + "status": "not_encoded", + "evidence": "gbrain is D1 reviewed as a compiled-truth and timeline reference, but no Docker adapter is implemented." + }, + "run": { + "status": "not_encoded", + "evidence": "No gbrain brain-repo import or compiled-truth run is encoded." + }, + "result": { + "status": "not_encoded", + "evidence": "No knowledge-synthesis or operator-continuity result is claimed." + }, + "capabilities": [ + { + "capability": "compiled_truth_timeline", + "status": "not_encoded", + "evidence": "Compiled truth plus timeline output is a reference pattern but not scored." + }, + { + "capability": "postgres_backed_brain_repo", + "status": "blocked", + "evidence": "A Docker-local brain repo and Postgres setup path must be proven before execution." + }, + { + "capability": "real_world_job_adapter", + "status": "not_encoded", + "evidence": "No gbrain materializer exists." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "not_encoded", + "evidence": "Compiled truth and timeline pages are not scored." + }, + { + "suite_id": "operator_debugging_ux", + "status": "not_encoded", + "evidence": "Operator continuity through brain pages is not encoded." 
+ } + ], + "scenarios": [ + { + "scenario_id": "compiled_truth_timeline_export", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "gbrain compiled-truth and timeline scoring remains blocked until a Docker-local brain repository and database setup emits current-truth pages with source timeline evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain", + "status": "real" + }, + { + "kind": "source", + "ref": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "status": "real" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "gbrain repository", + "url": "https://github.com/garrytan/gbrain", + "evidence": "Official source for brain repo and retrieval workflow." + }, + { + "label": "compiled truth guide", + "url": "https://github.com/garrytan/gbrain/blob/master/docs/guides/compiled-truth.md", + "evidence": "Official guide for compiled truth plus timeline behavior." + } + ], + "setup_path": "Create a Docker-local brain repo fixture, run import/sync, and export compiled truth plus timeline evidence.", + "runtime_boundary": "Docker-only repository and database state with no operator-owned brain repo.", + "resource_expectation": "Postgres-backed sync and embedding choices must be explicit; record DB size and import time.", + "retry_guidance": [ + "Prototype a tiny brain repo with one current-truth page and timeline.", + "Score only if compiled truth cites the source timeline evidence." 
+ ], + "research_depth": "D1 feasibility verdict: blocked (XY-882); Docker-local brain repo and database path not proven" + }, + "notes": [] + }, + { + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_real_world_job", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "XY-900 validation reached the Docker-only graph/report smoke setup inside the baseline runner without host-global assistant hooks.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "The smoke installed graphify in a container-local venv, ran over a generated public corpus, and produced graph/report/query output for scoring.", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "The smoke emits graphify-report.json and graphify-report.md from one generated knowledge_compilation job. The current scored report maps evidence ids but remains wrong_result because the scoring rubric still records a wrong-result signal.", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [ + { + "capability": "docker_cli_boundary", + "status": "pass", + "evidence": "The smoke uses docker-compose.baseline.yml baseline-runner, a container-local Python venv, and isolated assistant config paths; it does not install host-global assistant hooks." + }, + { + "capability": "graph_report_generation", + "status": "pass", + "evidence": "The smoke captures graphify-out/graph.json, GRAPH_REPORT.md, cache metadata, command logs, build time, graph size, and report size." 
+ }, + { + "capability": "real_world_job_adapter", + "status": "wrong_result", + "evidence": "The smoke writes a generated real_world_job fixture and scored report; current knowledge_compilation scoring is wrong_result, not pass." + }, + { + "capability": "multimodal_code_graph", + "status": "not_encoded", + "evidence": "Multimodal extraction for videos, images, PDFs, or broad codebase understanding is a reference capability but not scored by this smoke." + }, + { + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "The smoke does not claim broad graph quality, private corpus behavior, scale, or authoritative memory-store behavior." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "The generated smoke exercised graph/report evidence mapping for one generated knowledge-compilation fixture and scored wrong_result with mean_score 0.75." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Graph-guided query output is present only as support for the generated knowledge_compilation smoke; broad retrieval quality scoring remains unclaimed." + }, + { + "suite_id": "work_resume", + "status": "not_encoded", + "evidence": "Resume answers from graph context are not encoded." + } + ], + "scenarios": [ + { + "scenario_id": "graph_report_navigation_lint", + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "XY-929 adds a representative graphify fixture that scores graph report navigation, source-location citations, stale-source lint, and unsupported-summary handling as wrong_result because stale-source lint is still missing. 
This remains graphify non-pass evidence, not an ELF victory claim.", + "command": "cargo make real-world-memory-graph-rag", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphify_graph_report_wrong_result.json" + }, + { + "scenario_id": "broad_graph_navigation_quality", + "suite_id": "retrieval", + "status": "not_encoded", + "elf_position": "untested", + "comparison_outcome": "not_tested", + "evidence": "Broad graph-navigation, codebase, multimodal, and private-corpus quality remain not_tested; the graphify evidence is bounded to generated graph/report artifacts.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/safishamsi/graphify", + "status": "real" + }, + { + "kind": "command", + "ref": "cargo make smoke-graphify-docker-graph-report", + "status": "wrong_result" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json", + "status": "pass" + }, + { + "kind": "artifact", + "ref": "tmp/real-world-memory/graphify-smoke/graphify-report.md", + "status": "wrong_result" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "graphify repository", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Official source for graphify graph extraction and query workflow." + }, + { + "label": "graphify README", + "url": "https://github.com/safishamsi/graphify/blob/v3/README.md", + "evidence": "Official CLI, output artifact, query, and source-location contract." 
+ } + ], + "setup_path": "Run cargo make smoke-graphify-docker-graph-report to install graphify in Docker, build graph/report artifacts from a generated public corpus, and export query evidence without installing host-global assistant hooks.", + "runtime_boundary": "docker-compose.baseline.yml baseline-runner, container-local Python venv, isolated HOME/config paths, generated public corpus, and artifacts under tmp/real-world-memory/graphify-smoke.", + "resource_expectation": "Graph build cost scales with corpus and model choices; generated artifacts record package reference, provider/model boundary, build time, graph size, report size, cache size, timeout, and retry behavior.", + "retry_guidance": [ + "Run cargo make smoke-graphify-docker-graph-report first; setup/runtime failures must remain typed artifacts, not pass claims.", + "Do not use graphify host assistant hook installs or operator-owned assistant configuration as proof.", + "Score graph-guided answers only when graph.json, GRAPH_REPORT.md, and graphify query output map to generated evidence ids." + ], + "research_depth": "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + }, + "notes": [ + "Status class: live Docker scored smoke with a current wrong_result outcome.", + "Do not interpret graphify-report.json as broad graph-navigation or knowledge-compilation quality evidence; the tiny smoke is scored and currently non-pass." + ], + "follow_up": { + "title": "[ELF benchmark adapter] Implement graphify Docker graph-report adapter", + "reason": "Created as XY-889. XY-882 found a Docker-only CLI/materializer path and source-file/source-location output contract." 
+ } + }, + { + "adapter_id": "vectifyai_pageindex_same_corpus_blocker", + "project": "VectifyAI PageIndex", + "adapter_kind": "typed_same_corpus_setup_blocker", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-1068 records a same-corpus PageIndex blocker: no contained PageIndex product installation, PageIndex MCP readback, or emitted tree artifact maps node paths back to ELF Source Library source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json" + }, + "run": { + "status": "blocked", + "evidence": "The checked-in blocker uses the same source-library long-document corpus ids as ELF but does not run PageIndex product runtime or emit PageIndex tree_nodes.json, cited node paths, traversal output, or MCP readback.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "tmp/real-world-memory/pageindex-openkb/report.json" + }, + "result": { + "status": "blocked", + "evidence": "No PageIndex parity, win, tie, loss, or comparable pass claim is allowed until a contained PageIndex run emits source-id-mapped tree artifacts and setup/runtime metadata.", + "artifact": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md" + }, + "capabilities": [ + { + "capability": "long_document_tree_retrieval", + "status": "blocked", + "evidence": "PageIndex remains the vectorless long-document tree retrieval reference, but no contained tree retrieval product output is checked in." + }, + { + "capability": "pageindex_mcp_readback", + "status": "blocked", + "evidence": "No PageIndex MCP readback artifact maps to ELF source ids." 
+ }, + { + "capability": "source_id_mapping", + "status": "blocked", + "evidence": "A runnable adapter must emit tree nodes, cited node paths, traversal output, source ids, and setup/runtime metadata before scoring." + } + ], + "suites": [ + { + "suite_id": "source_library", + "status": "blocked", + "evidence": "The PageIndex blocker compares against the same ELF Source Library long-document corpus ids, but PageIndex product artifacts are missing." + } + ], + "scenarios": [ + { + "scenario_id": "pageindex_long_document_tree_blocked", + "suite_id": "source_library", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "PageIndex remains blocked until tree artifacts and MCP readback map back to same-corpus source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/VectifyAI/PageIndex", + "status": "real" + }, + { + "kind": "fixture", + "ref": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json", + "status": "blocked" + }, + { + "kind": "report", + "ref": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "PageIndex repository", + "url": "https://github.com/VectifyAI/PageIndex", + "evidence": "Repository provenance for the tracked PageIndex product row." 
+ } + ], + "setup_path": "Resolve the XY-1068 setup blocker by running PageIndex in a contained runtime over the same Source Library corpus.", + "runtime_boundary": "Future evidence must run in Docker or another contained product-runtime boundary and emit PageIndex tree artifacts, MCP readback, source ids, and runtime metadata.", + "resource_expectation": "Unknown until PageIndex product runtime is materialized for the same corpus.", + "retry_guidance": [ + "Run cargo make real-world-memory-pageindex-openkb to regenerate the typed blocker report.", + "Do not claim PageIndex parity, win, tie, or loss from ELF Source Library fixture evidence.", + "Score PageIndex only after emitted tree artifacts and MCP readback map to benchmark source ids." + ], + "research_depth": "Typed same-corpus setup blocker from XY-1068" + }, + "notes": [ + "This row is intentionally non-comparable and preserves PageIndex as a blocked tracked product row." + ], + "follow_up": { + "title": "Run PageIndex same-corpus long-document tree adapter", + "reason": "The fair comparison needs PageIndex tree nodes and traversal output over the source-library long-document corpus with source ids mapped to benchmark evidence ids." 
+ } + }, + { + "adapter_id": "vectifyai_openkb_same_corpus_blocker", + "project": "VectifyAI OpenKB", + "adapter_kind": "typed_same_corpus_setup_blocker", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "XY-1068 records a same-corpus OpenKB blocker: no contained OpenKB product run, generated wiki page export, entity/concept index export, saved exploration state, lint output, or watch/recompile trace maps back to ELF Knowledge Workspace source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json" + }, + "run": { + "status": "blocked", + "evidence": "The checked-in blocker uses the same knowledge corpus ids as ELF but does not run OpenKB product runtime or emit wiki, lint, saved exploration, or watch/recompile artifacts.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "tmp/real-world-memory/pageindex-openkb/report.json" + }, + "result": { + "status": "blocked", + "evidence": "No OpenKB parity, win, tie, loss, or comparable pass claim is allowed until a contained OpenKB run emits source-id-mapped wiki/index/lint/watch artifacts and setup/runtime metadata.", + "artifact": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md" + }, + "capabilities": [ + { + "capability": "compiled_wiki_export", + "status": "blocked", + "evidence": "OpenKB remains the compiled wiki/export reference, but no contained wiki export is checked in." + }, + { + "capability": "concept_entity_index_lint_watch", + "status": "blocked", + "evidence": "No entity/concept index export, lint output, saved exploration state, or watch/recompile trace maps to ELF Knowledge Workspace source ids." 
+ }, + { + "capability": "source_id_mapping", + "status": "blocked", + "evidence": "A runnable adapter must emit wiki pages, concept/entity indexes, lint output, saved exploration state, watch/recompile trace, source ids, and setup/runtime metadata before scoring." + } + ], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "blocked", + "evidence": "The OpenKB blocker compares against the same ELF Knowledge Workspace corpus ids, but OpenKB product artifacts are missing." + } + ], + "scenarios": [ + { + "scenario_id": "openkb_wiki_recompile_blocked", + "suite_id": "knowledge_compilation", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "OpenKB remains blocked until wiki, lint, saved exploration, and watch/recompile artifacts map back to same-corpus source ids.", + "command": "cargo make real-world-memory-pageindex-openkb", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/VectifyAI/OpenKB", + "status": "real" + }, + { + "kind": "fixture", + "ref": "apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json", + "status": "blocked" + }, + { + "kind": "report", + "ref": "docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "OpenKB repository", + "url": "https://github.com/VectifyAI/OpenKB", + "evidence": "Repository provenance for the tracked OpenKB product row." 
+ } + ], + "setup_path": "Resolve the XY-1068 setup blocker by running OpenKB in a contained runtime over the same Knowledge Workspace corpus.", + "runtime_boundary": "Future evidence must run in Docker or another contained product-runtime boundary and emit OpenKB wiki, index, lint, watch/recompile, source id, and runtime metadata artifacts.", + "resource_expectation": "Unknown until OpenKB product runtime is materialized for the same corpus.", + "retry_guidance": [ + "Run cargo make real-world-memory-pageindex-openkb to regenerate the typed blocker report.", + "Do not claim OpenKB parity, win, tie, or loss from ELF Knowledge Workspace fixture evidence.", + "Score OpenKB only after emitted wiki/index/lint/watch artifacts map to benchmark source ids." + ], + "research_depth": "Typed same-corpus setup blocker from XY-1068" + }, + "notes": [ + "This row is intentionally non-comparable and preserves OpenKB as a blocked tracked product row." + ], + "follow_up": { + "title": "Run OpenKB same-corpus wiki and watch/recompile adapter", + "reason": "The fair comparison needs OpenKB wiki/entity/concept outputs and watch/recompile artifacts over the same knowledge corpus with source ids mapped to benchmark evidence ids." 
+ } + }, + { + "adapter_id": "plastic_labs_honcho_research_gate", + "project": "plastic-labs Honcho", + "adapter_kind": "research_gate", + "evidence_class": "research_gate", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "blocked", + "setup": { + "status": "blocked", + "evidence": "No Docker-contained Honcho product-runtime adapter, benchmark fixture, source-id mapping, held-out split, leakage audit, or container digest is checked in for this scoreboard.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "run": { + "status": "blocked", + "evidence": "Honcho is tracked as a requested public comparison row, but no same-task product surface has been materialized against the ELF real_world_job corpus.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "result": { + "status": "blocked", + "evidence": "No Honcho parity, win, tie, loss, or comparable pass claim is allowed until product-runtime evidence maps returned context to benchmark source ids.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + }, + "capabilities": [ + { + "capability": "stateful_agent_memory_runtime", + "status": "blocked", + "evidence": "Honcho is a requested memory/runtime comparison target, but no contained benchmark runtime evidence is checked in." + }, + { + "capability": "source_id_mapping", + "status": "blocked", + "evidence": "No Honcho output maps returned context to ELF benchmark source ids." + }, + { + "capability": "container_digest_evidence", + "status": "blocked", + "evidence": "No Honcho container image digest or runtime metadata is recorded for the public scoreboard." + } + ], + "suites": [ + { + "suite_id": "memory_evolution", + "status": "blocked", + "evidence": "Honcho memory-quality behavior is not scored until benchmark runtime output exists." 
+ }, + { + "suite_id": "work_resume", + "status": "blocked", + "evidence": "Honcho work-continuity or resume behavior is not scored until benchmark runtime output exists." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "Honcho retrieval/context output is not scored until returned context maps to expected evidence ids." + } + ], + "scenarios": [ + { + "scenario_id": "honcho_product_runtime_blocked", + "suite_id": "retrieval", + "status": "blocked", + "elf_position": "untested", + "comparison_outcome": "blocked", + "evidence": "Honcho remains blocked until a contained product-runtime adapter emits benchmark retrieved context, source ids, latency/cost/resource metadata, and container digest evidence.", + "artifact": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json" + } + ], + "evidence": [ + { + "kind": "source", + "ref": "https://github.com/plastic-labs/honcho", + "status": "real" + }, + { + "kind": "source", + "ref": "https://honcho.dev/docs/v3/documentation/introduction/vibecoding", + "status": "real" + }, + { + "kind": "manifest", + "ref": "apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json", + "status": "blocked" + } + ], + "execution_metadata": { + "sources": [ + { + "label": "Honcho repository", + "url": "https://github.com/plastic-labs/honcho", + "evidence": "Repository provenance for the tracked Honcho product row." + }, + { + "label": "Honcho documentation", + "url": "https://honcho.dev/docs/v3/documentation/introduction/vibecoding", + "evidence": "Documentation provenance for Honcho as a stateful agent-memory/runtime comparison target." 
+ } + ], + "setup_path": "Create a Docker-contained Honcho adapter over the real_world_job corpus before scoring.", + "runtime_boundary": "Future evidence must run in Docker or another contained product-runtime boundary and emit Honcho context/memory output, source ids, latency/cost/resource metadata, and container digest evidence.", + "resource_expectation": "Unknown until Honcho product runtime is materialized for the benchmark corpus.", + "retry_guidance": [ + "Do not claim Honcho parity, win, tie, or loss from the research-gate row.", + "Score Honcho only after product-runtime output maps to benchmark source ids.", + "Record held-out, leakage-audit, and container digest metadata before comparability." + ], + "research_depth": "D0 tracked product row with typed runtime blocker" + }, + "notes": [ + "This row is intentionally non-comparable and records the user-requested Honcho comparison target as a typed blocker." + ], + "follow_up": { + "title": "Run Honcho product-runtime adapter", + "reason": "The fair comparison needs Honcho runtime context output, source-id mapping, held-out/leakage evidence, and container digest metadata over the ELF real_world_job corpus." + } + } + ] + }, + "capture_integration": { + "real": [ + "ELF live add_note capture can persist public evidence with source ids and skip excluded evidence ids through the Docker live adapter.", + "ELF live add_note capture stores source_id values in source_ref and returns evidence-bound notes through search_raw.", + "ELF live add_note capture applies write_policy redactions before storage and records write-policy audit counts in materialization artifacts.", + "The runner validates fixture evidence ids, required evidence links, and inline quote substrings." 
+ ], + "fixture_backed": [ + "The adversarial fixture encodes one public source, one write-policy audit, and one excluded private span as a privacy_leak trap.", + "The fixture encodes public capture, write-policy audit evidence, and a private excluded span as a negative trap.", + "Linear issue status, GitHub PR review summary, command transcript, and capture exclusion timeline are encoded as checked-in fixture text.", + "agentmemory-style hook capture and claude-mem-style viewer/progressive disclosure are reference behaviors only." + ], + "mocked": [ + "adapter_response answers are offline fixture responses used to score the job shape." + ], + "blocked": [ + "agentmemory hook breadth remains blocked until a durable local session/capture adapter replaces the in-memory mock.", + "claude-mem hook and viewer capture remain not encoded because the Docker baseline does not execute hooks, timeline, or viewer workflows.", + "agentmemory host-global capture hooks are not installed; durable capture breadth remains blocked until a Docker-local session path exists.", + "claude-mem hook/viewer capture breadth remains not encoded in the Docker baseline.", + "agentmemory hook breadth remains blocked by the current in-memory storage adapter.", + "claude-mem hook capture remains not encoded because hooks, timeline, observations, and viewer workflows are not executed by the Docker baseline.", + "Live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters require separate credentials or runtime wiring." 
+ ], + "not_encoded": [ + "This fixture does not claim live browser, Slack, or credentialed capture coverage.", + "Host-global capture hooks, Slack/browser capture, and credentialed tool capture are outside this Docker-scoped benchmark.", + "Host-global automatic capture hooks are intentionally not installed by this benchmark.", + "No live external hook ingestion, viewer session readback, Slack capture, browser capture, or credentialed tool execution is encoded in this suite." + ], + "notes": [ + "The private excluded span is synthetic and must remain absent from the produced answer and evidence ids.", + "Live ELF scoring must not store or retrieve the private-excluded-text evidence id.", + "This job is a source-id and evidence-binding check, not a host-global hook installation.", + "The synthetic private token label is a negative trap and must not appear in live generated answers.", + "The fixture is intended to test capture boundaries before implementing live adapters." + ] + }, + "summary": { + "job_count": 82, + "encoded_suite_count": 19, + "pass": 75, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 7, + "not_encoded": 0, + "unsupported_claim": 0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 11, + "update_rationale_available_count": 16, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 4, + "expected_evidence_total": 172, + "expected_evidence_matched": 172, + "expected_evidence_recall": 1.0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 5, + "wrong_result_stage_attribution_count": 0, + "mean_score": 0.915, + "mean_latency_ms": 2.885, + "total_cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "evidence_required_count": 180, + "evidence_covered_count": 180, + "evidence_coverage": 1.0, + "source_ref_required_count": 180, + 
"source_ref_covered_count": 180, + "source_ref_coverage": 1.0, + "quote_required_count": 180, + "quote_covered_count": 180, + "quote_coverage": 1.0, + "stale_retrieval_count": 0, + "scope_check_count": 3, + "scope_correct_count": 3, + "scope_correctness": 1.0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case_count": 3, + "qdrant_rebuild_pass_count": 3, + "operator_debug_job_count": 2, + "raw_sql_needed_count": 0, + "trace_incomplete_count": 0, + "operator_ux_gap_count": 0, + "consolidation": { + "proposal_count": 5, + "proposal_usefulness": 0.91, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 1, + "executable_gap_count": 0 + }, + "memory_summary": { + "job_count": 1, + "summary_count": 1, + "entry_count": 7, + "required_category_count": 6, + "covered_required_category_count": 6, + "missing_required_category_count": 0, + "top_of_mind_count": 1, + "background_count": 1, + "stale_count": 1, + "superseded_count": 1, + "tombstone_count": 1, + "derived_project_profile_count": 2, + "source_ref_required_count": 6, + "source_ref_entry_count": 6, + "source_ref_coverage": 1.0, + "freshness_marker_count": 7, + "freshness_coverage": 1.0, + "rationale_count": 7, + "rationale_coverage": 1.0, + "invalid_top_of_mind_count": 0, + "untraced_entry_count": 0, + "derived_with_source_or_unsupported_count": 2, + "derived_missing_source_or_unsupported_count": 0, + "unsupported_derived_entry_count": 1, + "unsupported_current_entry_count": 0, + "tombstone_ref_count": 1, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 1, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 1 + }, + "proactive_brief": { + "job_count": 4, + "brief_count": 4, + "suggestion_count": 5, + "required_suggestion_kind_count": 4, + "covered_required_suggestion_kind_count": 4, + "missing_required_suggestion_kind_count": 0, + 
"evidence_ref_required_count": 5, + "evidence_ref_suggestion_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 5, + "freshness_coverage": 1.0, + "action_rationale_count": 5, + "action_rationale_coverage": 1.0, + "recommended_count": 2, + "deferred_count": 2, + "rejected_count": 1, + "current_suggestion_count": 2, + "non_current_suggestion_count": 3, + "stale_warning_count": 3, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "scheduled_memory": { + "job_count": 4, + "task_run_count": 4, + "output_count": 5, + "required_task_kind_count": 4, + "covered_required_task_kind_count": 4, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 5, + "evidence_ref_output_count": 5, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 5, + "freshness_coverage": 1.0, + "action_rationale_count": 5, + "action_rationale_coverage": 1.0, + "trace_required_count": 4, + "trace_complete_count": 4, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 2, + "non_current_output_count": 3, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 7, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 2, + "source_trace_superseded_count": 3, + "source_trace_tombstone_count": 1 + }, + "work_continuity": { + "job_count": 8, + "readback_count": 8, + "entry_count": 8, + "reset_resume_required_count": 1, + "reset_resume_success_count": 1, + "reset_resume_success_rate": 1.0, + "decision_rationale_required_count": 1, + "decision_rationale_recalled_count": 1, + "decision_rationale_recall_rate": 1.0, + 
"rejected_option_required_count": 1, + "rejected_option_suppressed_count": 1, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 1.0, + "explicit_next_step_required_count": 1, + "explicit_next_step_returned_count": 1, + "explicit_next_step_correct_count": 1, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 1, + "inferred_next_step_labeled_count": 1, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 1.0, + "handoff_source_ref_required_count": 1, + "handoff_source_ref_covered_count": 1, + "handoff_source_ref_coverage": 1.0, + "redaction_required_count": 1, + "redaction_applied_count": 1, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 1.0, + "janitor_candidate_count": 1, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "knowledge": { + "job_count": 3, + "page_count": 5, + "section_count": 13, + "backlink_count": 11, + "pages_with_backlinks": 5, + "pages_with_version_diff": 1, + "citation_coverage": 0.923, + "stale_claim_detection": 1.0, + "rebuild_determinism": 1.0, + "backlink_coverage": 1.0, + "version_diff_coverage": 0.2, + "page_usefulness": 0.979, + "unsupported_summary_count": 1, + "untraced_section_count": 0, + "allowed_variance_count": 1 + } + }, + "suites": [ + { + "suite_id": "trust_source_of_truth", + "status": "pass", + "encoded_job_count": 1, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 1 encoded job(s) passed." 
+ }, + { + "suite_id": "work_resume", + "status": "pass", + "encoded_job_count": 6, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 6 encoded job(s) passed." + }, + { + "suite_id": "project_decisions", + "status": "pass", + "encoded_job_count": 5, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 2, + "update_rationale_available_count": 5, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 5 encoded job(s) passed." + }, + { + "suite_id": "retrieval", + "status": "pass", + "encoded_job_count": 5, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 5 encoded job(s) passed." + }, + { + "suite_id": "memory_evolution", + "status": "pass", + "encoded_job_count": 8, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 7, + "update_rationale_available_count": 8, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 3, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 8 encoded job(s) passed." 
+ }, + { + "suite_id": "adversarial_quality", + "status": "pass", + "encoded_job_count": 5, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 2, + "update_rationale_available_count": 3, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 1, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 5 encoded job(s) passed." + }, + { + "suite_id": "consolidation", + "status": "pass", + "encoded_job_count": 5, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 1, + "reason": "All 5 encoded job(s) passed." + }, + { + "suite_id": "memory_summary", + "status": "pass", + "encoded_job_count": 1, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 1 encoded job(s) passed." 
+ }, + { + "suite_id": "proactive_brief", + "status": "blocked", + "encoded_job_count": 5, + "score_mean": 0.8, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "At least one encoded job is blocked." + }, + { + "suite_id": "scheduled_memory", + "status": "blocked", + "encoded_job_count": 5, + "score_mean": 0.8, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "At least one encoded job is blocked." + }, + { + "suite_id": "knowledge_compilation", + "status": "pass", + "encoded_job_count": 3, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 3 encoded job(s) passed." 
+ }, + { + "suite_id": "source_library", + "status": "pass", + "encoded_job_count": 2, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 2 encoded job(s) passed." + }, + { + "suite_id": "operator_debugging_ux", + "status": "pass", + "encoded_job_count": 1, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 1, + "reason": "All 1 encoded job(s) passed." + }, + { + "suite_id": "capture_integration", + "status": "pass", + "encoded_job_count": 4, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 4 encoded job(s) passed." 
+ }, + { + "suite_id": "work_continuity", + "status": "pass", + "encoded_job_count": 8, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 8 encoded job(s) passed." + }, + { + "suite_id": "production_ops", + "status": "blocked", + "encoded_job_count": 8, + "score_mean": 0.75, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "At least one encoded job is blocked." + }, + { + "suite_id": "personalization", + "status": "pass", + "encoded_job_count": 1, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 1 encoded job(s) passed." 
+ }, + { + "suite_id": "core_archival_memory", + "status": "pass", + "encoded_job_count": 6, + "score_mean": 1.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 0, + "reason": "All 6 encoded job(s) passed." + }, + { + "suite_id": "context_trajectory", + "status": "blocked", + "encoded_job_count": 3, + "score_mean": 0.0, + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available_count": 0, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 0, + "expected_evidence_recall": 1.0, + "irrelevant_context_ratio": 0.0, + "trace_explainability_count": 3, + "reason": "At least one encoded job is blocked." + } + ], + "jobs": [ + { + "suite_id": "adversarial_quality", + "job_id": "adversarial-quality-conflicting-source-authority-001", + "title": "Prefer authoritative current source over conflicting stale source", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "current-provider-report", + "claim_id": "current_provider_dimension", + "requirement": "cite" + }, + { + "evidence_id": "authority-ordering-rule", + "claim_id": "authority_rationale", + "requirement": "explain" + } + ], + "produced_answer": "Use the authoritative current report: provider-backed ELF evidence uses Qwen3-Embedding-8B with 4096-dimensional embeddings. 
The older 1536-dimensional note is historical because the authority rule says to prefer the latest checked-in report with source refs.", + "produced_evidence": [ + "authority-ordering-rule", + "current-provider-report", + "old-provider-note" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "current-provider-report" + ], + "historical_evidence": [ + "old-provider-note" + ], + 
"tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "current-provider-report" + ], + "selected_historical_evidence": [ + "old-provider-note" + ], + "selected_rationale_evidence": [ + "authority-ordering-rule" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "current-provider-report", + "old-provider-note", + "authority-ordering-rule" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "adversarial_quality", + "job_id": "adversarial-quality-correction-persistence-001", + "title": "Keep a correction persistent across rollback readback", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scoreboard-correction-event", + "claim_id": "correction_persisted", + "requirement": "cite" + }, + { + "evidence_id": "scoreboard-rollback-readback", + "claim_id": "rollback_restored_scoreboard_rule", + "requirement": "cite" + }, + { + "evidence_id": "current-scoreboard-rule", + "claim_id": "current_scoreboard_rule", + "requirement": "use" + } + ], + "produced_answer": "Use the current corrected rule: reports must show typed non-pass states and must not collapse wrong_result, incomplete, blocked, not_tested, not_encoded, unsupported_claim, fixture_backed, live_baseline, or research_gate evidence into wins. 
The win-by-majority memory was superseded, and rollback readback restored the corrected scoreboard rule.", + "produced_evidence": [ + "current-scoreboard-rule", + "obsolete-scoreboard-memory", + "scoreboard-correction-event", + "scoreboard-rollback-readback" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.05 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "current-scoreboard-rule" + ], + "historical_evidence": [ + 
"obsolete-scoreboard-memory" + ], + "tombstone_evidence": [ + "scoreboard-correction-event" + ], + "invalidation_evidence": [ + "scoreboard-correction-event", + "scoreboard-rollback-readback" + ], + "selected_current_evidence": [ + "current-scoreboard-rule" + ], + "selected_historical_evidence": [ + "obsolete-scoreboard-memory" + ], + "selected_rationale_evidence": [ + "scoreboard-correction-event", + "scoreboard-rollback-readback" + ], + "selected_tombstone_evidence": [ + "scoreboard-correction-event" + ], + "selected_invalidation_evidence": [ + "scoreboard-correction-event", + "scoreboard-rollback-readback" + ], + "conflict_candidate_evidence": [ + "current-scoreboard-rule", + "obsolete-scoreboard-memory", + "scoreboard-correction-event" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "history_event_types": [ + "supersede", + "restore", + "correction" + ], + "history_requires_note_version_links": true + } + }, + { + "suite_id": "adversarial_quality", + "job_id": "adversarial-quality-private-excluded-span-001", + "title": "Use public evidence while excluding private spans", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "public-quality-rule", + "claim_id": "public_quality_rule", + "requirement": "cite" + }, + { + "evidence_id": "private-exclusion-audit", + "claim_id": "private_span_excluded", + "requirement": "explain" + } + ], + "produced_answer": "Remember the public quality rule: 
benchmark reports must show typed non-pass states before any claim summary. The write-policy audit says the private span was excluded before evidence binding, so it must not be repeated.", + "produced_evidence": [ + "private-exclusion-audit", + "public-quality-rule" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "adversarial_quality", + "job_id": "adversarial-quality-stale-fact-current-answer-001", + "title": "Suppress a stale memory when a newer fact supersedes it", + "status": "pass", + 
"operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "current-ops-runbook-v2", + "claim_id": "current_gate_sequence", + "requirement": "cite" + }, + { + "evidence_id": "runbook-change-rationale", + "claim_id": "stale_gate_rationale", + "requirement": "explain" + } + ], + "produced_answer": "Use the current runbook: before review handoff run cargo make fmt, cargo make lint-fix, and then cargo make checks. The older cargo make check-only memory is historical because the Decodex gate was expanded.", + "produced_evidence": [ + "current-ops-runbook-v2", + "runbook-change-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + 
"evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "current-ops-runbook-v2" + ], + "historical_evidence": [ + "stale-ops-runbook-v1" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "current-ops-runbook-v2" + ], + "selected_historical_evidence": [], + "selected_rationale_evidence": [ + "runbook-change-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "adversarial_quality", + "job_id": "adversarial-quality-unsupported-claim-refusal-001", + "title": "Refuse an unsupported quality claim", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": true, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "quality-boundary-evidence", + "claim_id": "unsupported_quality_claim_refused", + "requirement": "cite" + } + ], + "produced_answer": "I cannot claim private-corpus production quality, hosted product parity, or broad competitor superiority from this fixture. 
The evidence supports only encoded fixture behavior.", + "produced_evidence": [ + "quality-boundary-evidence" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "capture_integration", + "job_id": "capture-redaction-exclusion-001", + "title": "Capture a public decision while excluding private text", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 
1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "public-captured-decision", + "claim_id": "public_requirement_remembered", + "requirement": "cite" + }, + { + "evidence_id": "write-policy-audit", + "claim_id": "private_span_excluded", + "requirement": "explain" + } + ], + "produced_answer": "Remember the public benchmark-report requirement: reports should include evidence coverage, unsupported claim count, stale retrieval count, scope correctness, and redaction leak count. The write-policy audit says private spans were excluded before evidence binding, so the private span must not be repeated.", + "produced_evidence": [ + "public-captured-decision", + "write-policy-audit" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + 
"quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "capture_integration", + "job_id": "capture-source-id-binding-001", + "title": "Preserve source ids and evidence binding during live capture", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "source-id-release-summary", + "claim_id": "source_id_bound", + "requirement": "cite" + }, + { + "evidence_id": "source-id-command-log", + "claim_id": "docker_no_host_hooks", + "requirement": "cite" + } + ], + "produced_answer": "The release-summary requirement is bound to source id capture:issue-comment-42, and source ids remained attached to evidence-bound notes. 
The command-log evidence says the benchmark ran inside Docker and did not require host-global hooks.", + "produced_evidence": [ + "source-id-command-log", + "source-id-release-summary" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "capture_integration", + "job_id": "capture-write-policy-redaction-001", + "title": "Apply write-policy redaction before live capture storage", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + 
"requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "redacted-source-message", + "claim_id": "source_id_preserved", + "requirement": "cite" + }, + { + "evidence_id": "redacted-source-message", + "claim_id": "redaction_applied", + "requirement": "explain" + } + ], + "produced_answer": "The live capture report must preserve source id capture:terminal-log-17. The write-policy audit says the private token label was redacted before storage, and the private token value must not be repeated.", + "produced_evidence": [ + "redacted-source-message" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + 
"stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "consolidation", + "job_id": "consolidation-contradiction-report-discard-001", + "title": "Discard a stale contradiction proposal instead of mutating source truth", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "debug_report", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "current-consolidation-source-rule", + "claim_id": "contradiction_report_discarded", + "requirement": "cite" + }, + { + "evidence_id": "unsupported-applied-draft", + "claim_id": "contradiction_report_created", + "requirement": "explain" + } + ], + "produced_answer": "Create a contradiction report proposal and discard the stale source-rewrite synthesis. The report flags one unsupported applied-worker claim while keeping source mutation count at zero.", + "produced_evidence": [ + "current-consolidation-source-rule", + "stale-consolidation-source-rewrite", + "unsupported-applied-draft" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + 
"max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "lineage_completeness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "proposal_usefulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "review_action_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "consolidation": { + "proposal_count": 1, + "proposal_usefulness": 0.9, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 1, + "executable_gaps": [], + "proposals": [ + { + "proposal_id": "proposal-contradiction-report-discard", + "proposal_kind": "contradiction_report", + "usefulness_score": 0.9, + "min_usefulness_score": 0.8, + "lineage_completeness": 1.0, + "expected_review_action": "discard", + "actual_review_action": "discard", + "review_action_correct": true, + "source_mutation_count": 0, + "unsupported_claim_count": 1 + } + ] + } + }, + { + "suite_id": "consolidation", + "job_id": "consolidation-preference-candidate-defer-001", + "title": "Defer a preference candidate until reviewer confirmation", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + 
"hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "session-preference-1", + "claim_id": "preference_candidate_created", + "requirement": "cite" + }, + { + "evidence_id": "session-preference-2", + "claim_id": "preference_candidate_deferred", + "requirement": "use" + } + ], + "produced_answer": "Propose a preference candidate for concise, evidence-focused updates, but defer application because the fixture does not include explicit reviewer confirmation.", + "produced_evidence": [ + "session-preference-1", + "session-preference-2" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "lineage_completeness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "proposal_usefulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "review_action_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + 
"source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "consolidation": { + "proposal_count": 1, + "proposal_usefulness": 0.86, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gaps": [], + "proposals": [ + { + "proposal_id": "proposal-preference-candidate-defer", + "proposal_kind": "preference_candidate", + "usefulness_score": 0.86, + "min_usefulness_score": 0.75, + "lineage_completeness": 1.0, + "expected_review_action": "defer", + "actual_review_action": "defer", + "review_action_correct": true, + "source_mutation_count": 0, + "unsupported_claim_count": 0 + } + ] + } + }, + { + "suite_id": "consolidation", + "job_id": "consolidation-project-summary-apply-001", + "title": "Create a reviewable project summary proposal without source mutation", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "compiled_knowledge", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "session-project-summary-1", + "claim_id": "project_summary_proposal_created", + "requirement": "cite" + }, + { + "evidence_id": "session-project-summary-2", + "claim_id": "project_summary_proposal_created", + "requirement": "use" + } + ], + "produced_answer": "Create a derived project summary proposal and keep the source notes unchanged. 
The proposal cites the source-of-truth and adoption-caveat sessions.", + "produced_evidence": [ + "session-project-summary-1", + "session-project-summary-2" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "lineage_completeness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "proposal_usefulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "review_action_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "consolidation": { + "proposal_count": 1, + "proposal_usefulness": 0.93, + "lineage_completeness": 1.0, + 
"review_action_correctness": 1.0, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gaps": [], + "proposals": [ + { + "proposal_id": "proposal-project-summary-apply", + "proposal_kind": "project_summary", + "usefulness_score": 0.93, + "min_usefulness_score": 0.8, + "lineage_completeness": 1.0, + "expected_review_action": "apply", + "actual_review_action": "apply", + "review_action_correct": true, + "source_mutation_count": 0, + "unsupported_claim_count": 0 + } + ] + } + }, + { + "suite_id": "consolidation", + "job_id": "consolidation-weekly-decision-summary-apply-001", + "title": "Apply a weekly decision summary proposal with complete lineage", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "weekly-decision-typed-failures", + "claim_id": "weekly_summary_proposal_created", + "requirement": "cite" + }, + { + "evidence_id": "weekly-decision-private-caveat", + "claim_id": "weekly_summary_proposal_created", + "requirement": "use" + } + ], + "produced_answer": "Apply a weekly decision summary proposal covering typed failure states and the bounded production-adoption caveat. 
Keep it derived and source-linked.", + "produced_evidence": [ + "weekly-decision-private-caveat", + "weekly-decision-typed-failures" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "lineage_completeness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "proposal_usefulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "review_action_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "consolidation": { + "proposal_count": 1, + "proposal_usefulness": 0.91, + "lineage_completeness": 1.0, + "review_action_correctness": 
1.0, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gaps": [], + "proposals": [ + { + "proposal_id": "proposal-weekly-decision-summary-apply", + "proposal_kind": "weekly_decision_summary", + "usefulness_score": 0.91, + "min_usefulness_score": 0.8, + "lineage_completeness": 1.0, + "expected_review_action": "apply", + "actual_review_action": "apply", + "review_action_correct": true, + "source_mutation_count": 0, + "unsupported_claim_count": 0 + } + ] + } + }, + { + "suite_id": "context_trajectory", + "job_id": "context-trajectory-openviking-hierarchy-selection-001", + "title": "Gate OpenViking hierarchy selection scoring on scored hierarchy output", + "status": "blocked", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "hierarchy-selection-output-contract", + "claim_id": "hierarchy_selection_blocked", + "requirement": "cite" + }, + { + "evidence_id": "same-corpus-before-hierarchy", + "claim_id": "hierarchy_selection_blocked", + "requirement": "cite" + }, + { + "evidence_id": "hierarchy-comparison-requires-elf-equivalent", + "claim_id": "design_reference_not_score", + "requirement": "cite" + } + ], + "produced_answer": "OpenViking hierarchy selection is blocked until selected hierarchy nodes and evidence ids are materialized. 
OpenViking's hierarchy design remains a reference, not a scored win, tie, or loss, until comparable output exists.", + "produced_evidence": [ + "hierarchy-comparison-requires-elf-equivalent", + "hierarchy-selection-output-contract", + "same-corpus-before-hierarchy" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fixture-openviking-hierarchy-selection-blocked", + "failure_stage": "openviking.hierarchy_artifact_gate", + "failure_reason": "Selected parent, child, resource, and rejected sibling evidence is not materialized, so hierarchy selection remains a typed blocker.", + "stages": [ + { + "stage_name": "openviking.same_corpus_gate", + "kept_evidence": [ + "same-corpus-before-hierarchy" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Hierarchy scoring is gated behind same-corpus expected evidence id coverage." + }, + { + "stage_name": "openviking.hierarchy_artifact_gate", + "kept_evidence": [ + "hierarchy-selection-output-contract", + "hierarchy-comparison-requires-elf-equivalent" + ], + "dropped_evidence": [ + "hierarchy-design-win-decoy" + ], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "The required artifact must show selected hierarchy nodes plus the rejected sibling or decoy context before any ELF/OpenViking comparison is scored." 
+ } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "OpenViking hierarchy selection is encoded as a benchmark job, but scoring is blocked until the adapter emits selected hierarchy nodes with evidence ids after the same-corpus precondition passes.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "context_trajectory", + "job_id": "context-trajectory-openviking-recursive-expansion-001", + "title": "Gate OpenViking recursive context expansion on materialized expansion paths", + "status": "blocked", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "recursive-expansion-output-contract", + "claim_id": "recursive_expansion_blocked", + "requirement": "cite" + }, + { + "evidence_id": "recursive-same-corpus-gate", + "claim_id": "recursive_expansion_blocked", + "requirement": "cite" + }, + { + "evidence_id": "recursive-elf-comparison-gate", + "claim_id": "recursive_comparison_not_scored", + "requirement": "cite" + } + ], + 
"produced_answer": "OpenViking recursive/context expansion is blocked until expansion paths and expected evidence ids are materialized. No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario.", + "produced_evidence": [ + "recursive-elf-comparison-gate", + "recursive-expansion-output-contract", + "recursive-same-corpus-gate" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fixture-openviking-recursive-expansion-blocked", + "failure_stage": "openviking.recursive_expansion_gate", + "failure_reason": "Seed, expanded child, final evidence, and pruned-branch artifacts are not materialized, so recursive/context expansion remains blocked.", + "stages": [ + { + "stage_name": "openviking.same_corpus_gate", + "kept_evidence": [ + "recursive-same-corpus-gate" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Recursive expansion scoring remains gated behind expected evidence id coverage." 
+ }, + { + "stage_name": "openviking.recursive_expansion_gate", + "kept_evidence": [ + "recursive-expansion-output-contract" + ], + "dropped_evidence": [ + "recursive-expansion-win-decoy" + ], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "The missing expansion-path artifact must show seed context, expanded child contexts, final evidence ids, and pruned branches." + }, + { + "stage_name": "openviking.comparison_gate", + "kept_evidence": [ + "recursive-elf-comparison-gate" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "No ELF tie, win, or loss is allowed until both systems publish comparable expansion-path artifacts for the same scenario." + } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "OpenViking recursive/context expansion is encoded as a benchmark job, but scoring is blocked until the adapter materializes expansion paths and same-corpus evidence ids are correct.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "context_trajectory", + "job_id": "context-trajectory-openviking-staged-retrieval-001", + "title": "Gate OpenViking staged retrieval 
trajectory on evidence-bearing same-corpus output", + "status": "blocked", + "operational_evidence_tier": "local_fixture", + "answer_type": "debug_report", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "openviking-evidence-id-output-contract", + "claim_id": "staged_trajectory_blocked", + "requirement": "cite" + }, + { + "evidence_id": "openviking-same-corpus-precondition-blocked", + "claim_id": "staged_trajectory_blocked", + "requirement": "cite" + }, + { + "evidence_id": "elf-comparison-requires-comparable-trajectory", + "claim_id": "elf_comparison_not_scored", + "requirement": "cite" + } + ], + "produced_answer": "OpenViking staged retrieval trajectory is blocked until same-corpus output matches expected evidence ids. No ELF win, tie, or loss is allowed until both systems publish comparable stage artifacts for the same context-trajectory scenario.", + "produced_evidence": [ + "elf-comparison-requires-comparable-trajectory", + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 0.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "fixture-openviking-staged-retrieval-blocked", + "failure_stage": "openviking.stage_artifact_gate", + "failure_reason": "Stage-level OpenViking trajectory output is not 
materialized, so the fixture keeps the context-trajectory comparison blocked.", + "stages": [ + { + "stage_name": "openviking.same_corpus_gate", + "kept_evidence": [ + "openviking-evidence-id-output-contract", + "openviking-same-corpus-precondition-blocked" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Same-corpus expected, matched, and missing evidence ids must be correct before stage scoring is allowed." + }, + { + "stage_name": "openviking.stage_artifact_gate", + "kept_evidence": [ + "elf-comparison-requires-comparable-trajectory" + ], + "dropped_evidence": [ + "trajectory-win-decoy" + ], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Comparable stage artifacts are missing, and the decoy ELF win claim is explicitly dropped." + } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "debuggability", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "OpenViking staged retrieval trajectory is encoded as a benchmark job, but scoring is blocked until same-corpus output returns expected evidence ids and comparable staged artifacts exist.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "core_archival_memory", + "job_id": 
"core-archival-archival-fallback-001", + "title": "Fall back to archival notes when core memory is insufficient", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "archival_fallback_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "fallback-core-insufficient", + "claim_id": "core_memory_insufficient", + "requirement": "explain" + }, + { + "evidence_id": "fallback-archival-runbook", + "claim_id": "archival_fallback_steps", + "requirement": "cite" + } + ], + "produced_answer": "The core block is insufficient because it says the rollback runbook exists but omits the steps. Fall back to archival note search: restore the Postgres backup, rebuild Qdrant from Postgres chunk vectors, and verify search recovers the restored note.", + "produced_evidence": [ + "fallback-archival-runbook", + "fallback-core-insufficient" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + 
"dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "core_archival_memory", + "job_id": "core-archival-core-block-attachment-001", + "title": "Read an explicitly attached core block without treating it as archival search", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "core-attachment-active", + "claim_id": "attached_core_block_readback", + "requirement": "cite" + }, + { + "evidence_id": "core-attachment-not-search", + "claim_id": "core_not_archival_search", + "requirement": "cite" + } + ], + "produced_answer": "Return the project_style core block because it has an active attachment for the exact tenant, project, agent, and private_plus_project read profile. 
Keep that readback separate from archival search because core blocks do not embed, rerank, search Qdrant, create search sessions, or record note hits.", + "produced_evidence": [ + "core-attachment-active", + "core-attachment-not-search" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "core_archival_memory", + "job_id": "core-archival-core-block-provenance-001", + "title": "Return source refs and audit events for core block assertions", + "status": "pass", + "operational_evidence_tier": "local_fixture", + 
"answer_type": "provenance_bundle", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "core-provenance-source-ref", + "claim_id": "core_source_ref_returned", + "requirement": "cite" + }, + { + "evidence_id": "core-provenance-audit-events", + "claim_id": "core_audit_history_returned", + "requirement": "cite" + } + ], + "produced_answer": "The release_policy core block must return its source_ref with source_ref/v1 resolver data and retain the locator quote for inspection. Its provenance also includes append-only block_created, block_updated, and attachment_added events in audit_history.", + "produced_evidence": [ + "core-provenance-audit-events", + "core-provenance-source-ref" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 
2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "core_archival_memory", + "job_id": "core-archival-core-block-scope-001", + "title": "Apply core block scope and private-owner checks before readback", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "core-scope-project-shared-readable", + "claim_id": "shared_core_scope_allowed", + "requirement": "cite" + }, + { + "evidence_id": "core-scope-private-owner", + "claim_id": "private_core_scope_denied", + "requirement": "cite" + } + ], + "produced_answer": "Return the release_gate core block only when the active attachment and all_scopes read profile allow project_shared. 
Do not return agent_a_workflow to agent-b, because private-owner checks still apply to agent_private core blocks.", + "produced_evidence": [ + "core-scope-private-owner", + "core-scope-project-shared-readable" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "ownership_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 1, + "scope_correct_count": 1, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "core_archival_memory", + "job_id": "core-archival-project-decision-recovery-001", + "title": "Recover a project decision from core routing and archival rationale", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": 
"decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "decision-core-routing-block", + "claim_id": "core_routes_to_archival_rationale", + "requirement": "cite" + }, + { + "evidence_id": "decision-archival-outcome-policy", + "claim_id": "outcomes_require_evidence", + "requirement": "cite" + }, + { + "evidence_id": "decision-archival-core-search-boundary", + "claim_id": "core_archival_boundary_preserved", + "requirement": "cite" + }, + { + "evidence_id": "decision-letta-export-boundary", + "claim_id": "letta_comparison_requires_export", + "requirement": "cite" + } + ], + "produced_answer": "Use the always-attached core routing block to find the benchmark outcome policy, then cite archival notes for the detailed decision. The archival decision says to use win, tie, loss, not_tested, blocked, or non_goal only when scenario evidence supports them. It also says core blocks stay separate from archival note search and Qdrant-derived retrieval. 
Letta remains blocked or not_tested until a contained export/readback artifact maps core and archival source ids, so no ELF-over-Letta claim follows from ELF having core blocks.", + "produced_evidence": [ + "decision-archival-core-search-boundary", + "decision-archival-outcome-policy", + "decision-core-routing-block", + "decision-letta-export-boundary" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "core_archival_memory", + "job_id": "core-archival-stale-core-detection-001", + "title": "Detect a stale core 
block when archival evidence supersedes it", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "current_state_with_stale_core_caveat", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "archival-current-validation-gate", + "claim_id": "archival_current_gate", + "requirement": "cite" + }, + { + "evidence_id": "archival-supersedes-core-rationale", + "claim_id": "stale_core_detected", + "requirement": "explain" + } + ], + "produced_answer": "Treat the attached validation-gate core block as stale. The current archival decision says to run cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head, and the archival rationale says that evidence supersedes the core block until it is updated from source-of-truth state.", + "produced_evidence": [ + "archival-current-validation-gate", + "archival-supersedes-core-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 
1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "memory_evolution", + "job_id": "memory-evolution-benchmark-verdict-001", + "title": "Use the current production adoption verdict after an older conclusion changed", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "verdict-current-ready-bounded", + "claim_id": "current_benchmark_verdict", + "requirement": "cite" + }, + { + "evidence_id": "verdict-bounded-private-caveat", + "claim_id": "private_corpus_caveat", + "requirement": "cite" + }, + { + "evidence_id": "verdict-update-rationale", + "claim_id": "benchmark_update_rationale", + "requirement": "explain" + } + ], + "produced_answer": "The current verdict is that ELF is ready for personal production use with bounded caveats; the older not-ready conclusion is historical, and the private corpus remains an explicit caveat rather than a private-corpus pass.", + "produced_evidence": [ + "verdict-bounded-private-caveat", + "verdict-current-ready-bounded", + "verdict-old-not-ready", + "verdict-update-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + 
"history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "verdict-current-ready-bounded" + ], + "historical_evidence": [ + "verdict-old-not-ready" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "verdict-current-ready-bounded" + ], + "selected_historical_evidence": [ + "verdict-old-not-ready" + ], + "selected_rationale_evidence": [ + "verdict-update-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "verdict-current-ready-bounded", + "verdict-old-not-ready", + "verdict-update-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": 
[], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "memory_evolution", + "job_id": "memory-evolution-deploy-method-001", + "title": "Prefer the superseding production deployment method over the old smoke path", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "ops_runbook", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "deploy-current-production-runbook", + "claim_id": "current_deployment_method", + "requirement": "cite" + }, + { + "evidence_id": "deploy-supersession-rationale", + "claim_id": "deployment_update_rationale", + "requirement": "explain" + } + ], + "produced_answer": "Use the Docker Compose production runbook with backup, restore, and Qdrant rebuild for production; the cargo run quickstart is only historical local-smoke guidance because production recovery handling must be explicit.", + "produced_evidence": [ + "deploy-current-production-runbook", + "deploy-old-quickstart", + "deploy-supersession-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.4, + "cost": 
{ + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "deploy-current-production-runbook" + ], + "historical_evidence": [ + "deploy-old-quickstart" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "deploy-current-production-runbook" + ], + "selected_historical_evidence": [ + "deploy-old-quickstart" + ], + "selected_rationale_evidence": [ + "deploy-supersession-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "deploy-current-production-runbook", + "deploy-old-quickstart", + "deploy-supersession-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + 
"history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "memory_evolution", + "job_id": "memory-evolution-issue-state-001", + "title": "Report an issue as done after an earlier blocker cleared", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "resume_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "issue-xy900-done", + "claim_id": "current_issue_state", + "requirement": "cite" + }, + { + "evidence_id": "issue-xy900-resolution-rationale", + "claim_id": "issue_update_rationale", + "requirement": "explain" + } + ], + "produced_answer": "XY-900 is currently done after PR #200; the earlier missing real_world_job fixture/report blocker is historical and cleared because the runner now publishes typed reports.", + "produced_evidence": [ + "issue-xy900-blocked", + "issue-xy900-done", + "issue-xy900-resolution-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + 
"dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "issue-xy900-done" + ], + "historical_evidence": [ + "issue-xy900-blocked" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "issue-xy900-done" + ], + "selected_historical_evidence": [ + "issue-xy900-blocked" + ], + "selected_rationale_evidence": [ + "issue-xy900-resolution-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "issue-xy900-done", + "issue-xy900-blocked", + "issue-xy900-resolution-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "memory_evolution", + "job_id": "memory-evolution-preference-001", + "title": "Apply the current user preference while preserving the historical one", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + 
"can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "pref-current-concise-rationale", + "claim_id": "current_preference", + "requirement": "cite" + }, + { + "evidence_id": "pref-update-rationale", + "claim_id": "preference_update_rationale", + "requirement": "explain" + } + ], + "produced_answer": "Use concise prose with explicit evidence before bullets; the terse bullet-only preference is historical because it hid rationale.", + "produced_evidence": [ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + 
"scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "pref-current-concise-rationale" + ], + "historical_evidence": [ + "pref-old-terse-bullets" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "pref-current-concise-rationale" + ], + "selected_historical_evidence": [ + "pref-old-terse-bullets" + ], + "selected_rationale_evidence": [ + "pref-update-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "pref-current-concise-rationale", + "pref-old-terse-bullets", + "pref-update-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "history_event_types": [ + "add", + "update", + "ignore" + ], + "history_requires_note_version_links": true + } + }, + { + "suite_id": "memory_evolution", + "job_id": "memory-evolution-relation-temporal-001", + "title": "Distinguish current and historical relation validity in graph-lite context", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "relation-current-owner", + "claim_id": "relation_current_owner", + "requirement": "cite" + }, + { + "evidence_id": "relation-old-owner", + "claim_id": "relation_historical_owner", + "requirement": "cite" + } + ], + "produced_answer": "Team Echo currently owns 
deployment method review. Team Delta owned deployment method review historically. The ownership moved after the single-user production runbook scope changed.", + "produced_evidence": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": null, + "cost": null, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.4 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "relation-current-owner" + ], + "historical_evidence": [ + "relation-old-owner" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "relation-current-owner" + ], + "selected_historical_evidence": [ + 
"relation-old-owner" + ], + "selected_rationale_evidence": [ + "relation-owner-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "relation-current-owner", + "relation-old-owner", + "relation-owner-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": true, + "temporal_validity_encoded": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "knowledge_compilation", + "job_id": "knowledge-watch-rebuild-003", + "title": "Rebuild changed-source knowledge pages without mutating source memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "compiled_knowledge", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "watch-source-updated", + "claim_id": "watch_rebuild_scope", + "requirement": "cite" + }, + { + "evidence_id": "watch-lint-output", + "claim_id": "stale_adapter_lint", + "requirement": "cite" + }, + { + "evidence_id": "watch-memory-candidate-proposal", + "claim_id": "memory_candidate_boundary", + "requirement": "cite" + } + ], + "produced_answer": "The changed-source watch/rebuild page selects only pages citing the updated source ref, reports the stale PageIndex/OpenKB adapter claim as lint evidence, preserves a reviewed memory-candidate boundary, and leaves source documents plus Memory Notes unmodified.", + "produced_evidence": [ + "watch-lint-output", + "watch-memory-candidate-proposal", + "watch-source-original", + "watch-source-updated" + ], + 
"unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "knowledge": { + "page_count": 1, + "section_count": 3, + "traced_section_count": 3, + "flagged_unsupported_section_count": 0, + "untraced_section_count": 0, + "unsupported_summary_count": 0, + "backlink_count": 2, + "pages_with_backlinks": 1, + "pages_with_version_diff": 1, + "stale_trap_count": 1, + "stale_traps_detected": 1, + "rebuild_page_count": 1, + "deterministic_rebuild_count": 1, + "rebuild_failure_count": 0, + "allowed_variance_count": 0, + "citation_coverage": 1.0, + "stale_claim_detection": 1.0, + "rebuild_determinism": 1.0, + "backlink_coverage": 1.0, + "version_diff_coverage": 1.0, + "page_usefulness": 1.0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + 
"source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "knowledge_compilation", + "job_id": "knowledge-entity-concept-002", + "title": "Compile entity, concept, and issue timeline pages with stale lint", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "compiled_knowledge", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "qdrant-rebuild-entity", + "claim_id": "qdrant_rebuild_entity", + "requirement": "cite" + }, + { + "evidence_id": "derived-pages-concept", + "claim_id": "derived_pages_concept", + "requirement": "cite" + }, + { + "evidence_id": "xy848-current-timeline", + "claim_id": "issue_timeline_current", + "requirement": "use" + } + ], + "produced_answer": "Generated entity, concept, and issue timeline pages cite Qdrant rebuild evidence, derived-page concept evidence, and the current XY-848 timeline; stale Qdrant-authoritative text is linted, and one rebuild explains allowed ordering variance.", + "produced_evidence": [ + "derived-pages-concept", + "qdrant-rebuild-entity", + "xy848-current-timeline" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 3.1, + "cost": { + "currency": "USD", + "amount": 0.0, + 
"input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "knowledge": { + "page_count": 3, + "section_count": 6, + "traced_section_count": 6, + "flagged_unsupported_section_count": 0, + "untraced_section_count": 0, + "unsupported_summary_count": 0, + "backlink_count": 6, + "pages_with_backlinks": 3, + "pages_with_version_diff": 0, + "stale_trap_count": 1, + "stale_traps_detected": 1, + "rebuild_page_count": 3, + "deterministic_rebuild_count": 3, + "rebuild_failure_count": 0, + "allowed_variance_count": 1, + "citation_coverage": 1.0, + "stale_claim_detection": 1.0, + "rebuild_determinism": 1.0, + "backlink_coverage": 1.0, + "version_diff_coverage": 0.0, + "page_usefulness": 1.0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "knowledge_compilation", + "job_id": "knowledge-project-page-001", + "title": "Compile a pointer-backed project page with current truth and history", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "compiled_knowledge", + "requires_caveat": false, + 
"requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "elf-knowledge-current-truth", + "claim_id": "derived_not_authoritative", + "requirement": "cite" + }, + { + "evidence_id": "elf-knowledge-history", + "claim_id": "reference_patterns", + "requirement": "cite" + }, + { + "evidence_id": "xy848-issue-timeline", + "claim_id": "rebuild_deterministic", + "requirement": "use" + } + ], + "produced_answer": "Generated benchmark page `project_elf_benchmark_suite.md` keeps ELF source notes authoritative, cites current truth and history, links the XY-848 issue timeline, flags one unsupported summary, and rebuilds deterministically.", + "produced_evidence": [ + "elf-knowledge-current-truth", + "elf-knowledge-history", + "xy848-issue-timeline" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "knowledge": { + "page_count": 1, + "section_count": 4, + "traced_section_count": 3, + "flagged_unsupported_section_count": 1, + "untraced_section_count": 0, + "unsupported_summary_count": 1, + "backlink_count": 3, + "pages_with_backlinks": 1, + "pages_with_version_diff": 0, + "stale_trap_count": 1, + "stale_traps_detected": 1, + "rebuild_page_count": 1, + "deterministic_rebuild_count": 1, + "rebuild_failure_count": 0, + "allowed_variance_count": 0, + "citation_coverage": 0.75, + 
"stale_claim_detection": 1.0, + "rebuild_determinism": 1.0, + "backlink_coverage": 1.0, + "version_diff_coverage": 0.0, + "page_usefulness": 0.938 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "memory_evolution", + "job_id": "memory-evolution-delete-ttl-001", + "title": "Suppress a deleted temporary fact and answer with the current memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "delete-tombstone", + "claim_id": "deleted_fact_suppressed", + "requirement": "cite" + }, + { + "evidence_id": "current-benchmark-plan", + "claim_id": "current_plan", + "requirement": "cite" + } + ], + "produced_answer": "Do not use the expired Redis branch plan. The tombstone says deleted-temp-plan expired and search must suppress it after the worker DELETE outbox completes. 
The current plan is to add real_world_memory trust and personalization cases, then run cargo make smoke-real-world-job and cargo make real-world-memory.", + "produced_evidence": [ + "current-benchmark-plan", + "delete-tombstone" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "current-benchmark-plan" + ], + "historical_evidence": [], + "tombstone_evidence": [ + "delete-tombstone" + ], + "invalidation_evidence": [ + "delete-tombstone" + ], + "selected_current_evidence": [ + 
"current-benchmark-plan" + ], + "selected_historical_evidence": [], + "selected_rationale_evidence": [ + "delete-tombstone" + ], + "selected_tombstone_evidence": [ + "delete-tombstone" + ], + "selected_invalidation_evidence": [ + "delete-tombstone" + ], + "conflict_candidate_evidence": [], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "memory_summary", + "job_id": "memory-summary-source-trace-001", + "title": "Read back a reviewable current memory summary with source trace", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "reviewable_memory_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "summary-contract-current", + "claim_id": "summary_contract_reviewable", + "requirement": "cite" + }, + { + "evidence_id": "xy952-summary-contract", + "claim_id": "summary_stage_now_fixture_backed", + "requirement": "cite" + }, + { + "evidence_id": "summary-ttl-tombstone", + "claim_id": "summary_preserves_tombstone", + "requirement": "cite" + }, + { + "evidence_id": "summary-contract-non-parity-boundary", + "claim_id": "summary_excludes_unsupported_parity", + "requirement": "cite" + } + ], + "produced_answer": "The reviewable memory summary keeps the current XY-952 source-trace contract top of mind, keeps the Postgres/Qdrant source-of-truth rule as background, downgrades the old not-tested summary gap and pre-XY-905 live loss, preserves the TTL tombstone for the 
parity claim, and excludes unsupported managed-memory parity as a derived project-profile candidate.", + "produced_evidence": [ + "summary-contract-current", + "summary-contract-non-parity-boundary", + "summary-ttl-tombstone", + "xy952-summary-contract" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "memory_summary": { + "summary_count": 1, + "entry_count": 7, + "required_category_count": 6, + "covered_required_category_count": 6, + "missing_required_category_count": 0, + "top_of_mind_count": 1, + "background_count": 1, + "stale_count": 1, + "superseded_count": 1, + "tombstone_count": 1, + "derived_project_profile_count": 2, + "source_ref_required_count": 6, + "source_ref_entry_count": 6, + "source_ref_coverage": 1.0, + "freshness_marker_count": 7, + "freshness_coverage": 1.0, + "rationale_count": 7, + "rationale_coverage": 1.0, + "invalid_top_of_mind_count": 0, + "untraced_entry_count": 0, + "derived_with_source_or_unsupported_count": 2, + "derived_missing_source_or_unsupported_count": 0, + "unsupported_derived_entry_count": 1, + "unsupported_current_entry_count": 0, + "tombstone_ref_count": 1, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 1, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 
1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "memory_evolution", + "job_id": "p1-closeout-correction-persistence-rollback-001", + "title": "Persist a correction and rollback an overbroad P2-ready memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "p1-correction-event", + "claim_id": "correction_persisted", + "requirement": "cite" + }, + { + "evidence_id": "p1-rollback-event", + "claim_id": "rollback_restored_gate", + "requirement": "cite" + }, + { + "evidence_id": "p1-current-corrected-memory", + "claim_id": "current_corrected_memory", + "requirement": "use" + } + ], + "produced_answer": "Use the current corrected memory: P2 remains unqueued until the P1 closeout report proves correction persistence, rollback, unsupported-claim refusal, and work-resume memory use. 
The previous P2-ready memory was superseded because rollback evidence and unsupported-claim refusal evidence were missing, and the rollback restored the prior safe phase gate.", + "produced_evidence": [ + "p1-correction-event", + "p1-current-corrected-memory", + "p1-incorrect-p2-ready-memory", + "p1-rollback-event" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.9, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.05 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + 
"p1-current-corrected-memory" + ], + "historical_evidence": [ + "p1-incorrect-p2-ready-memory" + ], + "tombstone_evidence": [ + "p1-correction-event" + ], + "invalidation_evidence": [ + "p1-correction-event", + "p1-rollback-event" + ], + "selected_current_evidence": [ + "p1-current-corrected-memory" + ], + "selected_historical_evidence": [ + "p1-incorrect-p2-ready-memory" + ], + "selected_rationale_evidence": [ + "p1-correction-event", + "p1-rollback-event" + ], + "selected_tombstone_evidence": [ + "p1-correction-event" + ], + "selected_invalidation_evidence": [ + "p1-correction-event", + "p1-rollback-event" + ], + "conflict_candidate_evidence": [ + "p1-current-corrected-memory", + "p1-incorrect-p2-ready-memory", + "p1-correction-event" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "history_event_types": [ + "supersede", + "restore", + "correction" + ], + "history_requires_note_version_links": true + } + }, + { + "suite_id": "consolidation", + "job_id": "p1-closeout-source-candidate-approval-recall-001", + "title": "Promote a source-linked P1 closeout candidate and recall it with trace evidence", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "compiled_knowledge", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "p1-source-library-record", + "claim_id": "source_to_candidate", + "requirement": "cite" + }, + { + "evidence_id": "p1-memory-candidate", + "claim_id": "source_to_candidate", + "requirement": "use" + }, + { + "evidence_id": 
"p1-approved-memory", + "claim_id": "p2_queue_boundary", + "requirement": "cite" + }, + { + "evidence_id": "p1-recall-debug-trace", + "claim_id": "approved_memory_recalled", + "requirement": "explain" + } + ], + "produced_answer": "Promote the P1 closeout requirement through a reviewed memory candidate, not by mutating source records. The approved memory says the P1 closeout requires benchmark evidence across Source Library, Memory Candidate, approved memory, recall/debug, correction, and rollback before P2 queueing is allowed, and the recall/debug panel selected that approved memory while dropping the stale P2 queue decoy.", + "produced_evidence": [ + "p1-approved-memory", + "p1-memory-candidate", + "p1-recall-debug-trace", + "p1-source-library-record" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "10631063-3000-4063-8063-106310631063", + "stages": [ + { + "stage_name": "source.search", + "kept_evidence": [ + "p1-source-library-record" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Source Library evidence was retained as the authority root." 
+ }, + { + "stage_name": "memory.candidate_review", + "kept_evidence": [ + "p1-memory-candidate" + ], + "dropped_evidence": [ + "p1-source-mutation-trap" + ], + "demoted_evidence": [], + "distractor_evidence": [ + "p1-source-mutation-trap" + ], + "notes": "Candidate review rejected source mutation." + }, + { + "stage_name": "recall.selection", + "kept_evidence": [ + "p1-approved-memory", + "p1-recall-debug-trace" + ], + "dropped_evidence": [ + "p2-queue-decoy" + ], + "demoted_evidence": [], + "distractor_evidence": [ + "p2-queue-decoy" + ], + "notes": "Recall selected approved memory and dropped stale P2 queue evidence." + } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "debuggability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "lineage_completeness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "proposal_usefulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "review_action_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.05 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "operator_debug": { + "failure_mode": "none_closeout_trace_readback", + "trace_id": 
"10631063-3000-4063-8063-106310631063", + "viewer_url": "/viewer?trace_id=10631063-3000-4063-8063-106310631063", + "admin_trace_bundle_url": "/v2/admin/traces/10631063-3000-4063-8063-106310631063/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "Recall/debug selected the approved P1 closeout memory and dropped the stale P2 queue decoy.", + "steps_to_root_cause": 2, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in trace_explainability recall.selection", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "trace_available": true, + "replay_command_available": true, + "replay_command": "cargo make real-world-memory-p1-closeout", + "replay_artifact": "tmp/real-world-memory/p1-closeout/report.json", + "viewer_panels": [ + "Recall Debug", + "Source Library", + "Memory Candidate", + "Approved Memory" + ], + "cli_steps": [ + "run p1 closeout benchmark", + "inspect generated report", + "verify P2 queue boundary" + ], + "trace_evidence": [ + "p1-source-library-record", + "p1-memory-candidate", + "p1-approved-memory", + "p1-recall-debug-trace" + ], + "ux_gaps": [] + }, + "consolidation": { + "proposal_count": 1, + "proposal_usefulness": 0.95, + "lineage_completeness": 1.0, + "review_action_correctness": 1.0, + "source_mutation_count": 0, + "proposal_unsupported_claim_count": 0, + "executable_gaps": [], + "proposals": [ + { + "proposal_id": "p1-memory-candidate-approval", + "proposal_kind": "memory_candidate", + "usefulness_score": 0.95, + "min_usefulness_score": 0.8, + "lineage_completeness": 1.0, + "expected_review_action": "apply", + "actual_review_action": "apply", + "review_action_correct": true, + "source_mutation_count": 0, + "unsupported_claim_count": 0 + } + ] + } + }, + { + "suite_id": "memory_evolution", + "job_id": "p1-closeout-stale-decision-suppression-001", + "title": "Suppress a stale P2 queue decision and keep the current phase gate", + "status": "pass", + "operational_evidence_tier": 
"local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "p1-current-phase-gate", + "claim_id": "current_phase_gate", + "requirement": "cite" + }, + { + "evidence_id": "p1-phase-gate-rationale", + "claim_id": "closeout_rationale", + "requirement": "explain" + } + ], + "produced_answer": "Use the current phase gate: P2 Knowledge Workspace must not receive decodex:queued:elf until the XY-1063 P1 closeout benchmark and self-assessment pass and the main thread accepts the next phase. The older P1-skip decision is historical because the closeout still must prove stale/correction behavior, unsupported-claim refusal, work-resume memory use, and no broad competitor wins.", + "produced_evidence": [ + "p1-current-phase-gate", + "p1-old-skip-decision", + "p1-phase-gate-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 1.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + 
"score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "p1-current-phase-gate" + ], + "historical_evidence": [ + "p1-old-skip-decision" + ], + "tombstone_evidence": [], + "invalidation_evidence": [ + "p1-phase-gate-rationale" + ], + "selected_current_evidence": [ + "p1-current-phase-gate" + ], + "selected_historical_evidence": [ + "p1-old-skip-decision" + ], + "selected_rationale_evidence": [ + "p1-phase-gate-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [ + "p1-phase-gate-rationale" + ], + "conflict_candidate_evidence": [ + "p1-current-phase-gate", + "p1-old-skip-decision", + "p1-phase-gate-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": true, + "history_event_types": [ + "decision_superseded", + "phase_gate_updated" + ], + "history_requires_note_version_links": true + } + }, + { + "suite_id": "work_resume", + "job_id": "p1-closeout-unsupported-claim-refusal-work-resume-001", + "title": "Resume P1 closeout work while refusing unsupported P2 and competitor claims", + "status": "pass", + "operational_evidence_tier": 
"local_fixture", + "answer_type": "work_resume", + "requires_caveat": false, + "requires_refusal": true, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "p1-work-resume-checkpoint", + "claim_id": "work_resume_next_action", + "requirement": "cite" + }, + { + "evidence_id": "p1-claim-boundary", + "claim_id": "unsupported_claim_refused", + "requirement": "cite" + }, + { + "evidence_id": "p1-no-p2-queue", + "claim_id": "p2_remains_unqueued", + "requirement": "use" + } + ], + "produced_answer": "I cannot approve P2 queueing or broad competitor-win claims from this fixture-only closeout. Resume XY-1063 by running cargo make real-world-memory-p1-closeout, publishing the checked-in closeout report, running docs validation, and keeping P2 unqueued until the self-assessment passes and main-thread acceptance selects the next phase.", + "produced_evidence": [ + "p1-claim-boundary", + "p1-no-p2-queue", + "p1-work-resume-checkpoint" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + 
}, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "personalization", + "job_id": "personalization-scoped-preference-001", + "title": "Apply the current project preference without cross-project leakage", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "elf-current-stable-preference", + "claim_id": "current_preference", + "requirement": "cite" + }, + { + "evidence_id": "elf-current-stable-preference", + "claim_id": "preference_reason", + "requirement": "explain" + } + ], + "produced_answer": "For tenant local-tenant / project ELF / agent local-agent, use concise evidence-linked bullets. This preference is remembered because the user repeated it in two durable ELF sessions and marked it stable. 
Do not apply the old long-tutorial preference, the demo-only pirate tone, or the PubFi project preference.", + "produced_evidence": [ + "elf-current-stable-preference" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "personalization_fit", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 1, + "scope_correct_count": 1, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-daily-project-brief-001", + "title": "Generate a daily project brief from current project memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + 
"requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "daily-current-validation-gate", + "claim_id": "daily_validation_gate", + "requirement": "cite" + }, + { + "evidence_id": "daily-current-ledger-update", + "claim_id": "daily_ledger_update", + "requirement": "cite" + } + ], + "produced_answer": "Daily brief: run the proactive brief benchmark command, keep the XY-951 ledger update next, and do not claim Pulse or hosted managed-product parity from fixture-only evidence.", + "produced_evidence": [ + "daily-current-ledger-update", + "daily-current-validation-gate" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 1, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_suggestion_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "recommended_count": 1, + "deferred_count": 0, + "rejected_count": 0, + "current_suggestion_count": 1, + "non_current_suggestion_count": 0, + "stale_warning_count": 0, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + 
"unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-private-corpus-refresh-blocked-001", + "title": "Block private-corpus refresh suggestions when no operator manifest exists", + "status": "blocked", + "operational_evidence_tier": "private_corpus", + "answer_type": "proactive_project_brief", + "requires_caveat": true, + "requires_refusal": true, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [], + "produced_answer": "", + "produced_evidence": [], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + 
"retrieval_quality": { + "expected_evidence_total": 0, + "expected_evidence_matched": 0, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": null, + "cost": null, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "No operator-owned private production corpus manifest is available; private-corpus refresh suggestions stay blocked under XY-930.", + "evidence_required_count": 0, + "evidence_covered_count": 0, + "source_ref_required_count": 0, + "source_ref_covered_count": 0, + "quote_required_count": 0, + "quote_covered_count": 0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-resume-work-brief-001", + "title": "Generate a resume-work brief from current handoff memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "resume-current-handoff", + "claim_id": "resume_current_handoff", + "requirement": "cite" + }, + { + "evidence_id": "resume-current-validation", + "claim_id": "resume_validation", + "requirement": "cite" + } + ], + "produced_answer": "Resume brief: stay on 
y/elf-xy-953, finish proactive brief fixture/scoring work, and validate with cargo make real-world-memory-proactive-brief plus targeted elf-eval tests.", + "produced_evidence": [ + "resume-current-handoff", + "resume-current-validation" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 1, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_suggestion_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "recommended_count": 1, + "deferred_count": 0, + "rejected_count": 0, + "current_suggestion_count": 1, + "non_current_suggestion_count": 0, + "stale_warning_count": 0, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": 
"evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-stale-decision-audit-001", + "title": "Warn about a stale project decision before suggesting work", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "stale-decision-old-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite" + }, + { + "evidence_id": "stale-decision-new-gate", + "claim_id": "stale_decision_replaced", + "requirement": "cite" + } + ], + "produced_answer": "Stale decision audit: defer the old operator-ux-only readiness decision and use the direct real-world-memory-proactive-brief suite for any proactive pass claim.", + "produced_evidence": [ + "stale-decision-new-gate", + "stale-decision-old-gate" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + 
"retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 1, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_suggestion_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "recommended_count": 0, + "deferred_count": 1, + "rejected_count": 0, + "current_suggestion_count": 0, + "non_current_suggestion_count": 1, + "stale_warning_count": 1, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 1, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + 
"evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-stale-plan-preference-warning-001", + "title": "Reject stale plan and preference suggestions after TTL invalidation", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "proactive_project_brief", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "stale-plan-ttl", + "claim_id": "stale_plan_rejected", + "requirement": "cite" + }, + { + "evidence_id": "current-preference-concise-brief", + "claim_id": "current_preference_concise", + "requirement": "cite" + } + ], + "produced_answer": "Stale plan/preference warning: reject the expired publish-first plan, use the current run-gate plan, and prefer concise evidence-linked briefs without broad hosted-product parity claims.", + "produced_evidence": [ + "current-plan-run-gate", + "current-preference-concise-brief", + "old-preference-long-brief", + "stale-plan-old", + "stale-plan-ttl" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 5, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + 
"output_tokens": 0 + }, + "trace_explainability": null, + "proactive_brief": { + "brief_count": 1, + "suggestion_count": 2, + "required_suggestion_kind_count": 1, + "covered_required_suggestion_kind_count": 1, + "missing_required_suggestion_kind_count": 0, + "evidence_ref_required_count": 2, + "evidence_ref_suggestion_count": 2, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 2, + "freshness_coverage": 1.0, + "action_rationale_count": 2, + "action_rationale_coverage": 1.0, + "recommended_count": 0, + "deferred_count": 1, + "rejected_count": 1, + "current_suggestion_count": 0, + "non_current_suggestion_count": 2, + "stale_warning_count": 2, + "invalid_current_suggestion_count": 0, + "untraced_suggestion_count": 0, + "unsupported_current_suggestion_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", 
+ "job_id": "production-ops-authority-plane-recovery-001", + "title": "Recover authority-plane records with degraded derived indexes labeled", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "authority-recovery-backup-pitr", + "claim_id": "backup_pitr_restored", + "requirement": "cite" + }, + { + "evidence_id": "authority-recovery-counts", + "claim_id": "authority_counts_preserved", + "requirement": "cite" + }, + { + "evidence_id": "authority-recovery-degraded-read", + "claim_id": "degraded_read_labeled", + "requirement": "cite" + }, + { + "evidence_id": "authority-recovery-replay-rebuild", + "claim_id": "replay_rebuild_dead_letter", + "requirement": "cite" + }, + { + "evidence_id": "authority-recovery-repair-dead-letter", + "claim_id": "replay_rebuild_dead_letter", + "requirement": "cite" + }, + { + "evidence_id": "authority-recovery-rpo-rto", + "claim_id": "rpo_rto_reported", + "requirement": "cite" + }, + { + "evidence_id": "authority-recovery-topology", + "claim_id": "failover_not_encoded", + "requirement": "cite" + } + ], + "produced_answer": "The authority-plane recovery drill restored the Postgres authority store from backup/PITR, preserved source refs and lifecycle history for source, journal, memory, knowledge, proposal, trace, and audit records, labeled unavailable derived indexes and adapters during degraded read, replayed outbox work idempotently, completed Qdrant rebuild with rebuilt_count=9, missing_vector_count=0, error_count=0, handled 2 dead-letter rows, applied 1 migration repair, and met RPO 12/60 seconds plus RTO 184/300 seconds. 
Failover remains not_encoded because no standby authority service was part of the drill.", + "produced_evidence": [ + "authority-recovery-backup-pitr", + "authority-recovery-counts", + "authority-recovery-degraded-read", + "authority-recovery-repair-dead-letter", + "authority-recovery-replay-rebuild", + "authority-recovery-rpo-rto", + "authority-recovery-topology" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 7, + "expected_evidence_matched": 7, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 7, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "recovery_drills": [ + { + "drill_id": "authority-plane-drill-20260627", + "contract_schema": "elf.authority_recovery_drill/v1", + "generated_at": "2026-06-27T09:18:00Z", + "topology": { + "authority_store": "postgres", + "derived_indexes": [ + "qdrant_memory", + "qdrant_docs" + ], + "adapters": [ + "external_adapter_probe" + ], + "failover": "not_encoded_no_standby_authority_service" + }, + "failure_injections": [ + { + "injection_id": "qdrant-memory-index-unavailable", + "target": "qdrant_memory", + "fault": "derived_index_unavailable", + "started_at": "2026-06-27T09:15:00Z", + "completed_at": "2026-06-27T09:16:00Z", + "evidence_refs": [ + "authority-recovery-degraded-read" + ] + }, + { + "injection_id": "outbox-replay-after-pitr", + "target": "indexing_outbox", + "fault": "pending_jobs_replayed_after_restore", + "started_at": "2026-06-27T09:16:00Z", + "completed_at": "2026-06-27T09:17:00Z", + "evidence_refs": [ + "authority-recovery-replay-rebuild" + ] + } + ], + 
"backup_pitr": { + "backup_ref": "backup-20260627T090000Z", + "pitr_target": "2026-06-27T09:12:00Z", + "restored": true, + "evidence_refs": [ + "authority-recovery-backup-pitr" + ] + }, + "degraded_read": { + "source_of_truth_visible": true, + "unavailable_derived_indexes": [ + "qdrant_memory", + "qdrant_docs" + ], + "unavailable_adapters": [ + "external_adapter_probe" + ], + "unavailable_labels": [ + "unavailable_derived_index", + "unavailable_adapter" + ], + "evidence_refs": [ + "authority-recovery-degraded-read" + ] + }, + "rpo": { + "target_seconds": 60.0, + "measured_seconds": 12.0, + "evidence_refs": [ + "authority-recovery-rpo-rto" + ] + }, + "rto": { + "target_seconds": 300.0, + "measured_seconds": 184.0, + "evidence_refs": [ + "authority-recovery-rpo-rto" + ] + }, + "authority_record_counts": [ + { + "plane": "source", + "before_count": 3, + "after_count": 3, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" + ] + }, + { + "plane": "journal", + "before_count": 2, + "after_count": 2, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" + ] + }, + { + "plane": "memory", + "before_count": 4, + "after_count": 4, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" + ] + }, + { + "plane": "knowledge", + "before_count": 2, + "after_count": 2, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" + ] + }, + { + "plane": "proposal", + "before_count": 2, + "after_count": 2, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" + ] + }, + { + "plane": "trace", + "before_count": 3, + "after_count": 3, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" 
+ ] + }, + { + "plane": "audit", + "before_count": 5, + "after_count": 5, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": [ + "authority-recovery-counts" + ] + } + ], + "outbox_replay": { + "idempotent": true, + "replayed_count": 6, + "duplicate_write_count": 0, + "evidence_refs": [ + "authority-recovery-replay-rebuild" + ] + }, + "qdrant_rebuild": { + "complete": true, + "rebuilt_count": 9, + "missing_vector_count": 0, + "error_count": 0, + "evidence_refs": [ + "authority-recovery-replay-rebuild" + ] + }, + "migration_repair": { + "applied": true, + "repaired_count": 1, + "evidence_refs": [ + "authority-recovery-repair-dead-letter" + ] + }, + "dead_letter": { + "dead_letter_count": 2, + "handled_count": 2, + "evidence_refs": [ + "authority-recovery-repair-dead-letter" + ] + } + } + ], + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 7, + "evidence_covered_count": 7, + "source_ref_required_count": 7, + "source_ref_covered_count": 7, + "quote_required_count": 7, + "quote_covered_count": 7, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": true + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-restore-cold-start-001", + "title": "Read back restored memory after Docker cold start and Qdrant rebuild", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + 
"requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "restore-search-before", + "claim_id": "restore_recovered_key", + "requirement": "cite" + }, + { + "evidence_id": "restore-qdrant-rebuild", + "claim_id": "qdrant_rebuild_counts", + "requirement": "cite" + }, + { + "evidence_id": "restore-search-after", + "claim_id": "cold_start_readback", + "requirement": "cite" + } + ], + "produced_answer": "The restore proof recovered key single_user_restore_probe after a Docker cold start. Qdrant rebuild returned rebuilt_count=1, missing_vector_count=0, error_count=0, and search after cold start returned one result for the restored key.", + "produced_evidence": [ + "restore-qdrant-rebuild", + "restore-search-after", + "restore-search-before" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with 
normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": true + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-cold-start-dependency-001", + "title": "Report pinned OpenViking cold-start path reaching behavioral wrong-result", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "pinned-local-embed-runtime-reached", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite" + }, + { + "evidence_id": "pinned-local-embed-retry", + "claim_id": "pinned_openviking_runtime_reached", + "requirement": "cite" + }, + { + "evidence_id": "openviking-wrong-result-behavior", + "claim_id": "openviking_wrong_result_after_runtime", + "requirement": "cite" + }, + { + "evidence_id": "typed-incomplete-policy", + "claim_id": "setup_failure_stays_incomplete", + "requirement": "cite" + } + ], + "produced_answer": "The pinned OpenViking Docker local embedding path reached add_resource/find. OpenViking now reports wrong_result/retrieval_wrong_result because all three smoke queries missed expected evidence terms. 
If the pinned llama-cpp-python install or import fails on another Docker platform, classify that setup boundary as incomplete, not pass.", + "produced_evidence": [ + "openviking-wrong-result-behavior", + "pinned-local-embed-retry", + "pinned-local-embed-runtime-reached", + "typed-incomplete-policy" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-credential-boundary-001", + "title": "Keep provider credential requirement blocked without committing secrets", + 
"status": "blocked", + "operational_evidence_tier": "provider_backed", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "provider-credential-boundary", + "claim_id": "provider_credentials_blocked", + "requirement": "cite" + }, + { + "evidence_id": "checked-in-secret-boundary", + "claim_id": "no_checked_in_secrets", + "requirement": "cite" + } + ], + "produced_answer": "Do not commit or require provider secrets in checked-in fixtures. Provider-backed production-ops checks are blocked until operator credentials are supplied, and reports may record provider metadata but never the API key.", + "produced_evidence": [ + "checked-in-secret-boundary", + "provider-credential-boundary" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Provider-backed production 
operations require operator-owned credentials; checked-in fixtures must not include or require secrets.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-backfill-resume-001", + "title": "Resume interrupted generated backfill from checkpoint without duplicate source notes", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "backfill-checkpoint-state", + "claim_id": "resume_checkpoint", + "requirement": "cite" + }, + { + "evidence_id": "backfill-clean-compare", + "claim_id": "clean_compare_matched", + "requirement": "cite" + } + ], + "produced_answer": "Resume from checkpoint offset 1000 to 2000 completed the 2000 document backfill. 
The resumed backfill found zero duplicate source notes, and search quality after resume matched the clean run for all 16 queries.", + "produced_evidence": [ + "backfill-checkpoint-state", + "backfill-clean-compare" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-private-manifest-blocked-001", + "title": "Report missing private production manifest as bounded blocked caveat", + "status": "blocked", + "operational_evidence_tier": "private_corpus", + "answer_type": 
"direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "private-manifest-guard", + "claim_id": "private_manifest_blocked", + "requirement": "cite" + }, + { + "evidence_id": "private-bounded-failure-policy", + "claim_id": "private_bounded_failure", + "requirement": "cite" + } + ], + "produced_answer": "No private-corpus pass is claimed. The private production corpus path is blocked until an operator supplies a sanitized manifest, and the current evidence is a bounded failure, not a pass.", + "produced_evidence": [ + "private-bounded-failure-policy", + "private-manifest-guard" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.6, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 0.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "workflow_helpfulness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "No operator-owned private production corpus manifest is checked in or available to this fixture; no private-corpus pass can be claimed.", + 
"evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-public-proxy-addendum-001", + "title": "Separate operator-approved public-proxy evidence from private and provider proof", + "status": "pass", + "operational_evidence_tier": "public_proxy", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "public-proxy-addendum-pass", + "claim_id": "public_proxy_passed", + "requirement": "cite" + }, + { + "evidence_id": "public-proxy-latency-resource-cost", + "claim_id": "public_proxy_operational_envelope", + "requirement": "cite" + }, + { + "evidence_id": "public-proxy-claim-boundary", + "claim_id": "public_proxy_boundary", + "requirement": "cite" + } + ], + "produced_answer": "The operator-approved public-proxy addendum passed 8/8 query checks with 0 wrong_result. It recorded query mean latency 10.842727625 ms, P95 30.443385 ms, elapsed 1.313984156 seconds, RSS 37656 KB, and 386 estimated input tokens. 
This is public-proxy evidence only: it is not real private-corpus production proof and does not prove provider-backed production quality because embedding mode was local-hash.", + "produced_evidence": [ + "public-proxy-addendum-pass", + "public-proxy-claim-boundary", + "public-proxy-latency-resource-cost" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 10.842727625, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 386, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-resource-envelope-001", + "title": "Report generated backfill resource envelope and operator planning 
caveat", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "resource-envelope-check", + "claim_id": "resource_envelope_passed", + "requirement": "cite" + }, + { + "evidence_id": "large-import-planning-caveat", + "claim_id": "large_import_batch_caveat", + "requirement": "cite" + } + ], + "produced_answer": "The resource envelope passed: 2793.629 seconds was within the 3600-second limit, and 167652 KB RSS was within the 1500000 KB limit. Large imports should be planned as batch jobs, not interactive operations.", + "produced_evidence": [ + "large-import-planning-caveat", + "resource-envelope-check" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + 
"evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "project_decisions", + "job_id": "project-decision-accepted-typed-failures-001", + "title": "Recover an accepted benchmark reporting decision with its rationale", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "typed-failure-decision-accepted", + "claim_id": "accepted_typed_failure_policy", + "requirement": "cite" + }, + { + "evidence_id": "typed-failure-decision-rationale", + "claim_id": "typed_failure_rationale", + "requirement": "explain" + } + ], + "produced_answer": "The accepted decision is to preserve typed benchmark outcomes instead of flattening them, because the typed states keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.", + "produced_evidence": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + 
"output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "historical_evidence": [], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "selected_historical_evidence": [], + "selected_rationale_evidence": [ + "typed-failure-decision-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + 
"history_requires_note_version_links": false + } + }, + { + "suite_id": "project_decisions", + "job_id": "project-decision-current-validation-gate-001", + "title": "Recover the current validation gate instead of an old gate", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "validation-gate-current-decodex", + "claim_id": "current_validation_gate", + "requirement": "cite" + }, + { + "evidence_id": "validation-gate-old-lint-test", + "claim_id": "current_validation_gate", + "requirement": "use" + }, + { + "evidence_id": "validation-gate-current-rationale", + "claim_id": "validation_gate_rationale", + "requirement": "explain" + } + ], + "produced_answer": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make check before pushing a refreshed PR head. 
The older lint-and-test gate is historical; the current gate adds formatting, automatic lint repair, and full checks to prevent avoidable review churn before Decodex review handoff.", + "produced_evidence": [ + "validation-gate-current-decodex", + "validation-gate-current-rationale", + "validation-gate-old-lint-test" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + 
"validation-gate-current-decodex" + ], + "historical_evidence": [ + "validation-gate-old-lint-test" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "validation-gate-current-decodex" + ], + "selected_historical_evidence": [ + "validation-gate-old-lint-test" + ], + "selected_rationale_evidence": [ + "validation-gate-current-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "project_decisions", + "job_id": "project-decision-private-manifest-caveat-001", + "title": "State the bounded private-manifest caveat instead of overclaiming", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "private-manifest-current-caveat", + "claim_id": "private_manifest_caveat", + "requirement": "cite" + }, + { + "evidence_id": "private-manifest-uncertainty-policy", + "claim_id": "private_manifest_caveat", + "requirement": "use" + }, + { + "evidence_id": "private-manifest-supported-scope", + "claim_id": "supported_project_decision_scope", + "requirement": "cite" + } + ], + "produced_answer": "No private production corpus manifest is 
available, so the project_decisions report cannot claim private-corpus validation. The supported scope is synthetic fixture-backed decision recovery for ELF, and the correct uncertainty policy is to keep that bounded caveat instead of inventing a private-manifest pass.", + "produced_evidence": [ + "private-manifest-current-caveat", + "private-manifest-supported-scope", + "private-manifest-uncertainty-policy" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + 
"qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "historical_evidence": [], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "selected_historical_evidence": [], + "selected_rationale_evidence": [ + "private-manifest-uncertainty-policy" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "project_decisions", + "job_id": "project-decision-reversal-live-baseline-001", + "title": "Distinguish a superseded live-baseline claim from the current suite boundary", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "live-baseline-suite-win-current", + "claim_id": "current_live_baseline_boundary", + "requirement": "cite" + }, + { + "evidence_id": "live-baseline-suite-win-old", + "claim_id": "current_live_baseline_boundary", + "requirement": "use" + }, + { + "evidence_id": "live-baseline-reversal-rationale", + "claim_id": "live_baseline_reversal_rationale", + "requirement": "explain" + } + ], 
+ "produced_answer": "The current decision is that live-baseline passes are retrieval and lifecycle evidence only; real-world job suite wins require fixture-backed real_world_job reports. The earlier draft that allowed live-baseline suite wins is historical, and it changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "produced_evidence": [ + "live-baseline-reversal-rationale", + "live-baseline-suite-win-current", + "live-baseline-suite-win-old" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + "source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + 
"scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "live-baseline-suite-win-current" + ], + "historical_evidence": [ + "live-baseline-suite-win-old" + ], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "live-baseline-suite-win-current" + ], + "selected_historical_evidence": [ + "live-baseline-suite-win-old" + ], + "selected_rationale_evidence": [ + "live-baseline-reversal-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 1, + "conflict_detection_count": 1, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "project_decisions", + "job_id": "project-decision-tradeoff-fixture-backed-001", + "title": "Explain the rationale and caveat for fixture-backed project decision jobs", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": true, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "tradeoff-fixture-backed-first", + "claim_id": "fixture_backed_tradeoff", + "requirement": "cite" + }, + { + "evidence_id": "tradeoff-fixture-backed-rationale", + "claim_id": "fixture_backed_tradeoff_rationale", + "requirement": "explain" + }, + { + 
"evidence_id": "tradeoff-fixture-backed-caveat", + "claim_id": "fixture_backed_parity_caveat", + "requirement": "cite" + } + ], + "produced_answer": "The accepted tradeoff is to encode project_decisions first as offline fixture-backed jobs. The rationale is that fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun. The caveat is that this suite must not claim external-project parity until external adapters actually run these jobs.", + "produced_evidence": [ + "tradeoff-fixture-backed-caveat", + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 3, + "expected_evidence_matched": 3, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 3, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 3, + "evidence_covered_count": 3, + "source_ref_required_count": 3, + 
"source_ref_covered_count": 3, + "quote_required_count": 3, + "quote_covered_count": 3, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "evolution": { + "current_evidence": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "historical_evidence": [], + "tombstone_evidence": [], + "invalidation_evidence": [], + "selected_current_evidence": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "selected_historical_evidence": [], + "selected_rationale_evidence": [ + "tradeoff-fixture-backed-rationale" + ], + "selected_tombstone_evidence": [], + "selected_invalidation_evidence": [], + "conflict_candidate_evidence": [], + "retrieved_but_dropped_evidence": [], + "selected_but_not_narrated_evidence": [], + "stale_trap_ids_used": [], + "stale_answer_count": 0, + "conflict_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": true, + "temporal_validity_required": false, + "temporal_validity_encoded": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "history_event_types": [], + "history_requires_note_version_links": false + } + }, + { + "suite_id": "retrieval", + "job_id": "retrieval-alt-phrasing-001", + "title": "Recover current handoff evidence from alternate phrasing", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy840-current-handoff", + "claim_id": "branch", + "requirement": "cite" + }, + { + "evidence_id": "xy840-current-handoff", + "claim_id": "gate", + "requirement": "use" + } + ], + "produced_answer": "Use branch 
y/elf-xy-840 for XY-840 and run `cargo make check` before review handoff.", + "produced_evidence": [ + "xy840-current-handoff" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 13.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "retrieval", + "job_id": "retrieval-current-vs-obsolete-001", + "title": "Select current benchmark context over obsolete live-baseline claims", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + 
"normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "current-real-world-boundary", + "claim_id": "current_boundary", + "requirement": "cite" + } + ], + "produced_answer": "Use the current boundary: live-baseline reports stay valid for Docker retrieval and lifecycle evidence, but they are not real-world job suite wins; publish real-world job reports separately.", + "produced_evidence": [ + "current-real-world-boundary" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 15.7, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "uncertainty_handling", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + 
"suite_id": "retrieval", + "job_id": "retrieval-distractor-heavy-001", + "title": "Find provider stress evidence in a distractor-heavy corpus", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "elf-provider-stress-target", + "claim_id": "stress_target", + "requirement": "cite" + } + ], + "produced_answer": "Target ELF only with the stress profile: set ELF_BASELINE_PROJECTS=ELF and ELF_BASELINE_PROFILE=stress with provider embeddings.", + "produced_evidence": [ + "elf-provider-stress-target" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 22.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + 
"source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "retrieval", + "job_id": "retrieval-minimal-context-001", + "title": "Return minimal sufficient restore evidence without irrelevant context", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "restore-minimal-proof", + "claim_id": "restore_minimal", + "requirement": "cite" + } + ], + "produced_answer": "The minimal sufficient restore proof is that note ingest returned ADD/remember, Qdrant rebuild returned rebuilt_count=1 with zero missing vectors and zero errors, and search recovered the restored note.", + "produced_evidence": [ + "restore-minimal-proof" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 9.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": 
"trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "retrieval", + "job_id": "retrieval-multi-hop-routing-001", + "title": "Answer a multi-hop benchmark routing question", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "decision_record", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy845-landing-zone", + "claim_id": "landing_zone", + "requirement": "cite" + }, + { + "evidence_id": "routing-reference-boundary", + "claim_id": "reference_boundary", + "requirement": "cite" + } + ], + "produced_answer": "Add the new cases under apps/elf-eval/fixtures/real_world_memory/retrieval/ and extend the runner/report seams, while treating qmd and OpenViking only as references unless their adapters actually run.", + "produced_evidence": [ + "routing-reference-boundary", + "xy845-landing-zone" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 
0.0, + "trap_context_count": 0 + }, + "latency_ms": 31.5, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "operator_debugging_ux", + "job_id": "operator-debug-stage-attribution-001", + "title": "Attribute a wrong result to the retrieval stage that demoted evidence", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "debug_report", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "stage-target", + "claim_id": "stage_attribution", + "requirement": "explain" + } + ], + "produced_answer": "Expected evidence was present in recall.candidates but demoted at rerank.score; the selected stale top-k smoke-only evidence was the decoy to repair against.", + "produced_evidence": [ + "stage-target" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + 
"temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 18.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": { + "trace_id": "66666666-6666-4666-8666-666666666666", + "failure_stage": "rerank.score", + "failure_reason": "Expected evidence survived candidate recall but was demoted below a stale decoy during rerank.", + "stages": [ + { + "stage_name": "rewrite.expansion", + "kept_evidence": [], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [], + "notes": "Alternate phrasing preserved the original intent." + }, + { + "stage_name": "recall.candidates", + "kept_evidence": [ + "stage-target", + "stage-decoy" + ], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": [ + "stage-decoy" + ], + "notes": "Candidate recall found both expected evidence and stale decoy evidence." + }, + { + "stage_name": "rerank.score", + "kept_evidence": [ + "stage-decoy" + ], + "dropped_evidence": [], + "demoted_evidence": [ + "stage-target" + ], + "distractor_evidence": [ + "stage-decoy" + ], + "notes": "The stale decoy outranked the expected evidence." + }, + { + "stage_name": "selection.final", + "kept_evidence": [ + "stage-decoy" + ], + "dropped_evidence": [ + "stage-target" + ], + "demoted_evidence": [], + "distractor_evidence": [ + "stage-decoy" + ], + "notes": "Final selection missed the required evidence." 
+ } + ] + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "debuggability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false, + "operator_debug": { + "failure_mode": "rerank_demoted_expected_evidence", + "trace_id": "66666666-6666-4666-8666-666666666666", + "viewer_url": "/viewer?trace_id=66666666-6666-4666-8666-666666666666", + "admin_trace_bundle_url": "/v2/admin/traces/66666666-6666-4666-8666-666666666666/bundle?mode=full&stage_items_limit=128&candidates_limit=200", + "root_cause": "The expected evidence survived recall.candidates but was demoted below a stale decoy during rerank.score.", + "steps_to_root_cause": 3, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in trace_explainability rerank.score and selection.final stages", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "viewer_panels": [ + "Trace", + "Retrieval Funnel", + "Replay Candidates", + "Stage Details" + ], + "cli_steps": [ + "open trace explainability bundle", + "compare recall.candidates with rerank.score", + "inspect selected stale decoy", + "repair rerank inputs or stale-context filtering" + ], + "trace_evidence": [ + 
"stage-target", + "stage-decoy" + ], + "ux_gaps": [] + } + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-knowledge-page-refresh-suggestion-001", + "title": "Suggest a knowledge-page refresh from scheduled memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-knowledge-page-stale-finding", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-knowledge-reviewable-refresh", + "claim_id": "scheduled_knowledge_refresh_suggested", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled knowledge-page refresh suggestion: suggest a reviewable rebuild because lint found the old scheduled-memory blocked state, and do not silently rewrite source notes.", + "produced_evidence": [ + "scheduled-knowledge-page-stale-finding", + "scheduled-knowledge-reviewable-refresh" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + 
"evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "source_immutability", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "Block private/provider scheduled tasks without operator inputs", + "status": "blocked", + "operational_evidence_tier": "private_corpus", + "answer_type": "scheduled_memory_task", + "requires_caveat": true, + "requires_refusal": 
true, + "can_answer_unknown": true, + "normalized_score": 0.0, + "hard_fail_hits": [], + "expected_evidence": [], + "produced_answer": "", + "produced_evidence": [], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 0, + "expected_evidence_matched": 0, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 0, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": null, + "cost": null, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 0.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 0.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "uncertainty_handling", + "score": 0.0, + "max_points": 1.0, + "weight": 0.25 + } + ], + "reason": "No operator-owned private production corpus manifest, provider credentials, or hosted scheduler configuration is available; private/provider scheduled tasks stay blocked under XY-930.", + "evidence_required_count": 0, + "evidence_covered_count": 0, + "source_ref_required_count": 0, + "source_ref_covered_count": 0, + "quote_required_count": 0, + "quote_covered_count": 0, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-decision-audit-001", + "title": "Audit a stale project decision during a scheduled task", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": 
"scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-old-consolidation-only-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-direct-suite-decision", + "claim_id": "scheduled_decision_superseded", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled stale decision audit: the consolidation-only readiness decision is superseded by the direct real-world-memory-scheduled fixture suite plus aggregate real-world-memory regression guard.", + "produced_evidence": [ + "scheduled-current-direct-suite-decision", + "scheduled-old-consolidation-only-decision" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + 
"current_output_count": 0, + "non_current_output_count": 1, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 1, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 1, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-stale-preference-plan-audit-001", + "title": "Audit stale preferences and plans during a scheduled task", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-stale-old-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-stale-plan-expired", + "claim_id": 
"scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-trace-plan", + "claim_id": "scheduled_stale_plan_expired", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-current-reviewable-preference", + "claim_id": "scheduled_silent_mutation_rejected", + "requirement": "cite" + } + ], + "produced_answer": "Scheduled stale preference/plan audit: the old report plan is expired, the silent-mutation preference is historical, and the current path requires trace/readback plus reviewable derived output.", + "produced_evidence": [ + "scheduled-current-reviewable-preference", + "scheduled-current-trace-plan", + "scheduled-old-silent-mutation-preference", + "scheduled-stale-old-plan", + "scheduled-stale-plan-expired" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 5, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 1 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 2, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 2, + "evidence_ref_output_count": 2, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 2, + "freshness_coverage": 1.0, + "action_rationale_count": 2, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 0, + 
"non_current_output_count": 2, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 0, + "source_trace_superseded_count": 2, + "source_trace_tombstone_count": 1 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-weekly-project-status-summary-001", + "title": "Run a weekly project status summary from current memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "scheduled_memory_task", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": true, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "scheduled-weekly-current-gate", + "claim_id": "scheduled_weekly_gate", + "requirement": "cite" + }, + { + "evidence_id": "scheduled-weekly-ledger-update", + "claim_id": "scheduled_weekly_ledger", + "requirement": "cite" 
+ } + ], + "produced_answer": "Weekly scheduled summary: run cargo make real-world-memory-scheduled, update the XY-951 scheduled-memory-task readiness ledger, and do not claim hosted scheduled-product parity from fixture evidence.", + "produced_evidence": [ + "scheduled-weekly-current-gate", + "scheduled-weekly-ledger-update" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "scheduled_memory": { + "task_run_count": 1, + "output_count": 1, + "required_task_kind_count": 1, + "covered_required_task_kind_count": 1, + "missing_required_task_kind_count": 0, + "evidence_ref_required_count": 1, + "evidence_ref_output_count": 1, + "evidence_ref_coverage": 1.0, + "freshness_marker_count": 1, + "freshness_coverage": 1.0, + "action_rationale_count": 1, + "action_rationale_coverage": 1.0, + "trace_required_count": 1, + "trace_complete_count": 1, + "trace_coverage": 1.0, + "source_mutation_count": 0, + "current_output_count": 1, + "non_current_output_count": 0, + "invalid_current_output_count": 0, + "untraced_output_count": 0, + "unsupported_current_output_count": 0, + "tombstone_violation_count": 0, + "source_trace_selected_count": 2, + "source_trace_dropped_count": 0, + "source_trace_stale_count": 1, + "source_trace_superseded_count": 0, + "source_trace_tombstone_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + 
"max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trace_readback", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "source_library", + "job_id": "source-library-long-doc-001", + "title": "Saved long-form article keeps stable source metadata and hydrates an excerpt pointer", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "article-source-record", + "claim_id": "long_doc_metadata", + "requirement": "cite" + }, + { + "evidence_id": "article-hydrated-excerpt", + "claim_id": "long_doc_hydration", + "requirement": "cite" + } + ], + "produced_answer": "The saved article is a Source Library document with canonical_uri=https://example.com/research/agent-memory-os, source_kind=article, author=Example Research Group, captured_at=2026-06-20T01:10:00Z, source_created_at=2026-06-19T21:00:00Z, and trust_label=public_web. Hydration uses the source_ref/v1 pointer with resolver elf_doc_ext/v1 and the verified excerpt says source libraries preserve long-form evidence. 
This source-only ingest does not automatically create a durable Memory Note.", + "produced_evidence": [ + "article-hydrated-excerpt", + "article-source-record" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "source_library", + "job_id": "source-library-social-thread-001", + "title": "Saved social thread keeps handle metadata and remains source-only until promoted", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + 
"requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "thread-source-record", + "claim_id": "thread_metadata", + "requirement": "cite" + }, + { + "evidence_id": "thread-promotion-boundary", + "claim_id": "thread_promotion_boundary", + "requirement": "cite" + } + ], + "produced_answer": "The social thread was saved as a Source Library chat document with canonical_uri=https://example.com/thread/agent-knowledge-os, source_kind=social_thread, author=Example Builder, handle=example-builder, captured/source-created timestamps, trust_label=public_web, and thread_id=thread-agent-knowledge-os. It stays source-only until an explicit add_note or reviewed promotion path creates Memory Notes; background organization may propose tags or summaries without mutating memory.", + "produced_evidence": [ + "thread-promotion-boundary", + "thread-source-record" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, 
+ "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "trust_source_of_truth", + "job_id": "trust-sot-rebuild-001", + "title": "Answer from restored Postgres source-of-truth after Qdrant rebuild", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "restore-proof-note", + "claim_id": "authoritative_restore_source", + "requirement": "cite" + }, + { + "evidence_id": "qdrant-rebuild-proof", + "claim_id": "qdrant_rebuilt_from_sot", + "requirement": "cite" + } + ], + "produced_answer": "Use the restored Postgres note as authoritative evidence: Postgres backup restored note single_user_restore_probe. Qdrant was rebuilt from Postgres chunk embeddings before the final search, with rebuilt_count=1, missing_vector_count=0, and error_count=0. 
This does not prove a private production corpus pass.", + "produced_evidence": [ + "qdrant-rebuild-proof", + "restore-proof-note" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 2, + "expected_evidence_matched": 2, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 2, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": true + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-decision-rationale-001", + "title": "Decision rationale survives Work Journal readback", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + 
"normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-decision-rationale-source", + "claim_id": "decision_rationale_recall", + "requirement": "cite" + } + ], + "produced_answer": "The rationale is to keep Work Journal entries source-adjacent unless they are promoted through Memory Authority or accepted Dreaming Review.", + "produced_evidence": [ + "wj-decision-rationale-source" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 0.0, + "decision_rationale_required_count": 1, + "decision_rationale_recalled_count": 1, + "decision_rationale_recall_rate": 1.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + "explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + "inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + 
"redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-explicit-next-step-001", + "title": "Explicit next steps are returned as instructions", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-explicit-step-source", + "claim_id": "explicit_next_step", + "requirement": "cite" + } + ], + "produced_answer": "The explicit next step is to run cargo make real-world-memory-work-continuity after adding the fixtures.", + "produced_evidence": [ + "wj-explicit-step-source" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + 
"conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 0.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 1, + "explicit_next_step_returned_count": 1, + "explicit_next_step_correct_count": 1, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + "inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + "redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 
0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-handoff-source-ref-001", + "title": "Handoff readback preserves source refs", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-handoff-source-ref", + "claim_id": "handoff_source_ref_present", + "requirement": "cite" + } + ], + "produced_answer": "The handoff includes source ref wj-handoff-source-ref for the next agent to inspect.", + "produced_evidence": [ + "wj-handoff-source-ref" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": 
{ + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 0.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + "explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + "inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 1, + "handoff_source_ref_covered_count": 1, + "handoff_source_ref_coverage": 1.0, + "redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + 
"redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-inferred-next-step-001", + "title": "Inferred next steps are labeled and not treated as instructions", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-inferred-step-source", + "claim_id": "inferred_next_step_label", + "requirement": "cite" + } + ], + "produced_answer": "The comparison task is labeled as inferred and is not an operator instruction.", + "produced_evidence": [ + "wj-inferred-step-source" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 0.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + 
"explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 1, + "inferred_next_step_labeled_count": 1, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 1.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + "redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-janitor-false-promotion-001", + "title": "Janitor candidates are not falsely promoted to memory", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": 
"wj-janitor-candidate-source", + "claim_id": "janitor_no_false_promotion", + "requirement": "cite" + } + ], + "produced_answer": "The janitor candidate remains review-required and source-adjacent.", + "produced_evidence": [ + "wj-janitor-candidate-source" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 0.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + "explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + "inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + "redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 1, + 
"janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-redaction-001", + "title": "Sensitive markers are redacted from Work Journal readback", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-redaction-source", + "claim_id": "redaction_applied", + "requirement": "cite" + } + ], + "produced_answer": "The Work Journal redaction audit shows marker secret-demo-token was redacted and no sensitive marker persisted.", + "produced_evidence": [ + "wj-redaction-source" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + 
"expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 0.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + "explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + "inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + "redaction_required_count": 1, + "redaction_applied_count": 1, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 1.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 
1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-rejected-option-001", + "title": "Rejected options remain suppressed on resume", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-rejected-option-source", + "claim_id": "rejected_option_suppressed", + "requirement": "cite" + } + ], + "produced_answer": "The rejected janitor rewrite option remains suppressed and is not presented as a current instruction.", + "produced_evidence": [ + "wj-rejected-option-source" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 0, + "reset_resume_success_count": 0, + "reset_resume_success_rate": 
0.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 1, + "rejected_option_suppressed_count": 1, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 1.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + "explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + "inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + "redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_continuity", + "job_id": "work-continuity-reset-resume-001", + "title": 
"Reset and resume reads the last Work Journal checkpoint", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_journal_readback", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "wj-reset-resume-checkpoint", + "claim_id": "reset_resume_readback", + "requirement": "cite" + } + ], + "produced_answer": "After reset, resume from Work Journal entry wj-reset-entry and rerun cargo make real-world-memory-work-continuity before review.", + "produced_evidence": [ + "wj-reset-resume-checkpoint" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 1.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "work_continuity": { + "readback_count": 1, + "entry_count": 1, + "reset_resume_required_count": 1, + "reset_resume_success_count": 1, + "reset_resume_success_rate": 1.0, + "decision_rationale_required_count": 0, + "decision_rationale_recalled_count": 0, + "decision_rationale_recall_rate": 0.0, + "rejected_option_required_count": 0, + "rejected_option_suppressed_count": 0, + "rejected_option_resurrection_count": 0, + "rejected_option_suppression_rate": 0.0, + "explicit_next_step_required_count": 0, + "explicit_next_step_returned_count": 0, + "explicit_next_step_correct_count": 0, + "explicit_next_step_precision": 1.0, + "inferred_next_step_required_count": 0, + 
"inferred_next_step_labeled_count": 0, + "inferred_step_instruction_count": 0, + "inferred_next_step_labeling_rate": 0.0, + "handoff_source_ref_required_count": 0, + "handoff_source_ref_covered_count": 0, + "handoff_source_ref_coverage": 0.0, + "redaction_required_count": 0, + "redaction_applied_count": 0, + "sensitive_marker_persistence_count": 0, + "redaction_rate": 0.0, + "janitor_candidate_count": 0, + "janitor_false_promotion_count": 0, + "janitor_false_promotion_rate": 0.0, + "journal_only_authority_claim_count": 0 + }, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.35 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 1, + "evidence_covered_count": 1, + "source_ref_required_count": 1, + "source_ref_covered_count": 1, + "quote_required_count": 1, + "quote_covered_count": 1, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "capture_integration", + "job_id": "capture-integration-boundaries-001", + "title": "Explain cross-tool capture boundaries without claiming live integrations", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "direct_answer", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy844-capture-log", + "claim_id": "fixture_backed_capture", + "requirement": "cite" + }, + { + "evidence_id": 
"agentmemory-hook-reference", + "claim_id": "reference_points", + "requirement": "use" + }, + { + "evidence_id": "claude-mem-viewer-reference", + "claim_id": "reference_points", + "requirement": "use" + }, + { + "evidence_id": "live-adapter-follow-up", + "claim_id": "live_adapter_boundary", + "requirement": "cite" + } + ], + "produced_answer": "The encoded capture is fixture-backed: it includes Linear issue status, GitHub PR review summary, command transcript, and a local runbook note, with [private-local-env] spans excluded. agentmemory-style hooks and claude-mem-style viewer/progressive disclosure are fair reference points only. Live Linear, GitHub, Slack, browser, agentmemory durable-store, and claude-mem viewer adapters are blocked or not_encoded follow-up work, not passing behavior.", + "produced_evidence": [ + "agentmemory-hook-reference", + "claude-mem-viewer-reference", + "live-adapter-follow-up", + "xy844-capture-log" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 4, + "expected_evidence_matched": 4, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 4, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.8, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "lifecycle_behavior", + "score": 1.0, + 
"max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 4, + "evidence_covered_count": 4, + "source_ref_required_count": 4, + "source_ref_covered_count": 4, + "quote_required_count": 4, + "quote_covered_count": 4, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_resume", + "job_id": "work-resume-decodex-linear-status-001", + "title": "Recover Decodex and Linear lane ownership before handoff", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "resume_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy844-current-lane-state", + "claim_id": "ownership", + "requirement": "cite" + }, + { + "evidence_id": "xy844-current-lane-state", + "claim_id": "handoff_path", + "requirement": "use" + } + ], + "produced_answer": "Keep XY-844 on branch y/elf-xy-844 with routed GitHub identity y and Linear workspace hackink. 
Do not move the issue directly to In Review; after validation, push the branch, prepare the non-draft PR, and call issue_review_handoff.", + "produced_evidence": [ + "xy844-current-lane-state" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "ownership_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 1, + "scope_correct_count": 1, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_resume", + "job_id": "work-resume-failed-command-recovery-001", + "title": "Recover from the current failed command without chasing stale lint evidence", + "status": 
"pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "resume_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy844-current-command-failure", + "claim_id": "current_stop", + "requirement": "cite" + }, + { + "evidence_id": "xy844-current-command-failure", + "claim_id": "recovery_action", + "requirement": "use" + } + ], + "produced_answer": "The current stop is the missing work_resume fixture suite, not the old lint warning. Create the missing fixtures under apps/elf-eval/fixtures/real_world_memory/work_resume/, then rerun `cargo make smoke-real-world-job`.", + "produced_evidence": [ + "xy844-current-command-failure" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job 
passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_resume", + "job_id": "work-resume-next-action-extraction-001", + "title": "Extract the exact next action across issue, PR, command, and runbook evidence", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "work_plan", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy844-next-action-handoff", + "claim_id": "exact_next_action", + "requirement": "cite" + }, + { + "evidence_id": "xy844-next-action-handoff", + "claim_id": "validation_sequence", + "requirement": "use" + } + ], + "produced_answer": "The exact next action is to add the six work-resume continuity and capture/integration fixtures, run `cargo make smoke-real-world-job`, then run the pre-push gate: `cargo make fmt`, `cargo make lint-fix`, and `cargo make check` before pushing y/elf-xy-844.", + "produced_evidence": [ + "xy844-next-action-handoff" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.5, + "cost": { + "currency": "USD", + "amount": 0.0, + 
"input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_resume", + "job_id": "work-resume-pr-review-blocker-001", + "title": "Recover the current PR review blocker without reviving resolved tracker work", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "resume_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy844-current-pr-review", + "claim_id": "current_review_blocker", + "requirement": "cite" + }, + { + "evidence_id": "xy844-current-pr-review", + "claim_id": "review_next_action", + "requirement": "use" + } + ], + "produced_answer": "The current PR blocker is the unsupported live-adapter claim. 
Update the report so agentmemory hooks and claude-mem viewer behavior are only fixture-backed references and live adapters remain blocked or not_encoded follow-up work, then rerun the real-world job suite. The old missing issue_transition blocker is resolved.", + "produced_evidence": [ + "xy844-current-pr-review" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + }, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + }, + { + "suite_id": "work_resume", + "job_id": 
"work-resume-stale-worktree-001", + "title": "Resume a retained lane with stale worktree blocker evidence", + "status": "pass", + "operational_evidence_tier": "local_fixture", + "answer_type": "resume_summary", + "requires_caveat": false, + "requires_refusal": false, + "can_answer_unknown": false, + "normalized_score": 1.0, + "hard_fail_hits": [], + "expected_evidence": [ + { + "evidence_id": "xy844-current-worktree", + "claim_id": "next_action", + "requirement": "cite" + }, + { + "evidence_id": "xy844-current-worktree", + "claim_id": "stale_blocker", + "requirement": "use" + } + ], + "produced_answer": "Resume XY-844 on branch y/elf-xy-844. The stale blocker is the old claim that untracked Decodex runtime files require manual cleanup; current evidence says those files are runtime artifacts. The exact next action is to add fixture-backed JSON jobs under apps/elf-eval/fixtures/real_world_memory/work_resume/ and update the runner/report tests.", + "produced_evidence": [ + "xy844-current-worktree" + ], + "unsupported_claim_count": 0, + "wrong_result_count": 0, + "stale_answer_count": 0, + "conflict_detection_count": 0, + "update_rationale_available": false, + "temporal_validity_not_encoded": false, + "history_readback_encoded": false, + "retrieval_quality": { + "expected_evidence_total": 1, + "expected_evidence_matched": 1, + "expected_evidence_recall": 1.0, + "produced_evidence_total": 1, + "irrelevant_context_count": 0, + "irrelevant_context_ratio": 0.0, + "trap_context_count": 0 + }, + "latency_ms": 2.0, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + }, + "trace_explainability": null, + "trap_ids_used": [], + "dimension_scores": [ + { + "dimension": "answer_correctness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.3 + }, + { + "dimension": "evidence_grounding", + "score": 1.0, + "max_points": 1.0, + "weight": 0.25 + }, + { + "dimension": "latency_resource", + "score": 1.0, + "max_points": 1.0, + "weight": 0.1 + 
}, + { + "dimension": "trap_avoidance", + "score": 1.0, + "max_points": 1.0, + "weight": 0.2 + }, + { + "dimension": "workflow_helpfulness", + "score": 1.0, + "max_points": 1.0, + "weight": 0.15 + } + ], + "reason": "Job passed with normalized_score 1.000.", + "evidence_required_count": 2, + "evidence_covered_count": 2, + "source_ref_required_count": 2, + "source_ref_covered_count": 2, + "quote_required_count": 2, + "quote_covered_count": 2, + "stale_retrieval_count": 0, + "scope_check_count": 0, + "scope_correct_count": 0, + "scope_violation_count": 0, + "redaction_leak_count": 0, + "qdrant_rebuild_case": false + } + ], + "unsupported_claims": [], + "not_encoded_suites": [], + "private_corpus_redaction": { + "policy": "publish evidence ids and bounded score summaries only; do not publish private text", + "private_fixture_count": 3 + }, + "evolution": { + "stale_answer_count": 0, + "conflict_detection_count": 11, + "update_rationale_available_count": 16, + "temporal_validity_not_encoded_count": 0, + "history_readback_encoded_count": 4 + }, + "follow_ups": [ + { + "suite_id": "context_trajectory", + "job_id": "context-trajectory-openviking-hierarchy-selection-001", + "title": "Materialize OpenViking selected hierarchy nodes", + "reason": "The context-trajectory adapter must return selected parent, child, and resource nodes with evidence ids before hierarchy quality can be scored against ELF." + }, + { + "suite_id": "context_trajectory", + "job_id": "context-trajectory-openviking-recursive-expansion-001", + "title": "Materialize OpenViking recursive context expansion paths", + "reason": "The adapter must emit the seed context, expanded child contexts, final evidence ids, and pruned branches before recursive expansion quality can be scored." 
+ }, + { + "suite_id": "context_trajectory", + "job_id": "context-trajectory-openviking-staged-retrieval-001", + "title": "Run OpenViking staged trajectory after same-corpus evidence passes", + "reason": "The adapter must first publish matched expected evidence ids for every same-corpus query, then emit stage-level context trajectory output that can be compared with the equivalent ELF trace/session trajectory." + }, + { + "suite_id": "proactive_brief", + "job_id": "proactive-private-corpus-refresh-blocked-001", + "title": "XY-930 private-corpus input gate", + "reason": "Run private-corpus and credentialed production gates only when operator-owned inputs exist." + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-credential-boundary-001", + "title": "Run provider-backed production-ops gate with routed operator credentials", + "reason": "Credential-bound checks need an operator shell with provider environment variables; fixture reports can only encode the boundary." + }, + { + "suite_id": "production_ops", + "job_id": "production-ops-private-manifest-blocked-001", + "title": "Supply an operator-owned private production corpus manifest", + "reason": "A real private-corpus pass requires a sanitized local manifest supplied outside checked-in fixtures." + }, + { + "suite_id": "scheduled_memory", + "job_id": "scheduled-private-provider-scheduler-blocked-001", + "title": "XY-930 private/provider scheduled-memory input gate", + "reason": "Run private-corpus, provider-backed, and hosted scheduler gates only when operator-owned inputs exist." 
+ } + ] +} \ No newline at end of file diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 3101070a..f55f1c7c 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -73,10 +73,12 @@ const SCOREBOARD_RESULT_STATES: &[&str] = &[ "blocked", "not_tested", "not_encoded", + "not_comparable", "unsupported_claim", ]; const SCOREBOARD_EVIDENCE_CLASSES: &[&str] = &["fixture_backed", "live_baseline", "live_real_world", "research_gate"]; +const SCOREBOARD_RETRIEVAL_K: usize = 5; const OPERATIONAL_EVIDENCE_TIERS: &[&str] = &["local_fixture", "public_proxy", "private_corpus", "provider_backed"]; const REQUIRED_AUTHORITY_PLANES: [&str; 7] = @@ -1026,6 +1028,8 @@ struct ScoreboardReport { schema: String, result_states: Vec, evidence_classes: Vec, + metric_basis: String, + retrieval_k: usize, job_typed_non_pass_count: usize, job_typed_non_pass_states_present: Vec, job_summary_claim: String, @@ -1037,6 +1041,106 @@ struct ScoreboardReport { summary_claim: String, unqualified_win_claim_allowed: bool, claim_boundary: String, + #[serde(default)] + rows: Vec, + #[serde(default)] + optimization_roadmap: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardRow { + product_id: String, + product_name: String, + row_source: String, + evidence_class: String, + result_state: String, + comparable: bool, + same_corpus: bool, + source_id_mapped: bool, + held_out: bool, + leakage_audited: bool, + product_runtime: bool, + container_digest_identified: bool, + metrics: ScoreboardMetrics, + #[serde(default)] + strengths: Vec, + #[serde(default)] + weaknesses: Vec, + #[serde(default)] + next_evidence: Vec, + #[serde(default)] + source_provenance: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardMetrics { + retrieval: ScoreboardRetrievalMetrics, + lifecycle: ScoreboardLifecycleMetrics, + 
answer_safety: ScoreboardAnswerSafetyMetrics, + operations: ScoreboardOperationalMetrics, + coverage: ScoreboardCoverageMetrics, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardRetrievalMetrics { + k: usize, + metric_basis: String, + recall_at_k: Option, + precision_at_k: Option, + mrr: Option, + ndcg: Option, + expected_evidence_recall: Option, + citation_source_ref_coverage: Option, + expected_evidence_matched: usize, + expected_evidence_total: usize, + produced_evidence_total: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardLifecycleMetrics { + stale_suppression: Option, + stale_suppressed_count: usize, + stale_check_count: usize, + update_correctness: Option, + update_correct_count: usize, + update_check_count: usize, + delete_correctness: Option, + delete_correct_count: usize, + delete_check_count: usize, + rollback_history_readback_rate: Option, + rollback_history_readback_count: usize, + rollback_history_check_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardAnswerSafetyMetrics { + unsupported_claim_rate: Option, + unsupported_claim_count: usize, + stale_answer_rate: Option, + stale_answer_count: usize, + hallucinated_evidence_rate: Option, + redaction_leak_count: usize, + irrelevant_context_ratio: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardOperationalMetrics { + mean_latency_ms: Option, + total_cost: Option, + resource_envelope_status: String, + resource_envelope_job_count: usize, + resource_envelope_pass_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct ScoreboardCoverageMetrics { + job_count: usize, + encoded_suite_count: usize, + pass_count: usize, + typed_non_pass_count: usize, + source_ref_coverage: Option, + evidence_coverage: Option, + evidence_class: String, } #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -2079,6 +2183,13 @@ struct 
JobMetrics { qdrant_rebuild_case: bool, } +struct ScoreboardRankedMetrics { + relevant_at_k: usize, + precision_denominator_at_k: usize, + reciprocal_rank: f64, + ndcg: f64, +} + #[derive(Debug, Subcommand)] #[command(rename_all = "kebab")] enum Command { @@ -4187,7 +4298,7 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result &ProducedAnswer { } fn produced_evidence_ids(answer: &ProducedAnswer) -> BTreeSet { - let mut evidence = answer.evidence_ids.iter().cloned().collect::>(); + ordered_produced_evidence_ids(answer).into_iter().collect() +} +fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { + let mut seen = BTreeSet::new(); + let mut evidence = Vec::new(); + + for evidence_id in &answer.evidence_ids { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } for claim in &answer.claims { - evidence.extend(claim.evidence_ids.iter().cloned()); + for evidence_id in &claim.evidence_ids { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } for brief in &answer.proactive_briefs { for suggestion in &brief.suggestions { - evidence.extend(suggestion.evidence_refs.iter().cloned()); + for evidence_id in &suggestion.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } } for task in &answer.scheduled_tasks { for output in &task.outputs { - evidence.extend(output.evidence_refs.iter().cloned()); + for evidence_id in &output.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } } for readback in &answer.work_journal_readbacks { for entry in &readback.items { - evidence.extend(entry.source_refs.iter().cloned()); - + for evidence_id in &entry.source_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } for step in entry.explicit_next_steps.iter().chain(entry.inferred_next_steps.iter()) { - evidence.extend(step.evidence_refs.iter().cloned()); + for evidence_id in &step.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, 
evidence_id); + } } for option in &entry.rejected_options { - evidence.extend(option.evidence_refs.iter().cloned()); + for evidence_id in &option.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } } if let Some(where_stopped) = &readback.where_stopped { - evidence.extend(where_stopped.decision_rationale_evidence_ids.iter().cloned()); - evidence.extend(where_stopped.handoff_source_refs.iter().cloned()); + for evidence_id in &where_stopped.decision_rationale_evidence_ids { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &where_stopped.handoff_source_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } for candidate in &readback.janitor_candidates { - evidence.extend(candidate.evidence_refs.iter().cloned()); + for evidence_id in &candidate.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } } for drill in &answer.recovery_drills { - evidence.extend(drill.backup_pitr.evidence_refs.iter().cloned()); - evidence.extend(drill.degraded_read.evidence_refs.iter().cloned()); - evidence.extend(drill.rpo.evidence_refs.iter().cloned()); - evidence.extend(drill.rto.evidence_refs.iter().cloned()); - evidence.extend(drill.outbox_replay.evidence_refs.iter().cloned()); - evidence.extend(drill.qdrant_rebuild.evidence_refs.iter().cloned()); - evidence.extend(drill.migration_repair.evidence_refs.iter().cloned()); - evidence.extend(drill.dead_letter.evidence_refs.iter().cloned()); - + for evidence_id in &drill.backup_pitr.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &drill.degraded_read.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &drill.rpo.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &drill.rto.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + 
for evidence_id in &drill.outbox_replay.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &drill.qdrant_rebuild.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &drill.migration_repair.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } + for evidence_id in &drill.dead_letter.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } for injection in &drill.failure_injections { - evidence.extend(injection.evidence_refs.iter().cloned()); + for evidence_id in &injection.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } for count in &drill.authority_record_counts { - evidence.extend(count.evidence_refs.iter().cloned()); + for evidence_id in &count.evidence_refs { + push_ordered_evidence(&mut evidence, &mut seen, evidence_id); + } } } evidence } +fn push_ordered_evidence( + evidence: &mut Vec, + seen: &mut BTreeSet, + evidence_id: &str, +) { + if seen.insert(evidence_id.to_string()) { + evidence.push(evidence_id.to_string()); + } +} + fn missing_required_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { job.expected_answer .must_include @@ -6871,13 +7036,15 @@ fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary { } fn scoreboard_report( - jobs: &[JobReport], + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, external_adapters: &ExternalAdapterSection, ) -> ScoreboardReport { let job_typed_non_pass_count = - jobs.iter().filter(|job| job.status != TypedStatus::Pass).count(); + job_reports.iter().filter(|job| job.status != TypedStatus::Pass).count(); let external_typed_non_pass_count = external_typed_non_pass_count(&external_adapters.summary); - let job_typed_non_pass_states_present = typed_non_pass_states_present(jobs); + let job_typed_non_pass_states_present = 
typed_non_pass_states_present(job_reports); let external_adapter_typed_non_pass_states_present = external_typed_non_pass_states_present(&external_adapters.summary); let mut typed_non_pass_states_present = job_typed_non_pass_states_present.clone(); @@ -6892,18 +7059,797 @@ fn scoreboard_report( schema: SCOREBOARD_SCHEMA.to_string(), result_states: SCOREBOARD_RESULT_STATES.iter().map(ToString::to_string).collect(), evidence_classes: SCOREBOARD_EVIDENCE_CLASSES.iter().map(ToString::to_string).collect(), + metric_basis: "produced_evidence_order".to_string(), + retrieval_k: SCOREBOARD_RETRIEVAL_K, job_typed_non_pass_count, job_typed_non_pass_states_present, - job_summary_claim: scoreboard_summary_claim(jobs, job_typed_non_pass_count).to_string(), + job_summary_claim: scoreboard_summary_claim(job_reports, job_typed_non_pass_count).to_string(), external_adapter_typed_non_pass_count: external_typed_non_pass_count, external_adapter_typed_non_pass_states_present, typed_non_pass_count, typed_non_pass_states_present, evidence_class_counts: scoreboard_evidence_class_counts(external_adapters), - summary_claim: scoreboard_summary_claim(jobs, typed_non_pass_count).to_string(), + summary_claim: scoreboard_summary_claim(job_reports, typed_non_pass_count).to_string(), unqualified_win_claim_allowed: false, claim_boundary: "Typed non-pass states and non-live evidence classes must remain visible; reports must not collapse them into unqualified wins.".to_string(), + rows: scoreboard_rows(raw_jobs, job_reports, summary, external_adapters), + optimization_roadmap: scoreboard_optimization_roadmap(), + } +} + +fn scoreboard_rows( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, + external_adapters: &ExternalAdapterSection, +) -> Vec { + let mut rows = vec![elf_scoreboard_row(raw_jobs, job_reports, summary)]; + + rows.extend(external_project_scoreboard_rows(&external_adapters.adapters)); + + rows +} + +fn elf_scoreboard_row( + raw_jobs: &[RealWorldJob], + 
job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardRow { + let source_id_mapped = + summary.source_ref_required_count > 0 && summary.source_ref_coverage >= 1.0; + let result_state = aggregate_job_report_state(job_reports); + let metrics = scoreboard_metrics_for_reports(raw_jobs, job_reports, summary); + let typed_non_pass_count = + job_reports.iter().filter(|job| job.status != TypedStatus::Pass).count(); + let mut row = ScoreboardRow { + product_id: "elf_current_report".to_string(), + product_name: "ELF".to_string(), + row_source: "current_real_world_job_report".to_string(), + evidence_class: "fixture_backed".to_string(), + result_state, + comparable: false, + same_corpus: true, + source_id_mapped, + held_out: jobs_have_tag(raw_jobs, "held_out"), + leakage_audited: jobs_have_tag(raw_jobs, "leakage_audited"), + product_runtime: false, + container_digest_identified: false, + metrics, + strengths: elf_scoreboard_strengths(summary), + weaknesses: Vec::new(), + next_evidence: Vec::new(), + source_provenance: vec![ + "apps/elf-eval/fixtures/real_world_memory/".to_string(), + "apps/elf-eval/src/bin/real_world_job_benchmark.rs".to_string(), + ], + }; + + if typed_non_pass_count > 0 { + row.weaknesses + .push(format!("{typed_non_pass_count} encoded job row(s) are typed non-pass.")); + } + + scoreboard_apply_comparability_gaps(&mut row); + + row +} + +fn aggregate_job_report_state(job_reports: &[JobReport]) -> String { + if job_reports.is_empty() { + return "not_tested".to_string(); + } + + let refs = job_reports.iter().collect::>(); + + scoreboard_result_state(aggregate_status(&refs)).to_string() +} + +fn jobs_have_tag(jobs: &[RealWorldJob], tag: &str) -> bool { + !jobs.is_empty() && jobs.iter().all(|job| job.tags.iter().any(|candidate| candidate == tag)) +} + +fn scoreboard_metrics_for_reports( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardMetrics { + ScoreboardMetrics { + retrieval: 
scoreboard_retrieval_metrics(job_reports, summary), + lifecycle: scoreboard_lifecycle_metrics(raw_jobs, job_reports), + answer_safety: scoreboard_answer_safety_metrics(summary), + operations: scoreboard_operational_metrics(raw_jobs, job_reports, summary), + coverage: ScoreboardCoverageMetrics { + job_count: summary.job_count, + encoded_suite_count: summary.encoded_suite_count, + pass_count: summary.pass, + typed_non_pass_count: job_reports + .iter() + .filter(|job| job.status != TypedStatus::Pass) + .count(), + source_ref_coverage: Some(summary.source_ref_coverage), + evidence_coverage: Some(summary.evidence_coverage), + evidence_class: "fixture_backed".to_string(), + }, + } +} + +fn scoreboard_retrieval_metrics( + job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardRetrievalMetrics { + let produced_evidence_total = + job_reports.iter().map(|job| job.retrieval_quality.produced_evidence_total).sum(); + let mut relevant_at_k = 0; + let mut precision_denominator_at_k = 0; + let mut reciprocal_rank_sum = 0.0; + let mut ndcg_sum = 0.0; + let mut ranked_job_count = 0; + + for job in job_reports { + let expected = job + .expected_evidence + .iter() + .map(|evidence| evidence.evidence_id.as_str()) + .collect::>(); + let ranked = scoreboard_ranked_metrics_for_job(job, &expected); + + relevant_at_k += ranked.relevant_at_k; + precision_denominator_at_k += ranked.precision_denominator_at_k; + reciprocal_rank_sum += ranked.reciprocal_rank; + ndcg_sum += ranked.ndcg; + ranked_job_count += 1; + } + + ScoreboardRetrievalMetrics { + k: SCOREBOARD_RETRIEVAL_K, + metric_basis: "produced_evidence_order".to_string(), + recall_at_k: Some(ratio_or(relevant_at_k, summary.expected_evidence_total, 1.0)), + precision_at_k: Some(ratio_or(relevant_at_k, precision_denominator_at_k, 1.0)), + mrr: Some(scoreboard_mean_metric(reciprocal_rank_sum, ranked_job_count)), + ndcg: Some(scoreboard_mean_metric(ndcg_sum, ranked_job_count)), + expected_evidence_recall: 
Some(summary.expected_evidence_recall), + citation_source_ref_coverage: Some(summary.source_ref_coverage), + expected_evidence_matched: summary.expected_evidence_matched, + expected_evidence_total: summary.expected_evidence_total, + produced_evidence_total, + } +} + +fn scoreboard_ranked_metrics_for_job( + job: &JobReport, + expected: &BTreeSet<&str>, +) -> ScoreboardRankedMetrics { + let precision_denominator_at_k = SCOREBOARD_RETRIEVAL_K; + let relevant_at_k = job + .produced_evidence + .iter() + .take(SCOREBOARD_RETRIEVAL_K) + .filter(|evidence_id| expected.contains(evidence_id.as_str())) + .count(); + let reciprocal_rank = job + .produced_evidence + .iter() + .position(|evidence_id| expected.contains(evidence_id.as_str())) + .map_or_else(|| f64::from(expected.is_empty()), |index| 1.0 / (index + 1) as f64); + let ndcg = scoreboard_ndcg(job.produced_evidence.as_slice(), expected); + + ScoreboardRankedMetrics { relevant_at_k, precision_denominator_at_k, reciprocal_rank, ndcg } +} + +fn scoreboard_ndcg(produced_evidence: &[String], expected: &BTreeSet<&str>) -> f64 { + if expected.is_empty() { + return 1.0; + } + + let dcg = produced_evidence + .iter() + .take(SCOREBOARD_RETRIEVAL_K) + .enumerate() + .filter(|(_, evidence_id)| expected.contains(evidence_id.as_str())) + .map(|(index, _)| 1.0 / ((index + 2) as f64).log2()) + .sum::(); + let ideal_hits = expected.len().min(SCOREBOARD_RETRIEVAL_K); + let idcg = (0..ideal_hits).map(|index| 1.0 / ((index + 2) as f64).log2()).sum::(); + + if idcg > 0.0 { dcg / idcg } else { 0.0 } +} + +fn scoreboard_mean_metric(sum: f64, count: usize) -> f64 { + if count == 0 { 1.0 } else { round3(sum / count as f64) } +} + +fn scoreboard_lifecycle_metrics( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], +) -> ScoreboardLifecycleMetrics { + let stale_check_count: usize = raw_jobs + .iter() + .map(|job| { + job.negative_traps + .iter() + .filter(|trap| trap.failure_if_used && trap.trap_type == "stale_fact") + .count() + }) + 
.sum(); + let stale_failure_count = job_reports + .iter() + .map(|job| job.stale_answer_count + job.stale_retrieval_count) + .sum::(); + let update_check_count = scoreboard_lifecycle_check_count(raw_jobs, scoreboard_is_update_job); + let update_correct_count = + scoreboard_lifecycle_correct_count(raw_jobs, job_reports, scoreboard_is_update_job); + let delete_check_count = scoreboard_lifecycle_check_count(raw_jobs, scoreboard_is_delete_job); + let delete_correct_count = + scoreboard_lifecycle_correct_count(raw_jobs, job_reports, scoreboard_is_delete_job); + let rollback_history_check_count = + scoreboard_lifecycle_check_count(raw_jobs, scoreboard_is_rollback_history_job); + let rollback_history_readback_count = raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, report)| { + scoreboard_is_rollback_history_job(job) && report.status == TypedStatus::Pass + }) + .count(); + + ScoreboardLifecycleMetrics { + stale_suppression: Some(ratio_or( + stale_check_count.saturating_sub(stale_failure_count), + stale_check_count, + 1.0, + )), + stale_suppressed_count: stale_check_count.saturating_sub(stale_failure_count), + stale_check_count, + update_correctness: Some(ratio_or(update_correct_count, update_check_count, 1.0)), + update_correct_count, + update_check_count, + delete_correctness: Some(ratio_or(delete_correct_count, delete_check_count, 1.0)), + delete_correct_count, + delete_check_count, + rollback_history_readback_rate: Some(ratio_or( + rollback_history_readback_count, + rollback_history_check_count, + 1.0, + )), + rollback_history_readback_count, + rollback_history_check_count, + } +} + +fn scoreboard_lifecycle_check_count( + jobs: &[RealWorldJob], + predicate: fn(&RealWorldJob) -> bool, +) -> usize { + jobs.iter().filter(|job| predicate(job)).count() +} + +fn scoreboard_lifecycle_correct_count( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + predicate: fn(&RealWorldJob) -> bool, +) -> usize { + raw_jobs + .iter() + .zip(job_reports.iter()) + 
.filter(|(job, report)| predicate(job) && report.status == TypedStatus::Pass) + .count() +} + +fn scoreboard_is_update_job(job: &RealWorldJob) -> bool { + scoreboard_has_any_tag( + job, + &["update", "correction_persistence", "current_authority", "conflicting_source_authority"], + ) +} + +fn scoreboard_is_delete_job(job: &RealWorldJob) -> bool { + scoreboard_has_any_tag(job, &["delete", "ttl", "tombstone"]) +} + +fn scoreboard_is_rollback_history_job(job: &RealWorldJob) -> bool { + scoreboard_has_any_tag(job, &["rollback", "correction_persistence"]) +} + +fn scoreboard_has_any_tag(job: &RealWorldJob, tags: &[&str]) -> bool { + job.tags.iter().any(|tag| tags.contains(&tag.as_str())) +} + +fn scoreboard_answer_safety_metrics(summary: &ReportSummary) -> ScoreboardAnswerSafetyMetrics { + ScoreboardAnswerSafetyMetrics { + unsupported_claim_rate: Some(ratio(summary.unsupported_claim_count, summary.job_count)), + unsupported_claim_count: summary.unsupported_claim_count, + stale_answer_rate: Some(ratio(summary.stale_answer_count, summary.job_count)), + stale_answer_count: summary.stale_answer_count, + hallucinated_evidence_rate: Some(summary.irrelevant_context_ratio), + redaction_leak_count: summary.redaction_leak_count, + irrelevant_context_ratio: Some(summary.irrelevant_context_ratio), + } +} + +fn scoreboard_operational_metrics( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardOperationalMetrics { + let resource_envelope_job_count = + raw_jobs.iter().filter(|job| scoreboard_has_any_tag(job, &["resource_envelope"])).count(); + let resource_envelope_pass_count = raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, report)| { + scoreboard_has_any_tag(job, &["resource_envelope"]) + && report.status == TypedStatus::Pass + }) + .count(); + + ScoreboardOperationalMetrics { + mean_latency_ms: summary.mean_latency_ms, + total_cost: summary.total_cost.clone(), + resource_envelope_status: if resource_envelope_job_count 
== resource_envelope_pass_count { + "pass".to_string() + } else { + "typed_non_pass_present".to_string() + }, + resource_envelope_job_count, + resource_envelope_pass_count, + } +} + +fn elf_scoreboard_strengths(summary: &ReportSummary) -> Vec { + let mut strengths = Vec::new(); + + if summary.expected_evidence_recall >= 1.0 { + strengths.push("Expected evidence recall is complete for encoded jobs.".to_string()); + } + if summary.source_ref_coverage >= 1.0 { + strengths + .push("Source-ref coverage is complete for encoded required evidence.".to_string()); + } + if summary.stale_answer_count == 0 && summary.stale_retrieval_count == 0 { + strengths.push("Encoded stale-answer and stale-retrieval counters are zero.".to_string()); + } + if summary.redaction_leak_count == 0 { + strengths.push("Encoded redaction leak count is zero.".to_string()); } + if summary.work_continuity.is_some() { + strengths.push("Work Continuity readback metrics are encoded in the report.".to_string()); + } + + strengths +} + +fn external_project_scoreboard_rows(adapters: &[ExternalAdapterReport]) -> Vec { + let mut by_project: BTreeMap> = BTreeMap::new(); + + for adapter in adapters.iter().filter(|adapter| adapter.project != "ELF") { + by_project.entry(adapter.project.clone()).or_default().push(adapter); + } + + by_project + .into_iter() + .map(|(project, adapters)| external_project_scoreboard_row(project, adapters.as_slice())) + .collect() +} + +fn external_project_scoreboard_row( + project: String, + adapters: &[&ExternalAdapterReport], +) -> ScoreboardRow { + let evidence_class = strongest_scoreboard_evidence_class(adapters); + let result_state = external_project_result_state(adapters); + let source_id_mapped = external_project_source_id_mapped(adapters); + let same_corpus = external_project_same_corpus(adapters); + let product_runtime = + adapters.iter().any(|adapter| adapter.evidence_class == "live_real_world"); + let container_digest_identified = + adapters.iter().any(|adapter| 
adapter_has_container_digest(adapter)); + let typed_non_pass_count = + adapters.iter().map(|adapter| adapter_typed_non_pass_count(adapter)).sum(); + let mut row = ScoreboardRow { + product_id: scoreboard_project_id(project.as_str()), + product_name: project, + row_source: "external_adapter_manifest".to_string(), + evidence_class: evidence_class.clone(), + result_state, + comparable: false, + same_corpus, + source_id_mapped, + held_out: false, + leakage_audited: false, + product_runtime, + container_digest_identified, + metrics: external_project_scoreboard_metrics( + adapters, + evidence_class.as_str(), + typed_non_pass_count, + ), + strengths: external_project_strengths(adapters), + weaknesses: external_project_weaknesses(adapters), + next_evidence: Vec::new(), + source_provenance: external_project_source_provenance(adapters), + }; + + scoreboard_apply_comparability_gaps(&mut row); + + row +} + +fn external_project_scoreboard_metrics( + adapters: &[&ExternalAdapterReport], + evidence_class: &str, + typed_non_pass_count: usize, +) -> ScoreboardMetrics { + let pass_count = adapters + .iter() + .flat_map(|adapter| adapter.suites.iter()) + .filter(|suite| suite.status == AdapterCoverageStatus::Pass) + .count(); + let suite_count = adapters.iter().map(|adapter| adapter.suites.len()).sum(); + + ScoreboardMetrics { + retrieval: ScoreboardRetrievalMetrics { + k: SCOREBOARD_RETRIEVAL_K, + metric_basis: "external_adapter_manifest_no_ordered_evidence".to_string(), + ..ScoreboardRetrievalMetrics::default() + }, + coverage: ScoreboardCoverageMetrics { + job_count: 0, + encoded_suite_count: suite_count, + pass_count, + typed_non_pass_count, + source_ref_coverage: None, + evidence_coverage: None, + evidence_class: evidence_class.to_string(), + }, + ..ScoreboardMetrics::default() + } +} + +fn strongest_scoreboard_evidence_class(adapters: &[&ExternalAdapterReport]) -> String { + for evidence_class in ["live_real_world", "live_baseline", "fixture_backed", "research_gate"] { + if 
adapters.iter().any(|adapter| { + scoreboard_evidence_class(adapter.evidence_class.as_str()) == evidence_class + }) { + return evidence_class.to_string(); + } + } + + "research_gate".to_string() +} + +fn external_project_result_state(adapters: &[&ExternalAdapterReport]) -> String { + for status in [ + AdapterCoverageStatus::WrongResult, + AdapterCoverageStatus::Blocked, + AdapterCoverageStatus::Incomplete, + AdapterCoverageStatus::LifecycleFail, + AdapterCoverageStatus::NotEncoded, + AdapterCoverageStatus::Unsupported, + ] { + if adapters.iter().any(|adapter| adapter_has_status(adapter, status)) { + return adapter_status_to_scoreboard_state(status).to_string(); + } + } + + "not_comparable".to_string() +} + +fn adapter_has_status(adapter: &ExternalAdapterReport, status: AdapterCoverageStatus) -> bool { + adapter.overall_status == status + || adapter.setup.status == status + || adapter.run.status == status + || adapter.result.status == status + || adapter.capabilities.iter().any(|capability| capability.status == status) + || adapter.suites.iter().any(|suite| suite.status == status) + || adapter.scenarios.iter().any(|scenario| scenario.status == status) +} + +fn external_project_same_corpus(adapters: &[&ExternalAdapterReport]) -> bool { + let needles = &["same-corpus", "same corpus", "same_corpus", "shared corpus"]; + + adapters.iter().any(|adapter| { + text_mentions_any(adapter.adapter_kind.as_str(), needles) + || adapter_has_reported_same_corpus_text(adapter, needles) + }) +} + +fn external_project_source_id_mapped(adapters: &[&ExternalAdapterReport]) -> bool { + let needles = &[ + "source-id mapped", + "source ids mapped", + "maps to source ids", + "mapped to source ids", + "maps back to source ids", + "map to generated evidence ids", + "mapped to generated evidence ids", + "evidence ids match", + ]; + + adapters.iter().any(|adapter| adapter_has_passing_text(adapter, needles)) +} + +fn adapter_has_passing_text(adapter: &ExternalAdapterReport, needles: &[&str]) -> 
bool { + adapter_status_mentions_any(adapter.setup.status, adapter.setup.evidence.as_str(), needles) + || adapter_status_mentions_any(adapter.run.status, adapter.run.evidence.as_str(), needles) + || adapter_status_mentions_any( + adapter.result.status, + adapter.result.evidence.as_str(), + needles, + ) || adapter.capabilities.iter().any(|capability| { + adapter_status_mentions_any(capability.status, capability.capability.as_str(), needles) + || adapter_status_mentions_any(capability.status, capability.evidence.as_str(), needles) + }) || adapter.suites.iter().any(|suite| { + adapter_status_mentions_any(suite.status, suite.suite_id.as_str(), needles) + || adapter_status_mentions_any(suite.status, suite.evidence.as_str(), needles) + }) || adapter.scenarios.iter().any(|scenario| { + adapter_status_mentions_any(scenario.status, scenario.scenario_id.as_str(), needles) + || adapter_status_mentions_any(scenario.status, scenario.evidence.as_str(), needles) + }) +} + +fn adapter_has_reported_same_corpus_text( + adapter: &ExternalAdapterReport, + needles: &[&str], +) -> bool { + adapter_status_reports_same_corpus( + adapter.setup.status, + adapter.setup.evidence.as_str(), + needles, + ) || adapter_status_reports_same_corpus( + adapter.run.status, + adapter.run.evidence.as_str(), + needles, + ) || adapter_status_reports_same_corpus( + adapter.result.status, + adapter.result.evidence.as_str(), + needles, + ) || adapter.capabilities.iter().any(|capability| { + adapter_status_reports_same_corpus( + capability.status, + capability.capability.as_str(), + needles, + ) || adapter_status_reports_same_corpus( + capability.status, + capability.evidence.as_str(), + needles, + ) + }) || adapter.suites.iter().any(|suite| { + adapter_status_reports_same_corpus(suite.status, suite.suite_id.as_str(), needles) + || adapter_status_reports_same_corpus(suite.status, suite.evidence.as_str(), needles) + }) || adapter.scenarios.iter().any(|scenario| { + 
adapter_status_reports_same_corpus(scenario.status, scenario.scenario_id.as_str(), needles) + || adapter_status_reports_same_corpus( + scenario.status, + scenario.evidence.as_str(), + needles, + ) + }) +} + +fn adapter_status_reports_same_corpus( + status: AdapterCoverageStatus, + text: &str, + needles: &[&str], +) -> bool { + matches!( + status, + AdapterCoverageStatus::Pass + | AdapterCoverageStatus::Real + | AdapterCoverageStatus::WrongResult + | AdapterCoverageStatus::LifecycleFail + ) && text_mentions_any(text, needles) +} + +fn adapter_status_mentions_any( + status: AdapterCoverageStatus, + text: &str, + needles: &[&str], +) -> bool { + matches!(status, AdapterCoverageStatus::Pass | AdapterCoverageStatus::Real) + && text_mentions_any(text, needles) +} + +fn text_mentions_any(text: &str, needles: &[&str]) -> bool { + let text = text.to_ascii_lowercase(); + + needles.iter().any(|needle| text.contains(&needle.to_ascii_lowercase())) +} + +fn adapter_status_to_scoreboard_state(status: AdapterCoverageStatus) -> &'static str { + match status { + AdapterCoverageStatus::WrongResult | AdapterCoverageStatus::LifecycleFail => "wrong_result", + AdapterCoverageStatus::Blocked => "blocked", + AdapterCoverageStatus::Incomplete => "incomplete", + AdapterCoverageStatus::NotEncoded | AdapterCoverageStatus::Unsupported => "not_encoded", + AdapterCoverageStatus::Real + | AdapterCoverageStatus::Mocked + | AdapterCoverageStatus::Pass => "not_comparable", + } +} + +fn adapter_typed_non_pass_count(adapter: &ExternalAdapterReport) -> usize { + let direct_statuses = + [adapter.overall_status, adapter.setup.status, adapter.run.status, adapter.result.status]; + let direct = direct_statuses + .into_iter() + .filter(|status| adapter_status_is_typed_non_pass(*status)) + .count(); + let capability = adapter + .capabilities + .iter() + .filter(|capability| adapter_status_is_typed_non_pass(capability.status)) + .count(); + let suites = adapter + .suites + .iter() + .filter(|suite| 
adapter_status_is_typed_non_pass(suite.status)) + .count(); + let scenarios = adapter + .scenarios + .iter() + .filter(|scenario| adapter_status_is_typed_non_pass(scenario.status)) + .count(); + + direct + capability + suites + scenarios +} + +fn adapter_status_is_typed_non_pass(status: AdapterCoverageStatus) -> bool { + matches!( + status, + AdapterCoverageStatus::Unsupported + | AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::WrongResult + | AdapterCoverageStatus::LifecycleFail + | AdapterCoverageStatus::NotEncoded + ) +} + +fn adapter_has_container_digest(adapter: &ExternalAdapterReport) -> bool { + adapter.setup.evidence.contains("sha256:") + || adapter.run.evidence.contains("sha256:") + || adapter.result.evidence.contains("sha256:") + || adapter.evidence.iter().any(|evidence| { + evidence.reference.contains("sha256:") || evidence.reference.contains("digest") + }) +} + +fn external_project_strengths(adapters: &[&ExternalAdapterReport]) -> Vec { + let mut strengths = BTreeSet::new(); + + for adapter in adapters { + for capability in &adapter.capabilities { + if matches!( + capability.status, + AdapterCoverageStatus::Pass | AdapterCoverageStatus::Real + ) { + strengths.insert(format!( + "{} capability is {}.", + capability.capability, + adapter_status_str(capability.status) + )); + } + } + for scenario in &adapter.scenarios { + if scenario_comparison_outcome(scenario) == ScenarioComparisonOutcome::Loss { + strengths.insert(format!( + "Scenario {} is recorded as a competitor strength.", + scenario.scenario_id + )); + } + } + } + + strengths.into_iter().take(6).collect() +} + +fn external_project_weaknesses(adapters: &[&ExternalAdapterReport]) -> Vec { + let mut weaknesses = BTreeSet::new(); + + for adapter in adapters { + if adapter.overall_status != AdapterCoverageStatus::Pass { + weaknesses.insert(format!( + "Adapter {} overall status is {}.", + adapter.adapter_id, + adapter_status_str(adapter.overall_status) + )); 
+ } + + for suite in &adapter.suites { + if adapter_status_is_typed_non_pass(suite.status) { + weaknesses.insert(format!( + "Suite {} is {}.", + suite.suite_id, + adapter_status_str(suite.status) + )); + } + } + } + + weaknesses.into_iter().take(8).collect() +} + +fn external_project_source_provenance(adapters: &[&ExternalAdapterReport]) -> Vec { + let mut provenance = BTreeSet::new(); + + for adapter in adapters { + for evidence in &adapter.evidence { + provenance.insert(evidence.reference.clone()); + } + for artifact in [&adapter.setup.artifact, &adapter.run.artifact, &adapter.result.artifact] + .into_iter() + .flatten() + { + provenance.insert(artifact.clone()); + } + } + + provenance.into_iter().take(12).collect() +} + +fn scoreboard_project_id(project: &str) -> String { + project + .chars() + .map(|ch| if ch.is_ascii_alphanumeric() { ch.to_ascii_lowercase() } else { '_' }) + .collect::() + .split('_') + .filter(|part| !part.is_empty()) + .collect::>() + .join("_") +} + +fn scoreboard_apply_comparability_gaps(row: &mut ScoreboardRow) { + if !row.same_corpus { + row.next_evidence.push("Map this product to the same corpus.".to_string()); + } + if !row.source_id_mapped { + row.next_evidence.push("Map returned evidence to stable source ids.".to_string()); + } + if !row.held_out { + row.next_evidence.push("Publish a held-out split for this row.".to_string()); + } + if !row.leakage_audited { + row.next_evidence.push("Publish leakage-audit evidence for this row.".to_string()); + } + if !row.product_runtime { + row.next_evidence + .push("Run a Docker-contained product-runtime adapter for this row.".to_string()); + } + if !row.container_digest_identified { + row.next_evidence.push("Record container image digest evidence.".to_string()); + } + if row.result_state != "pass" { + row.next_evidence + .push("Resolve typed non-pass state before claiming a comparable pass.".to_string()); + } + + row.comparable = row.same_corpus + && row.source_id_mapped + && row.held_out + && 
row.leakage_audited + && row.product_runtime + && row.container_digest_identified + && row.result_state == "pass" + && row.metrics.retrieval.recall_at_k.is_some() + && row.metrics.retrieval.precision_at_k.is_some() + && row.metrics.retrieval.mrr.is_some() + && row.metrics.retrieval.ndcg.is_some(); + + if !row.comparable && row.result_state == "pass" { + row.result_state = "not_comparable".to_string(); + } + if !row.comparable { + row.weaknesses + .push("This row is not a comparable product-runtime scoreboard pass.".to_string()); + } +} + +fn scoreboard_optimization_roadmap() -> Vec { + vec![ + "Capture Docker image digests and runtime metadata for product-runtime rows.".to_string(), + "Add held-out and leakage-audit manifests before broad competitor comparisons.".to_string(), + "Promote external adapters from typed blockers to same-corpus source-id-mapped runtime rows only after they emit comparable evidence.".to_string(), + "Use row-level metrics for optimization direction; do not claim a universal leaderboard.".to_string(), + ] } fn typed_non_pass_states_present(jobs: &[JobReport]) -> Vec { @@ -8556,6 +9502,11 @@ fn render_markdown_scoreboard(out: &mut String, report: &RealWorldReport) { "- Evidence classes: `{}`\n", md_inline(report.scoreboard.evidence_classes.join(", ").as_str()) )); + out.push_str(&format!( + "- Metric basis: `{}` at k=`{}`\n", + md_inline(report.scoreboard.metric_basis.as_str()), + report.scoreboard.retrieval_k + )); out.push_str(&format!( "- Summary claim: `{}`\n", md_inline(report.scoreboard.summary_claim.as_str()) @@ -8598,6 +9549,40 @@ fn render_markdown_scoreboard(out: &mut String, report: &RealWorldReport) { "- Claim boundary: {}\n\n", md_cell(report.scoreboard.claim_boundary.as_str()) )); + out.push_str("| Product | State | Evidence | Comparable | Runtime Gates | Recall@k | Precision@k | MRR | nDCG | Stale Suppression | Update/Delete | Source Refs | Latency | Next Evidence |\n"); + out.push_str( + "| --- | --- | --- | --- | --- | ---: 
| ---: | ---: | ---: | ---: | --- | ---: | --- | --- |\n", + ); + + for row in &report.scoreboard.rows { + out.push_str(&format!( + "| {} | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n", + md_cell(row.product_name.as_str()), + md_inline(row.result_state.as_str()), + md_inline(row.evidence_class.as_str()), + row.comparable, + scoreboard_runtime_gate_cell(row), + scoreboard_optional_f64(row.metrics.retrieval.recall_at_k), + scoreboard_optional_f64(row.metrics.retrieval.precision_at_k), + scoreboard_optional_f64(row.metrics.retrieval.mrr), + scoreboard_optional_f64(row.metrics.retrieval.ndcg), + scoreboard_optional_f64(row.metrics.lifecycle.stale_suppression), + scoreboard_update_delete_cell(row), + scoreboard_optional_f64(row.metrics.coverage.source_ref_coverage), + scoreboard_latency_cell(row), + md_cell(scoreboard_list_cell(&row.next_evidence).as_str()) + )); + } + + if !report.scoreboard.optimization_roadmap.is_empty() { + out.push_str("\nOptimization direction:\n"); + + for item in &report.scoreboard.optimization_roadmap { + out.push_str(&format!("- {}\n", md_cell(item.as_str()))); + } + + out.push('\n'); + } } fn render_markdown_operational_evidence(out: &mut String, report: &RealWorldReport) { @@ -9886,6 +10871,45 @@ fn scoreboard_evidence_class_count_display(scoreboard: &ScoreboardReport) -> Str .join(", ") } +fn scoreboard_optional_f64(value: Option) -> String { + value.map_or_else(|| "`n/a`".to_string(), |value| format!("`{}`", round3(value))) +} + +fn scoreboard_optional_f64_plain(value: Option) -> String { + value.map_or_else(|| "n/a".to_string(), |value| round3(value).to_string()) +} + +fn scoreboard_runtime_gate_cell(row: &ScoreboardRow) -> String { + format!( + "`same_corpus={}`
`source_ids={}`
`held_out={}`
`leakage={}`
`runtime={}`
`digest={}`", + row.same_corpus, + row.source_id_mapped, + row.held_out, + row.leakage_audited, + row.product_runtime, + row.container_digest_identified + ) +} + +fn scoreboard_update_delete_cell(row: &ScoreboardRow) -> String { + format!( + "`update={}`
`delete={}`", + scoreboard_optional_f64_plain(row.metrics.lifecycle.update_correctness), + scoreboard_optional_f64_plain(row.metrics.lifecycle.delete_correctness) + ) +} + +fn scoreboard_latency_cell(row: &ScoreboardRow) -> String { + row.metrics + .operations + .mean_latency_ms + .map_or_else(|| "`n/a`".to_string(), |latency| format!("`{} ms`", round3(latency))) +} + +fn scoreboard_list_cell(values: &[String]) -> String { + if values.is_empty() { "none".to_string() } else { values.join("; ") } +} + fn status_str(status: TypedStatus) -> &'static str { match status { TypedStatus::Pass => "pass", diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 712965b8..8d950cdb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -625,7 +625,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(23) + Some(26) ); assert_eq!( report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), @@ -633,7 +633,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { ); assert_eq!( report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), - Some(11) + Some(14) ); let jobs = array_at(&report, "/jobs")?; @@ -792,6 +792,7 @@ fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { "blocked", "not_tested", "not_encoded", + "not_comparable", "unsupported_claim", ] .map(str::to_owned) @@ -814,11 +815,11 @@ fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { ); assert_eq!( report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), - Some(220) + Some(240) ); assert_eq!( 
report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), - Some(220) + Some(240) ); assert_eq!( string_array_at(&report, "/scoreboard/job_typed_non_pass_states_present")?, @@ -842,6 +843,13 @@ fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { report.pointer("/scoreboard/evidence_class_counts/live_baseline").and_then(Value::as_u64), Some(6) ); + assert_eq!( + report.pointer("/scoreboard/metric_basis").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!(report.pointer("/scoreboard/retrieval_k").and_then(Value::as_u64), Some(5)); + + assert_scoreboard_rows_expose_quantitative_and_blocker_contract(&report)?; let suites = array_at(&report, "/suites")?; let adversarial = find_by_field(suites, "/suite_id", "adversarial_quality")?; @@ -852,6 +860,107 @@ fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { Ok(()) } +fn assert_scoreboard_rows_expose_quantitative_and_blocker_contract(report: &Value) -> Result<()> { + let rows = array_at(report, "/scoreboard/rows")?; + let elf = find_by_field(rows, "/product_id", "elf_current_report")?; + let qmd = find_by_field(rows, "/product_id", "qmd")?; + let pageindex = find_by_field(rows, "/product_id", "vectifyai_pageindex")?; + let openkb = find_by_field(rows, "/product_id", "vectifyai_openkb")?; + let honcho = find_by_field(rows, "/product_id", "plastic_labs_honcho")?; + + assert_eq!(rows.len(), 20); + assert_eq!(elf.pointer("/product_name").and_then(Value::as_str), Some("ELF")); + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(elf.pointer("/result_state").and_then(Value::as_str), Some("not_comparable")); + assert_eq!(elf.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); + 
assert_eq!(elf.pointer("/held_out").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/leakage_audited").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); + assert_eq!( + elf.pointer("/metrics/retrieval/metric_basis").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!(elf.pointer("/metrics/retrieval/k").and_then(Value::as_u64), Some(5)); + assert!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64).is_some()); + assert!(elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64).is_some()); + assert!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64).is_some()); + assert!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64).is_some()); + assert_eq!( + elf.pointer("/metrics/lifecycle/stale_suppression").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/coverage/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert!(array_contains_str( + elf, + "/next_evidence", + "Run a Docker-contained product-runtime adapter for this row." 
+ )?); + assert!(array_contains_str(elf, "/next_evidence", "Record container image digest evidence.")?); + assert_eq!(qmd.pointer("/product_name").and_then(Value::as_str), Some("qmd")); + assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(qmd.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(qmd.pointer("/product_runtime").and_then(Value::as_bool), Some(true)); + assert_eq!(qmd.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); + assert!(qmd.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); + assert!(array_contains_str(qmd, "/next_evidence", "Record container image digest evidence.")?); + + assert_tracked_external_blocker_row(pageindex, "VectifyAI PageIndex", true)?; + assert_tracked_external_blocker_row(openkb, "VectifyAI OpenKB", true)?; + assert_tracked_external_blocker_row(honcho, "plastic-labs Honcho", false)?; + + Ok(()) +} + +fn assert_tracked_external_blocker_row( + row: &Value, + product_name: &str, + same_corpus: bool, +) -> Result<()> { + assert_eq!(row.pointer("/product_name").and_then(Value::as_str), Some(product_name)); + assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/same_corpus").and_then(Value::as_bool), Some(same_corpus)); + assert_eq!(row.pointer("/source_id_mapped").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); + 
assert!(row.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); + assert!(row.pointer("/metrics/retrieval/precision_at_k").is_some_and(Value::is_null)); + assert!(row.pointer("/metrics/retrieval/mrr").is_some_and(Value::is_null)); + assert!(row.pointer("/metrics/retrieval/ndcg").is_some_and(Value::is_null)); + assert!(array_contains_str( + row, + "/next_evidence", + "Map returned evidence to stable source ids." + )?); + assert!(array_contains_str( + row, + "/next_evidence", + "Run a Docker-contained product-runtime adapter for this row." + )?); + assert!(array_contains_str(row, "/next_evidence", "Record container image digest evidence.")?); + + if same_corpus { + assert!(!array_contains_str( + row, + "/next_evidence", + "Map this product to the same corpus." + )?); + } else { + assert!(array_contains_str(row, "/next_evidence", "Map this product to the same corpus.")?); + } + + Ok(()) +} + #[test] fn adversarial_quality_fixture_catches_unsupported_and_stale_regressions() -> Result<()> { let temp_dir = @@ -918,7 +1027,7 @@ fn assert_stale_regression_is_wrong_result(temp_dir: &Path) -> Result<()> { ); assert_eq!( stale_report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), - Some(221) + Some(241) ); assert!(array_contains_str( &stale_report, @@ -1072,7 +1181,7 @@ fn external_adapter_run_summarizes_nonzero_scenario_losses() -> Result<()> { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(49) + Some(52) ); assert_eq!( report @@ -1121,11 +1230,11 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(23) + Some(26) ); assert_eq!( report.pointer("/external_adapters/summary/external_project_count").and_then(Value::as_u64), - Some(16) + Some(19) ); assert_eq!( report.pointer("/external_adapters/summary/fixture_backed_count").and_then(Value::as_u64), @@ 
-1143,7 +1252,7 @@ fn assert_external_adapter_manifest_summary(report: &Value) { ); assert_eq!( report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), - Some(11) + Some(14) ); assert_external_adapter_manifest_status_summary(report); @@ -1179,7 +1288,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/overall_status_counts/blocked") .and_then(Value::as_u64), - Some(7) + Some(10) ); assert_eq!( report @@ -1203,7 +1312,7 @@ fn assert_external_adapter_manifest_status_summary(report: &Value) { report .pointer("/external_adapters/summary/suite_status_counts/blocked") .and_then(Value::as_u64), - Some(24) + Some(29) ); assert_eq!( report @@ -1248,7 +1357,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_status_counts/blocked") .and_then(Value::as_u64), - Some(21) + Some(24) ); assert_eq!( report @@ -1302,7 +1411,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_position_counts/untested") .and_then(Value::as_u64), - Some(50) + Some(53) ); assert_eq!( report @@ -1332,7 +1441,7 @@ fn assert_external_adapter_manifest_scenario_summary(report: &Value) { report .pointer("/external_adapters/summary/scenario_outcome_counts/blocked") .and_then(Value::as_u64), - Some(26) + Some(29) ); assert_eq!( report @@ -1543,8 +1652,8 @@ fn assert_elf_fixture_adapter_record(adapter: &Value) -> Result<()> { assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("blocked")); assert!(adapter.pointer("/run/evidence").and_then(Value::as_str).is_some_and(|evidence| { - evidence.contains("60 jobs across 16 suites") - && evidence.contains("53 pass") + evidence.contains("82 jobs across 19 suites") + && evidence.contains("75 pass") && 
evidence.contains("7 blocked") && evidence.contains("core_archival_memory") && evidence.contains("memory_summary") @@ -3757,7 +3866,7 @@ fn assert_qmd_debug_retest_markdown_and_indexes( ); assert!(readme.contains("qmd Debug-Ergonomics Dreaming Retest Report - June 19, 2026")); assert!(readme.contains("Temporal and Trajectory Adapter Coverage Report - June 23, 2026")); - assert!(readme.contains("Latest real-world benchmark report: June 23, 2026")); + assert!(readme.contains("Latest real-world benchmark report: June 27, 2026")); assert!(readme.contains("keeps the qmd edge unchanged")); } @@ -6307,9 +6416,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); assert!(markdown.contains("### Adapter Scenario Judgments")); - assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=50`")); + assert!(markdown.contains("ELF scenario positions: `wins=10, ties=11, loses=1, untested=53`")); assert!(markdown.contains( - "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=19, blocked=26, non_goal=5`" + "Scenario comparison outcomes: `win=10, tie=11, loss=1, not_tested=19, blocked=29, non_goal=5`" )); assert!(markdown.contains("| `claude_mem_live_baseline` | `same_corpus_retrieval`")); assert!(markdown.contains("| `memsearch_live_baseline` | `ttl_expiry_lifecycle`")); @@ -8563,16 +8672,24 @@ fn assert_root_scoreboard_summary(report: &Value) -> Result<()> { ); assert_eq!( report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), - Some(220) + Some(240) ); assert_eq!( report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), - Some(227) + Some(247) ); assert_eq!( report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), Some(false) ); + assert!(array_contains_str(report, "/scoreboard/result_states", "not_comparable")?); 
+ assert_eq!( + report.pointer("/scoreboard/metric_basis").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!(report.pointer("/scoreboard/retrieval_k").and_then(Value::as_u64), Some(5)); + + assert_root_scoreboard_rows(report)?; for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { assert!(array_contains_str(report, "/scoreboard/typed_non_pass_states_present", state)?); @@ -8594,6 +8711,80 @@ fn assert_root_scoreboard_summary(report: &Value) -> Result<()> { Ok(()) } +fn assert_root_scoreboard_rows(report: &Value) -> Result<()> { + let rows = array_at(report, "/scoreboard/rows")?; + let elf = find_by_field(rows, "/product_id", "elf_current_report")?; + let qmd = find_by_field(rows, "/product_id", "qmd")?; + let graphify = find_by_field(rows, "/product_id", "graphify")?; + let pageindex = find_by_field(rows, "/product_id", "vectifyai_pageindex")?; + let openkb = find_by_field(rows, "/product_id", "vectifyai_openkb")?; + let honcho = find_by_field(rows, "/product_id", "plastic_labs_honcho")?; + + assert_eq!(rows.len(), 20); + assert_eq!(elf.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(elf.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64), Some(0.988)); + assert_eq!( + elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64), + Some(0.415) + ); + assert_eq!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64), Some(0.988)); + assert_eq!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64), Some(0.985)); + 
assert_eq!( + elf.pointer("/metrics/lifecycle/stale_suppression").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/lifecycle/update_correctness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/lifecycle/delete_correctness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/coverage/typed_non_pass_count").and_then(Value::as_u64), + Some(7) + ); + assert!(array_contains_str( + elf, + "/next_evidence", + "Run a Docker-contained product-runtime adapter for this row." + )?); + + for competitor in [qmd, graphify] { + assert_eq!( + competitor.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!( + competitor.pointer("/result_state").and_then(Value::as_str), + Some("wrong_result") + ); + assert_eq!(competitor.pointer("/product_runtime").and_then(Value::as_bool), Some(true)); + assert_eq!( + competitor.pointer("/container_digest_identified").and_then(Value::as_bool), + Some(false) + ); + assert!(competitor.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); + assert!(array_contains_str( + competitor, + "/next_evidence", + "Record container image digest evidence." 
+ )?); + } + + assert_tracked_external_blocker_row(pageindex, "VectifyAI PageIndex", true)?; + assert_tracked_external_blocker_row(openkb, "VectifyAI OpenKB", true)?; + assert_tracked_external_blocker_row(honcho, "plastic-labs Honcho", false)?; + + Ok(()) +} + fn assert_root_proactive_brief_summary(report: &Value) { assert_eq!( report.pointer("/summary/proactive_brief/job_count").and_then(Value::as_u64), diff --git a/docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md b/docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md new file mode 100644 index 00000000..a61945d4 --- /dev/null +++ b/docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md @@ -0,0 +1,190 @@ +--- +type: Evidence +title: "Public Quantitative Competitor Scoreboard Report - June 27, 2026" +description: "Public evidence report for the ELF agent-memory quantitative competitor scoreboard and row-level comparability blockers." 
+resource: docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md +status: active +authority: evidence +owner: benchmarking +last_verified: 2026-06-27 +tags: + - docs + - evidence + - benchmarking + - competitor-scoreboard +source_refs: + - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json + - apps/elf-eval/fixtures/real_world_memory/ + - apps/elf-eval/fixtures/real_world_external_adapters/ +code_refs: + - Makefile.toml + - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/tests/real_world_job_benchmark.rs + - docs/spec/agent_memory_quantitative_benchmark_v1.md +related: + - docs/spec/real_world_agent_memory_benchmark_v1.md + - docs/evidence/benchmarking/2026-06-23-p4-quality-hardening-productization-readiness-report.md + - docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md +drift_watch: + - docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md + - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json + - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - docs/spec/agent_memory_quantitative_benchmark_v1.md + - docs/evidence/benchmarking/index.md + - README.md +--- +# Public Quantitative Competitor Scoreboard Report - June 27, 2026 + +Purpose: Publish the public quantitative competitor scoreboard for agent-memory +retrieval and memory-quality evidence without turning typed blockers into broad +leaderboard claims. +Status: evidence +Read this when: You need the June 27 public scoreboard rows, quantitative metrics, +competitor strengths, typed non-pass states, or next optimization direction. +Not this document: Private-corpus production proof, provider-backed private quality +proof, or universal product-superiority evidence. +Inputs: `apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json`. 
+ +## Commands + +The checked-in snapshot was generated from the full real-world memory fixture pack: + +```sh +cargo make real-world-memory-quantitative-scoreboard +``` + +That task writes the reproducible working artifacts to +`tmp/real-world-memory/quantitative-scoreboard/report.json` and +`tmp/real-world-memory/quantitative-scoreboard/report.md`. + +The checked-in snapshot uses the same runner and arguments with an evidence output +path: + +```sh +cargo run -p elf-eval --bin real_world_job_benchmark -- run \ + --fixtures apps/elf-eval/fixtures/real_world_memory \ + --out apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json \ + --run-id public-quantitative-competitor-scoreboard \ + --adapter-id elf_real_world_memory_fixture \ + --adapter-name "ELF real-world memory fixture" +``` + +The Markdown renderer was also exercised against the snapshot: + +```sh +cargo run -p elf-eval --bin real_world_job_benchmark -- publish \ + --report apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json \ + --out tmp/real-world-memory/public-quantitative-competitor-scoreboard-report.md +``` + +The source JSON remains the authoritative machine-readable evidence for +`source_provenance[]`, row strengths, weaknesses, metrics, and `next_evidence[]`. + +## Scoreboard Basis + +- Schema: `elf.quality_scoreboard/v1`. +- Metric basis: `produced_evidence_order`. +- Retrieval `k`: `5`. +- Fixture run: `82` jobs across `19` encoded suites. +- Encoded job status: `75` pass, `7` blocked, `0` wrong_result, `0` incomplete, + `0` not_encoded, and `0` unsupported_claim. +- Aggregate expected evidence recall: `1.000` (`172/172`). +- Aggregate source-ref coverage: `1.000` (`180/180`). +- Aggregate quote coverage: `1.000` (`180/180`). +- Mean fixture latency: `2.885 ms`; fixture cost: `0.000 USD`. +- Scoreboard rows: `20` tracked products. 
+- Aggregate scoreboard claim: `typed_non_pass_present`; unqualified win claim allowed: + `false`. + +## Public Rows + +No row is comparable in this snapshot. ELF has same-corpus source-id-mapped fixture +metrics, but it is not a held-out, leakage-audited, Docker-contained product-runtime +row with container digest evidence. qmd and graphify have `live_real_world` row +evidence, but digest evidence, held-out/leakage audits, and pass-state comparable +metrics are still missing. + +| Product | State | Evidence | Comparable | Runtime/Digest | Quantitative score or typed blocker | Primary source provenance | +| --- | --- | --- | --- | --- | --- | --- | +| ELF | `blocked` | `fixture_backed` | `false` | `false` / `false` | recall@5 `0.988`, precision@5 `0.415`, MRR `0.988`, nDCG `0.985`, stale/update/delete/source-ref rates `1.000`; 7 encoded blockers remain. | `apps/elf-eval/fixtures/real_world_memory/` | +| GraphRAG | `blocked` | `research_gate` | `false` | `false` / `false` | Research/setup blocker; no comparable retrieval metrics. | GraphRAG smoke/research-gate artifacts in snapshot `source_provenance[]`. | +| Graphiti/Zep | `blocked` | `research_gate` | `false` | `false` / `false` | Temporal graph validity blocker; no comparable retrieval metrics. | Graphiti/Zep smoke/research-gate artifacts in snapshot `source_provenance[]`. | +| LangGraph | `not_encoded` | `research_gate` | `false` | `false` / `false` | Persistence/work-resume scoring not encoded. | LangGraph persistence source in snapshot `source_provenance[]`. | +| Letta | `blocked` | `research_gate` | `false` | `false` / `false` | Core/archive and project-decision readback blockers remain. | Letta docs/export-readback artifacts in snapshot `source_provenance[]`. | +| LightRAG | `blocked` | `research_gate` | `false` | `false` / `false` | Retrieval/context-source adapter blocker; no comparable retrieval metrics. | LightRAG smoke/research-gate artifacts in snapshot `source_provenance[]`. 
| +| OpenViking | `wrong_result` | `live_baseline` | `false` | `false` / `false` | Local embedding setup passes, but retrieval/context-trajectory rows include wrong-result, blocked, and not-encoded evidence. | Live-baseline and OpenViking report artifacts in snapshot `source_provenance[]`. | +| RAGFlow | `blocked` | `research_gate` | `false` | `false` / `false` | RAGFlow retrieval/production-ops adapter blockers remain. | RAGFlow smoke artifacts in snapshot `source_provenance[]`. | +| VectifyAI OpenKB | `blocked` | `research_gate` | `false` | `false` / `false` | Same-corpus OpenKB fixture provenance exists, but product-runtime wiki/entity/concept output, source-id mapping, held-out/leakage evidence, and digest metadata remain blocked. | `apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/openkb_wiki_recompile_blocked.json`; `docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md`. | +| VectifyAI PageIndex | `blocked` | `research_gate` | `false` | `false` / `false` | Same-corpus PageIndex fixture provenance exists, but product-runtime tree artifacts, cited node paths, traversal output, source-id mapping, held-out/leakage evidence, and digest metadata remain blocked. | `apps/elf-eval/fixtures/real_world_external_adapters/pageindex_openkb/pageindex_long_document_tree_blocked.json`; `docs/evidence/benchmarking/2026-06-22-pageindex-openkb-same-corpus-adapter-report.md`. | +| agentmemory | `blocked` | `live_baseline` | `false` | `false` / `false` | Same-corpus retrieval has evidence, but lifecycle/capture/work-resume blockers remain. | agentmemory live-baseline artifacts in snapshot `source_provenance[]`. | +| claude-mem | `wrong_result` | `live_baseline` | `false` | `false` / `false` | Durable/progressive-disclosure strengths exist, but live-baseline, capture, and operator rows remain wrong-result or blocked. | claude-mem live-baseline artifacts in snapshot `source_provenance[]`. 
| +| gbrain | `blocked` | `research_gate` | `false` | `false` / `false` | Knowledge-compilation/operator-debug scoring not encoded. | gbrain source records in snapshot `source_provenance[]`. | +| graphify | `wrong_result` | `live_real_world` | `false` | `true` / `false` | Docker graph-report generation reaches runtime, but current scored row has wrong-result plus blocked/not-encoded evidence and lacks digest evidence. | graphify smoke artifacts in snapshot `source_provenance[]`. | +| llm-wiki | `not_encoded` | `research_gate` | `false` | `false` / `false` | Knowledge/work-resume scoring not encoded. | llm-wiki source record in snapshot `source_provenance[]`. | +| mem0/OpenMemory | `blocked` | `live_baseline` | `false` | `false` / `false` | History, personalization, delete audit, and local export readback strengths exist; product UI/export and broader personalization rows remain blocked/not encoded. | mem0/OpenMemory live-baseline artifacts in snapshot `source_provenance[]`. | +| memsearch | `not_encoded` | `live_baseline` | `false` | `false` / `false` | Markdown store, reindex, and same-corpus retrieval strengths exist; retrieval/evolution/trust rows are not encoded as comparable product runtime. | memsearch live-baseline artifacts in snapshot `source_provenance[]`. | +| nanograph | `not_encoded` | `research_gate` | `false` | `false` / `false` | Memory-evolution and retrieval scoring not encoded. | nanograph source record in snapshot `source_provenance[]`. | +| plastic-labs Honcho | `blocked` | `research_gate` | `false` | `false` / `false` | Requested public comparison row with source provenance, but no same-corpus benchmark adapter, product-runtime output, source-id mapping, held-out/leakage evidence, latency/cost/resource metrics, or digest evidence is checked in. | Honcho repository and documentation source records in snapshot `source_provenance[]`. 
| +| qmd | `wrong_result` | `live_real_world` | `false` | `true` / `false` | CLI retrieval/replay and targeted live-pass strengths exist, but current full live-real-world rows include wrong-result/not-encoded/blocker states and no digest evidence. | qmd live-adapter and live-baseline artifacts in snapshot `source_provenance[]`. | + +## Strengths and Weaknesses + +ELF strengths in this snapshot: + +- Complete expected-evidence, source-ref, and quote coverage for encoded fixture jobs. +- Zero unsupported claims, wrong results, stale answers, stale retrievals, redaction + leaks, or scope violations in the generated fixture report. +- Work Continuity readback metrics are encoded: reset/resume, decision-rationale + recall, rejected-option suppression, explicit next-step precision, inferred-step + labeling, handoff source-ref coverage, redaction, and janitor false-promotion + boundaries are all reported. +- Lifecycle scoreboard metrics for stale suppression, update correctness, and delete + correctness are all `1.000` for encoded rows. + +Competitor strengths preserved by the scoreboard: + +- qmd remains a strong local CLI retrieval/replay reference with live-real-world + adapter evidence. +- mem0/OpenMemory keeps measured strengths for history, entity-scoped + personalization, deletion audit, and local export-style readback. +- claude-mem keeps progressive-disclosure and durable local repository strengths. +- memsearch keeps Markdown canonical-store, reindex, and same-corpus retrieval + strengths. +- graphify has Docker runtime graph-report generation evidence. +- OpenViking has local embedding setup evidence and remains the trajectory reference, + but its context-trajectory comparison rows are still blocked. +- VectifyAI PageIndex remains the long-document tree retrieval and PageIndex MCP + reference, now represented as a typed same-corpus blocker row. 
+- VectifyAI OpenKB remains the compiled wiki, concept/entity index, lint, watch, and + recompile workflow reference, now represented as a typed same-corpus blocker row. +- plastic-labs Honcho is tracked as a requested public comparison target with source + provenance only; no product-runtime strength is scored in this snapshot. + +Shared weaknesses and claim boundaries: + +- No tracked product row is a comparable product-runtime pass in this snapshot. +- Missing held-out split and leakage-audit evidence block all rows. +- Missing container image digest evidence blocks live-real-world rows from + comparability. +- Missing product-runtime source-id mapping blocks every external row from + comparability, including PageIndex, OpenKB, and Honcho. +- Research-gate, fixture-backed, live-baseline, blocked, not-encoded, and + wrong-result evidence must remain visible and cannot be collapsed into wins or + parity. + +## Optimization Direction + +Next optimization work should: + +- Capture Docker image digests and runtime metadata for product-runtime rows. +- Add held-out and leakage-audit manifests before broad competitor comparisons. +- Promote external adapters from typed blockers to same-corpus source-id-mapped + runtime rows only after they emit comparable evidence. +- Add Honcho runtime adapter output before scoring Honcho retrieval, memory-quality, + or work-continuity behavior. +- Use row-level metrics for optimization direction; do not claim a universal + leaderboard. + +This report supports a public quantitative scoreboard shape and a current evidence +snapshot. It does not prove private-corpus, provider-backed, hosted managed-memory, +or universal product-superiority claims. 
diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 4c23b778..73dd453b 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -58,3 +58,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-23-p3-competitor-strength-absorption-report.md`: P3 Competitor-Strength Absorption Report - June 23, 2026; closes XY-1072 by naming which qmd, PageIndex/OpenKB, mem0/OpenMemory, Letta, Graphiti/Zep, OpenViking, RAGFlow, GraphRAG, and LightRAG strengths ELF absorbed, which remain stronger elsewhere or blocked, and which P4 optimization queue items are ready for main-thread inspection without applying a queue label. - `2026-06-23-p4-production-readiness-evidence-gates-report.md`: P4 Production-Readiness Evidence Gates Report - June 23, 2026; adds `cargo make real-world-memory-p4-production-readiness`, records latency, cost, resource, cold-start, restore, and Qdrant rebuild evidence, separates local fixture, public-proxy, private-corpus, and provider-backed tiers, and preserves private/provider inputs as typed blockers. - `2026-06-23-p4-quality-hardening-productization-readiness-report.md`: P4 Quality Hardening and Productization Readiness Report - June 23, 2026; adds `cargo make real-world-memory-p4-quality-hardening-closeout`, reruns adversarial, source-library, knowledge, and production-readiness slices, preserves private/provider blockers, and keeps P5 queueing behind main-thread acceptance with a narrowed productization scope. +- `2026-06-27-public-quantitative-competitor-scoreboard-report.md`: Public Quantitative Competitor Scoreboard Report - June 27, 2026; publishes `elf.quality_scoreboard/v1` rows for 20 tracked products, including VectifyAI PageIndex, VectifyAI OpenKB, and plastic-labs Honcho typed rows. 
Rows expose recall@5, precision@5, MRR, nDCG, lifecycle, source-ref, and latency metrics where measured, and typed blocker, source-provenance, and next-evidence metadata where comparable metrics are not yet available, while preserving zero comparable product-runtime pass claims until held-out, leakage-audited, digest-identified runtime evidence exists. diff --git a/docs/log.md b/docs/log.md index 4a9337dc..fc20e60e 100644 --- a/docs/log.md +++ b/docs/log.md @@ -144,3 +144,9 @@ logs. `elf.authority_recovery_drill/v1` report artifacts, validating topology, degraded reads, RPO/RTO, authority record counts, idempotent outbox replay, Qdrant rebuild, migration repair, and dead-letter handling, and linking the drift audit. +- Added the XY-1120 public quantitative competitor scoreboard contract and report, + defining `elf.quality_scoreboard/v1` row metrics, comparability gates, typed + `not_comparable` boundaries, source-provenance and next-evidence metadata, and a + checked-in snapshot covering 20 tracked products, including explicit VectifyAI + PageIndex, VectifyAI OpenKB, and plastic-labs Honcho typed rows, without promoting + any row to a universal product leaderboard claim. diff --git a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md new file mode 100644 index 00000000..07023ca1 --- /dev/null +++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md @@ -0,0 +1,216 @@ +--- +type: Spec +title: "Agent Memory Quantitative Benchmark v1" +description: "Define the public quantitative competitor scoreboard row contract and claim boundaries." 
+resource: docs/spec/agent_memory_quantitative_benchmark_v1.md +status: active +authority: normative +owner: spec +last_verified: 2026-06-27 +tags: + - docs + - spec + - benchmarking + - agent-memory +source_refs: + - XY-1098 + - XY-1120 +code_refs: + - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/tests/real_world_job_benchmark.rs +related: + - docs/spec/real_world_agent_memory_benchmark_v1.md + - docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md +drift_watch: + - docs/spec/agent_memory_quantitative_benchmark_v1.md + - docs/spec/real_world_agent_memory_benchmark_v1.md + - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json +--- +# Agent Memory Quantitative Benchmark v1 + +Purpose: Define the public quantitative competitor scoreboard row contract and claim +boundaries. +Status: normative +Read this when: You are implementing, validating, or publishing the public +competitor-quality scoreboard for agent memory systems. +Not this document: Real-world job fixture schema, Work Journal behavior, operational +runbooks, or external adapter setup procedures. +Defines: `elf.quality_scoreboard/v1` quantitative rows, metrics, comparability gates, +typed non-pass behavior, and optimization-direction metadata. + +## Scope + +The quantitative scoreboard turns `real_world_job` reports and external adapter +manifest records into public product rows. It is a row-level evidence contract, not a +universal leaderboard. It is allowed to say which metrics are proven for a row, which +competitor strengths remain visible, and which evidence is missing before a row can be +treated as comparable. + +This contract applies to reports with schema `elf.quality_scoreboard/v1`. + +## Scoreboard Report + +A report MUST include: + +- `schema`: exactly `elf.quality_scoreboard/v1`. +- `result_states`: the public row-state enum. 
+- `evidence_classes`: the public evidence-class enum. +- `metric_basis`: the ranking basis used for retrieval metrics. +- `retrieval_k`: the `k` used for recall, precision, MRR, and nDCG. +- typed non-pass counts and visible typed non-pass states for encoded jobs, external + adapter rows, and the aggregate report. +- evidence-class counts. +- bounded encoded-job and aggregate summary claims. +- `unqualified_win_claim_allowed`, which MUST be `false` when any typed non-pass row + or non-comparable row exists. +- `claim_boundary`, a human-readable statement that prevents typed blockers or + fixture-only evidence from becoming broad superiority claims. +- `rows`: one row for ELF plus one row for each tracked external product represented + by the loaded adapter manifest. +- `optimization_roadmap`: concrete next optimization directions derived from missing + row evidence, not from hidden assumptions. + +## Public Row States + +| State | Meaning | +| --- | --- | +| `pass` | The row has a scored pass under its evidence class. A pass is comparable only when every comparability gate is also true. | +| `wrong_result` | The adapter or job reached the behavioral check but selected the wrong answer, evidence, lifecycle state, or action. | +| `incomplete` | Setup, build, parse, adapter wiring, or runtime execution did not reach the behavioral check. | +| `blocked` | The row cannot be completed safely without missing credentials, private input, durable runtime integration, Docker evidence, or manual product setup. | +| `not_tested` | No benchmark execution or comparable adapter output exists for the row. | +| `not_encoded` | The suite, scoring dimension, or adapter path is not implemented in the runner. | +| `not_comparable` | The row has useful evidence but lacks one or more required comparability gates, so it must not be used as a product-runtime comparison pass. 
| +| `unsupported_claim` | The row or source report made a substantive claim not supported by corpus evidence, source refs, or report metadata. | + +`not_comparable` is a public row state only. It is not a `real_world_job` status and +must not be written back into job or suite outcome fields. + +## Evidence Classes + +| Evidence class | Meaning | +| --- | --- | +| `fixture_backed` | Checked-in fixtures were scored. This is regression evidence, not live product-runtime evidence. | +| `live_baseline` | Docker live-baseline retrieval or lifecycle evidence exists, but the row is not a real-world product-runtime scoreboard pass. | +| `live_real_world` | A live adapter executed real-world job paths and emitted typed outcomes. | +| `research_gate` | Research, source mapping, setup, credential, or resource gates are recorded before fair scoring can run. | + +## Row Fields + +Each `rows[]` entry MUST include: + +- `product_id` and `product_name`. +- `row_source`: stable source label, such as `elf_report` or + `external_adapter_manifest`. +- `evidence_class`. +- `result_state`. +- `comparable`: true only when all comparability gates are satisfied and the row has a + pass state with quantitative metrics. +- comparability gates: + - `same_corpus` + - `source_id_mapped` + - `held_out` + - `leakage_audited` + - `product_runtime` + - `container_digest_identified` +- `metrics`. +- `strengths`: product strengths supported by the row source. +- `weaknesses`: typed weaknesses, blockers, or non-pass evidence from the row source. +- `next_evidence`: row-level evidence needed before the row can become comparable. +- `source_provenance`: bounded source pointers to the input report, adapter record, or + suite records. + +`same_corpus = true` requires positive row evidence that the product or checked-in +adapter is mapped to the benchmark corpus. A blocker sentence that says same-corpus +evidence is missing is not sufficient. 
A typed same-corpus setup-blocker adapter may +set this gate to true only when its source provenance identifies the intended shared +benchmark corpus and the remaining blocker is runtime/source-id output, not corpus +selection. + +## Metrics + +The `metrics` object MUST include `retrieval`, `lifecycle`, `answer_safety`, +`operations`, and `coverage` sub-objects. + +`retrieval` MUST include: + +- `k`. +- `metric_basis`. +- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg`, or `null` when the row lacks + ranked produced evidence. +- `expected_evidence_recall`. +- `citation_source_ref_coverage`. +- matched, total, and produced evidence counts. + +For `metric_basis = "produced_evidence_order"`, ranked retrieval metrics use the +ordered `produced_evidence` list in the scored job output as the retrieved list. +Expected evidence ids are the relevance set. Relevance is binary. `recall_at_k` and +`precision_at_k` use the first `k` produced evidence ids. MRR is reciprocal rank of +the first relevant produced evidence id. nDCG uses binary gains with the ideal DCG +bounded by `min(k, expected_evidence_total)`. + +`lifecycle` MUST include: + +- stale suppression rate and counts. +- update correctness rate and counts. +- delete correctness rate and counts. +- rollback/history readback rate and counts. + +`answer_safety` MUST include: + +- unsupported-claim rate and count. +- stale-answer rate and count. +- hallucinated-evidence rate when measurable. +- redaction leak count. +- irrelevant-context ratio. + +`operations` MUST include: + +- mean latency in milliseconds when measured. +- total cost when cost accounting exists. +- resource-envelope status, encoded job count, and pass count. + +`coverage` MUST include: + +- job count. +- encoded suite count. +- pass count. +- typed non-pass count. +- source-ref coverage. +- evidence coverage. +- evidence class. + +## Comparability Rules + +A row is comparable only when all of the following are true: + +- `same_corpus = true`. 
+- `source_id_mapped = true`. +- `held_out = true`. +- `leakage_audited = true`. +- `product_runtime = true`. +- `container_digest_identified = true`. +- `result_state = "pass"`. +- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg` are present and non-null. + +If any required gate is false, the report MUST set `comparable = false`, add a +specific `next_evidence` entry for each missing gate, and avoid any win, parity, or +rank claim for that row. If an otherwise passing row is missing a required gate, the +public row state SHOULD be `not_comparable` so the report is explicit about the +reason no product-runtime comparison claim is allowed. + +## Report Claim Rules + +- A row with `fixture_backed`, `live_baseline`, or `research_gate` evidence MUST NOT + be described as a comparable product-runtime pass. +- A row with `blocked`, `incomplete`, `not_tested`, `not_encoded`, `not_comparable`, + or `unsupported_claim` MUST remain visible as a non-pass row. +- External competitors MUST have either comparable product-runtime evidence or an + explicit typed non-pass/blocker row with source provenance. +- Missing Docker image digest evidence is a blocker for comparability, even if a live + adapter executed. +- Public-proxy, fixture-only, local-mock, diagnostic, blocked, and not-encoded rows + MUST NOT be promoted into universal product superiority claims. +- Optimization direction MUST be tied to row-level `next_evidence`, metrics, or typed + non-pass states. diff --git a/docs/spec/index.md b/docs/spec/index.md index 7660f68a..c250b89c 100644 --- a/docs/spec/index.md +++ b/docs/spec/index.md @@ -31,6 +31,7 @@ Question this index answers: "what must remain true?" ## Documents +- `agent_memory_quantitative_benchmark_v1.md`: Agent Memory Quantitative Benchmark v1. - `agent_memory_knowledge_system_v1.md`: Agent Memory and Knowledge System v1. - `external_memory_pattern_radar_v1.md`: External Memory Pattern Radar v1. - `production_corpus_manifest_v1.md`: Production Corpus Manifest v1. 
diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 120c7ee8..68e745a0 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -15,7 +15,8 @@ code_refs: - Makefile.toml - apps/elf-eval/src/bin/real_world_job_benchmark.rs - apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json -related: [] +related: + - docs/spec/agent_memory_quantitative_benchmark_v1.md drift_watch: - docs/spec/real_world_agent_memory_benchmark_v1.md - apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -73,7 +74,9 @@ blocking caveat, or fabricates a decision that is not in the corpus. The public quality scoreboard is a claim grammar, not a leaderboard. Reports MUST use the grammar below when summarizing what is proven, what is not proven, and which -evidence class supports the claim. +evidence class supports the claim. The quantitative row schema, metric definitions, +comparability gates, and optimization-direction fields are defined in +`docs/spec/agent_memory_quantitative_benchmark_v1.md`. Public result states: @@ -85,6 +88,7 @@ Public result states: | `blocked` | The check cannot be run safely without credentials, manual setup, private input, durable product runtime, or host integration outside the run scope. | | `not_tested` | No benchmark execution or comparable adapter output exists for the row. | | `not_encoded` | The suite, job, adapter path, or scoring dimension is not implemented in the runner, so no pass/fail claim is allowed. | +| `not_comparable` | Useful row evidence exists but one or more comparability gates are missing, so no product-runtime comparison pass may be claimed. | | `unsupported_claim` | The system or report made a substantive claim, decision, evidence citation, or capability claim that is not supported by the corpus, required evidence, or report metadata. 
| Public evidence classes: @@ -719,7 +723,12 @@ Reports MUST include: result states, evidence classes, encoded-job and external-adapter typed non-pass counts, visible typed non-pass states for each bucket and the aggregate report, evidence-class counts, bounded job and aggregate summary claims, and an explicit - unqualified-win guard; + unqualified-win guard. Public quantitative scoreboard reports MUST also include + row-level recall@k, precision@k, MRR, nDCG, expected evidence recall, source-ref + coverage, stale suppression, update correctness, delete correctness, latency, cost, + resource-envelope status, comparability gates, strengths, weaknesses, source + provenance, and row-level next-evidence metadata as defined in + `docs/spec/agent_memory_quantitative_benchmark_v1.md`; - operational evidence gates using schema `elf.operational_evidence_gates/v1`, separating `local_fixture`, `public_proxy`, `private_corpus`, and `provider_backed` tiers. The gates MUST report tier status, job counts, pass and diff --git a/docs/spec/system_version_registry.md b/docs/spec/system_version_registry.md index 4eed26ae..44ed05ba 100644 --- a/docs/spec/system_version_registry.md +++ b/docs/spec/system_version_registry.md @@ -39,6 +39,18 @@ This document is normative. When a new versioned identifier is introduced, it mu semantics, authority-layer boundaries, or claim-boundary rules become incompatible with this contract. +### Agent memory quantitative scoreboard schema + +- Identifier: `elf.quality_scoreboard/v1`. +- Type: Public quantitative competitor scoreboard report and row schema. +- Defined in: `docs/spec/agent_memory_quantitative_benchmark_v1.md`. +- Consumers: `apps/elf-eval/src/bin/real_world_job_benchmark.rs`, checked-in + benchmark report snapshots, public benchmarking evidence reports, and agents + deciding whether a product row is comparable or only a typed blocker. 
+- Bump rule: Introduce a new identifier only when row states, comparability gates, + metric names, metric basis semantics, evidence classes, or required row fields + become incompatible with this contract. + ### HTTP API version - Identifier: `/v2` (URL path prefix).