From de13a1f2547b578a82c270b48c210b73f01e50a2 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 00:51:29 -0400 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Batch modularize benchmark and chunk search tests","authority":"manual"} --- .../benchmark_core.rs | 381 +---------------- .../benchmark_core_adversarial_scoreboard.rs | 177 ++++++++ .../benchmark_core_blocker_rows.rs | 56 +++ .../benchmark_core_capture_sources.rs | 85 ++++ .../benchmark_core_smoke_report.rs | 71 ++++ .../root_aggregate_summary.rs | 381 +---------------- .../root_aggregate_summary_counts.rs | 95 +++++ .../root_aggregate_summary_scoreboard.rs | 136 ++++++ .../root_aggregate_summary_suite_summaries.rs | 141 ++++++ .../chunk_search/relation_context.rs | 401 +----------------- .../relation_context/fact_bounds.rs | 60 +++ .../chunk_search/relation_context/fixture.rs | 159 +++++++ .../chunk_search/relation_context/records.rs | 132 ++++++ .../relation_context/temporal_status.rs | 65 +++ .../acceptance/chunk_search/tests_core.rs | 400 +---------------- .../chunk_search/tests_core/basic_search.rs | 204 +++++++++ .../chunk_search/tests_core/dedupe.rs | 88 ++++ .../chunk_search/tests_core/progressive.rs | 110 +++++ 18 files changed, 1602 insertions(+), 1540 deletions(-) create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_adversarial_scoreboard.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_blocker_rows.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_capture_sources.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_smoke_report.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_suite_summaries.rs create mode 100644 
packages/elf-service/tests/acceptance/chunk_search/relation_context/fact_bounds.rs create mode 100644 packages/elf-service/tests/acceptance/chunk_search/relation_context/fixture.rs create mode 100644 packages/elf-service/tests/acceptance/chunk_search/relation_context/records.rs create mode 100644 packages/elf-service/tests/acceptance/chunk_search/relation_context/temporal_status.rs create mode 100644 packages/elf-service/tests/acceptance/chunk_search/tests_core/basic_search.rs create mode 100644 packages/elf-service/tests/acceptance/chunk_search/tests_core/dedupe.rs create mode 100644 packages/elf-service/tests/acceptance/chunk_search/tests_core/progressive.rs diff --git a/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core.rs b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core.rs index dae4414f..31d5ce74 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core.rs @@ -1,377 +1,6 @@ -use color_eyre::Result; -use serde_json::Value; +mod benchmark_core_adversarial_scoreboard; +mod benchmark_core_blocker_rows; +mod benchmark_core_capture_sources; +mod benchmark_core_smoke_report; -use crate::support; - -pub(super) fn assert_tracked_external_blocker_row( - row: &Value, - product_name: &str, - same_corpus: bool, -) -> Result<()> { - assert_eq!(row.pointer("/product_name").and_then(Value::as_str), Some(product_name)); - assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("blocked")); - assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); - assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(false)); - assert_eq!(row.pointer("/same_corpus").and_then(Value::as_bool), Some(same_corpus)); - assert_eq!(row.pointer("/source_id_mapped").and_then(Value::as_bool), Some(false)); - assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(false)); - 
assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(false)); - assert_eq!(row.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); - assert_eq!(row.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); - assert!(row.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); - assert!(row.pointer("/metrics/retrieval/precision_at_k").is_some_and(Value::is_null)); - assert!(row.pointer("/metrics/retrieval/mrr").is_some_and(Value::is_null)); - assert!(row.pointer("/metrics/retrieval/ndcg").is_some_and(Value::is_null)); - assert!(support::array_contains_str( - row, - "/next_evidence", - "Map returned evidence to stable source ids." - )?); - assert!(support::array_contains_str( - row, - "/next_evidence", - "Run a Docker-contained product-runtime adapter for this row." - )?); - assert!(support::array_contains_str( - row, - "/next_evidence", - "Record container image digest evidence." - )?); - - if same_corpus { - assert!(!support::array_contains_str( - row, - "/next_evidence", - "Map this product to the same corpus." - )?); - } else { - assert!(support::array_contains_str( - row, - "/next_evidence", - "Map this product to the same corpus." 
- )?); - } - - Ok(()) -} - -#[test] -fn smoke_fixture_produces_typed_json_report() -> Result<()> { - let report = support::run_json_report()?; - - assert_eq!( - report.pointer("/schema").and_then(Value::as_str), - Some("elf.real_world_job_report/v1") - ); - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), - Some(26) - ); - assert_eq!( - report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), - Some(14) - ); - - let jobs = support::array_at(&report, "/jobs")?; - let job = support::find_by_field(jobs, "/job_id", "work-resume-stale-worktree-001")?; - - assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("work_resume")); - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(job.pointer("/latency_ms").and_then(Value::as_f64), Some(2.0)); - assert_eq!(job.pointer("/cost/amount").and_then(Value::as_f64), Some(0.0)); - - let expected_evidence = support::array_at(job, "/expected_evidence")?; - let produced_evidence = support::array_at(job, "/produced_evidence")?; - - assert_eq!(expected_evidence.len(), 2); - assert_eq!(produced_evidence.len(), 1); - assert_eq!(produced_evidence.first().and_then(Value::as_str), Some("xy844-current-worktree")); - - let suites = support::array_at(&report, "/suites")?; - let encoded_suite = support::find_by_field(suites, "/suite_id", "work_resume")?; 
- let capture_suite = support::find_by_field(suites, "/suite_id", "capture_integration")?; - let unencoded_suite = support::find_by_field(suites, "/suite_id", "retrieval")?; - - assert_eq!(encoded_suite.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(encoded_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(capture_suite.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(capture_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); - assert_eq!(unencoded_suite.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - - let capture_fixture_backed = support::array_at(&report, "/capture_integration/fixture_backed")?; - - assert!(capture_fixture_backed.iter().any(|value| { - value.as_str().is_some_and(|item| item.contains("agentmemory-style hook capture")) - })); - - let capture_not_encoded = support::array_at(&report, "/capture_integration/not_encoded")?; - - assert!(capture_not_encoded.iter().any(|value| { - value.as_str().is_some_and(|item| item.contains("No live external hook ingestion")) - })); - - Ok(()) -} - -#[test] -fn capture_integration_fixtures_score_redaction_and_source_ids() -> Result<()> { - let report = support::run_json_report_from(support::capture_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); - - let suites = support::array_at(&report, "/suites")?; - let capture = support::find_by_field(suites, "/suite_id", "capture_integration")?; - - assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); - 
assert_eq!(capture.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); - - let jobs = support::array_at(&report, "/jobs")?; - let source_id = support::find_by_field(jobs, "/job_id", "capture-source-id-binding-001")?; - let redaction = support::find_by_field(jobs, "/job_id", "capture-write-policy-redaction-001")?; - - assert!(support::array_contains_str( - source_id, - "/produced_evidence", - "source-id-release-summary" - )?); - assert!(support::array_contains_str(source_id, "/produced_evidence", "source-id-command-log")?); - assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert!( - redaction - .pointer("/produced_answer") - .and_then(Value::as_str) - .is_some_and(|answer| !answer.contains("orchid-envelope")) - ); - - Ok(()) -} - -#[test] -fn source_library_fixtures_score_saved_sources_without_memory_promotion() -> Result<()> { - let report = support::run_json_report_from(support::source_library_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); - - let suites = support::array_at(&report, "/suites")?; - let source_library = support::find_by_field(suites, "/suite_id", "source_library")?; - - assert_eq!(source_library.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(source_library.pointer("/encoded_job_count").and_then(Value::as_u64), Some(2)); - - let jobs = support::array_at(&report, "/jobs")?; - let long_doc = support::find_by_field(jobs, "/job_id", "source-library-long-doc-001")?; - let thread = support::find_by_field(jobs, "/job_id", "source-library-social-thread-001")?; - - assert!(support::array_contains_str(long_doc, "/produced_evidence", "article-source-record")?); 
- assert!(support::array_contains_str( - long_doc, - "/produced_evidence", - "article-hydrated-excerpt" - )?); - assert!(support::array_contains_str(thread, "/produced_evidence", "thread-source-record")?); - assert!(support::array_contains_str( - thread, - "/produced_evidence", - "thread-promotion-boundary" - )?); - assert!(long_doc.pointer("/produced_answer").and_then(Value::as_str).is_some_and(|answer| { - answer.contains("does not automatically create a durable Memory Note") - })); - assert!( - thread - .pointer("/produced_answer") - .and_then(Value::as_str) - .is_some_and(|answer| answer.contains("explicit add_note or reviewed promotion")) - ); - - Ok(()) -} - -#[test] -fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(2) - ); - assert_eq!( - report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(3) - ); - assert_eq!( - report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), - Some(1) - ); - - let result_states = support::string_array_at(&report, "/scoreboard/result_states")?; - let evidence_classes = support::string_array_at(&report, "/scoreboard/evidence_classes")?; 
- - assert_eq!( - result_states, - [ - "pass", - "wrong_result", - "incomplete", - "blocked", - "not_tested", - "not_encoded", - "not_comparable", - "unsupported_claim", - ] - .map(str::to_owned) - ); - assert_eq!( - evidence_classes, - ["fixture_backed", "live_baseline", "live_real_world", "research_gate"].map(str::to_owned) - ); - assert_eq!( - report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), - Some("typed_non_pass_present") - ); - assert_eq!( - report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), - Some("all_encoded_jobs_passed") - ); - assert_eq!( - report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), - Some(240) - ); - assert_eq!( - report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), - Some(240) - ); - assert_eq!( - support::string_array_at(&report, "/scoreboard/job_typed_non_pass_states_present")?, - Vec::<String>::new() - ); - - for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { - assert!(support::array_contains_str( - &report, - "/scoreboard/typed_non_pass_states_present", - state - )?); - assert!(support::array_contains_str( - &report, - "/scoreboard/external_adapter_typed_non_pass_states_present", - state - )?); - } - - assert_eq!( - report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - report.pointer("/scoreboard/evidence_class_counts/live_baseline").and_then(Value::as_u64), - Some(6) - ); - assert_eq!( - report.pointer("/scoreboard/metric_basis").and_then(Value::as_str), - Some("produced_evidence_order") - ); - assert_eq!(report.pointer("/scoreboard/retrieval_k").and_then(Value::as_u64), Some(5)); - - assert_scoreboard_rows_expose_quantitative_and_blocker_contract(&report)?; - - let suites = support::array_at(&report, "/suites")?; - let adversarial = 
support::find_by_field(suites, "/suite_id", "adversarial_quality")?; - - assert_eq!(adversarial.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(adversarial.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); - - Ok(()) -} - -fn assert_scoreboard_rows_expose_quantitative_and_blocker_contract(report: &Value) -> Result<()> { - let rows = support::array_at(report, "/scoreboard/rows")?; - let elf = support::find_by_field(rows, "/product_id", "elf_current_report")?; - let qmd = support::find_by_field(rows, "/product_id", "qmd")?; - let pageindex = support::find_by_field(rows, "/product_id", "vectifyai_pageindex")?; - let openkb = support::find_by_field(rows, "/product_id", "vectifyai_openkb")?; - let honcho = support::find_by_field(rows, "/product_id", "plastic_labs_honcho")?; - - assert_eq!(rows.len(), 20); - assert_eq!(elf.pointer("/product_name").and_then(Value::as_str), Some("ELF")); - assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!(elf.pointer("/result_state").and_then(Value::as_str), Some("not_comparable")); - assert_eq!(elf.pointer("/comparable").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); - assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); - assert_eq!(elf.pointer("/held_out").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/leakage_audited").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); - assert_eq!( - elf.pointer("/metrics/retrieval/metric_basis").and_then(Value::as_str), - Some("produced_evidence_order") - ); - assert_eq!(elf.pointer("/metrics/retrieval/k").and_then(Value::as_u64), Some(5)); - 
assert!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64).is_some()); - assert!(elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64).is_some()); - assert!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64).is_some()); - assert!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64).is_some()); - assert_eq!( - elf.pointer("/metrics/lifecycle/stale_suppression").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - elf.pointer("/metrics/coverage/source_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert!(support::array_contains_str( - elf, - "/next_evidence", - "Run a Docker-contained product-runtime adapter for this row." - )?); - assert!(support::array_contains_str( - elf, - "/next_evidence", - "Record container image digest evidence." - )?); - assert_eq!(qmd.pointer("/product_name").and_then(Value::as_str), Some("qmd")); - assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); - assert_eq!(qmd.pointer("/comparable").and_then(Value::as_bool), Some(false)); - assert_eq!(qmd.pointer("/product_runtime").and_then(Value::as_bool), Some(true)); - assert_eq!(qmd.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); - assert!(qmd.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); - assert!(support::array_contains_str( - qmd, - "/next_evidence", - "Record container image digest evidence." 
- )?); - - assert_tracked_external_blocker_row(pageindex, "VectifyAI PageIndex", true)?; - assert_tracked_external_blocker_row(openkb, "VectifyAI OpenKB", true)?; - assert_tracked_external_blocker_row(honcho, "plastic-labs Honcho", false)?; - - Ok(()) -} +pub(super) use benchmark_core_blocker_rows::assert_tracked_external_blocker_row; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_adversarial_scoreboard.rs b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_adversarial_scoreboard.rs new file mode 100644 index 00000000..7825b6fe --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_adversarial_scoreboard.rs @@ -0,0 +1,177 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn adversarial_quality_fixtures_score_scoreboard_gates() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report.pointer("/summary/history_readback_encoded_count").and_then(Value::as_u64), + Some(1) + ); + + let result_states = support::string_array_at(&report, 
"/scoreboard/result_states")?; + let evidence_classes = support::string_array_at(&report, "/scoreboard/evidence_classes")?; + + assert_eq!( + result_states, + [ + "pass", + "wrong_result", + "incomplete", + "blocked", + "not_tested", + "not_encoded", + "not_comparable", + "unsupported_claim", + ] + .map(str::to_owned) + ); + assert_eq!( + evidence_classes, + ["fixture_backed", "live_baseline", "live_real_world", "research_gate"].map(str::to_owned) + ); + assert_eq!( + report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), + Some("all_encoded_jobs_passed") + ); + assert_eq!( + report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), + Some(240) + ); + assert_eq!( + report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), + Some(240) + ); + assert_eq!( + support::string_array_at(&report, "/scoreboard/job_typed_non_pass_states_present")?, + Vec::<String>::new() + ); + + for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { + assert!(support::array_contains_str( + &report, + "/scoreboard/typed_non_pass_states_present", + state + )?); + assert!(support::array_contains_str( + &report, + "/scoreboard/external_adapter_typed_non_pass_states_present", + state + )?); + } + + assert_eq!( + report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report.pointer("/scoreboard/evidence_class_counts/live_baseline").and_then(Value::as_u64), + Some(6) + ); + assert_eq!( + report.pointer("/scoreboard/metric_basis").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!(report.pointer("/scoreboard/retrieval_k").and_then(Value::as_u64), Some(5)); + + 
assert_scoreboard_rows_expose_quantitative_and_blocker_contract(&report)?; + + let suites = support::array_at(&report, "/suites")?; + let adversarial = support::find_by_field(suites, "/suite_id", "adversarial_quality")?; + + assert_eq!(adversarial.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adversarial.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + + Ok(()) +} + +fn assert_scoreboard_rows_expose_quantitative_and_blocker_contract(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/scoreboard/rows")?; + let elf = support::find_by_field(rows, "/product_id", "elf_current_report")?; + let qmd = support::find_by_field(rows, "/product_id", "qmd")?; + let pageindex = support::find_by_field(rows, "/product_id", "vectifyai_pageindex")?; + let openkb = support::find_by_field(rows, "/product_id", "vectifyai_openkb")?; + let honcho = support::find_by_field(rows, "/product_id", "plastic_labs_honcho")?; + + assert_eq!(rows.len(), 20); + assert_eq!(elf.pointer("/product_name").and_then(Value::as_str), Some("ELF")); + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(elf.pointer("/result_state").and_then(Value::as_str), Some("not_comparable")); + assert_eq!(elf.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/held_out").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/leakage_audited").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); + assert_eq!( + elf.pointer("/metrics/retrieval/metric_basis").and_then(Value::as_str), + Some("produced_evidence_order") + ); + 
assert_eq!(elf.pointer("/metrics/retrieval/k").and_then(Value::as_u64), Some(5)); + assert!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64).is_some()); + assert!(elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64).is_some()); + assert!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64).is_some()); + assert!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64).is_some()); + assert_eq!( + elf.pointer("/metrics/lifecycle/stale_suppression").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/coverage/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert!(support::array_contains_str( + elf, + "/next_evidence", + "Run a Docker-contained product-runtime adapter for this row." + )?); + assert!(support::array_contains_str( + elf, + "/next_evidence", + "Record container image digest evidence." + )?); + assert_eq!(qmd.pointer("/product_name").and_then(Value::as_str), Some("qmd")); + assert_eq!(qmd.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(qmd.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(qmd.pointer("/product_runtime").and_then(Value::as_bool), Some(true)); + assert_eq!(qmd.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); + assert!(qmd.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); + assert!(support::array_contains_str( + qmd, + "/next_evidence", + "Record container image digest evidence." 
+ )?); + + crate::assert_tracked_external_blocker_row(pageindex, "VectifyAI PageIndex", true)?; + crate::assert_tracked_external_blocker_row(openkb, "VectifyAI OpenKB", true)?; + crate::assert_tracked_external_blocker_row(honcho, "plastic-labs Honcho", false)?; + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_blocker_rows.rs b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_blocker_rows.rs new file mode 100644 index 00000000..cd8badfe --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_blocker_rows.rs @@ -0,0 +1,56 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(crate) fn assert_tracked_external_blocker_row( + row: &Value, + product_name: &str, + same_corpus: bool, +) -> Result<()> { + assert_eq!(row.pointer("/product_name").and_then(Value::as_str), Some(product_name)); + assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("research_gate")); + assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/same_corpus").and_then(Value::as_bool), Some(same_corpus)); + assert_eq!(row.pointer("/source_id_mapped").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/container_digest_identified").and_then(Value::as_bool), Some(false)); + assert!(row.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); + assert!(row.pointer("/metrics/retrieval/precision_at_k").is_some_and(Value::is_null)); + assert!(row.pointer("/metrics/retrieval/mrr").is_some_and(Value::is_null)); + 
assert!(row.pointer("/metrics/retrieval/ndcg").is_some_and(Value::is_null)); + assert!(support::array_contains_str( + row, + "/next_evidence", + "Map returned evidence to stable source ids." + )?); + assert!(support::array_contains_str( + row, + "/next_evidence", + "Run a Docker-contained product-runtime adapter for this row." + )?); + assert!(support::array_contains_str( + row, + "/next_evidence", + "Record container image digest evidence." + )?); + + if same_corpus { + assert!(!support::array_contains_str( + row, + "/next_evidence", + "Map this product to the same corpus." + )?); + } else { + assert!(support::array_contains_str( + row, + "/next_evidence", + "Map this product to the same corpus." + )?); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_capture_sources.rs b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_capture_sources.rs new file mode 100644 index 00000000..d5b982b4 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_capture_sources.rs @@ -0,0 +1,85 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn capture_integration_fixtures_score_redaction_and_source_ids() -> Result<()> { + let report = support::run_json_report_from(support::capture_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = support::array_at(&report, "/suites")?; + let capture = support::find_by_field(suites, "/suite_id", "capture_integration")?; + + assert_eq!(capture.pointer("/status").and_then(Value::as_str), Some("pass")); + 
assert_eq!(capture.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = support::array_at(&report, "/jobs")?; + let source_id = support::find_by_field(jobs, "/job_id", "capture-source-id-binding-001")?; + let redaction = support::find_by_field(jobs, "/job_id", "capture-write-policy-redaction-001")?; + + assert!(support::array_contains_str( + source_id, + "/produced_evidence", + "source-id-release-summary" + )?); + assert!(support::array_contains_str(source_id, "/produced_evidence", "source-id-command-log")?); + assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert!( + redaction + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|answer| !answer.contains("orchid-envelope")) + ); + + Ok(()) +} + +#[test] +fn source_library_fixtures_score_saved_sources_without_memory_promotion() -> Result<()> { + let report = support::run_json_report_from(support::source_library_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + + let suites = support::array_at(&report, "/suites")?; + let source_library = support::find_by_field(suites, "/suite_id", "source_library")?; + + assert_eq!(source_library.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(source_library.pointer("/encoded_job_count").and_then(Value::as_u64), Some(2)); + + let jobs = support::array_at(&report, "/jobs")?; + let long_doc = support::find_by_field(jobs, "/job_id", "source-library-long-doc-001")?; + let thread = support::find_by_field(jobs, "/job_id", "source-library-social-thread-001")?; + + assert!(support::array_contains_str(long_doc, "/produced_evidence", "article-source-record")?); 
+ assert!(support::array_contains_str( + long_doc, + "/produced_evidence", + "article-hydrated-excerpt" + )?); + assert!(support::array_contains_str(thread, "/produced_evidence", "thread-source-record")?); + assert!(support::array_contains_str( + thread, + "/produced_evidence", + "thread-promotion-boundary" + )?); + assert!(long_doc.pointer("/produced_answer").and_then(Value::as_str).is_some_and(|answer| { + answer.contains("does not automatically create a durable Memory Note") + })); + assert!( + thread + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|answer| answer.contains("explicit add_note or reviewed promotion")) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_smoke_report.rs b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_smoke_report.rs new file mode 100644 index 00000000..a771ac3b --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/benchmark_core_smoke_report.rs @@ -0,0 +1,71 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn smoke_fixture_produces_typed_json_report() -> Result<()> { + let report = support::run_json_report()?; + + assert_eq!( + report.pointer("/schema").and_then(Value::as_str), + Some("elf.real_world_job_report/v1") + ); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/external_adapters/summary/adapter_count").and_then(Value::as_u64), + Some(26) + ); + assert_eq!( + report.pointer("/external_adapters/summary/live_real_world_count").and_then(Value::as_u64), + Some(5) + ); + 
assert_eq!( + report.pointer("/external_adapters/summary/research_gate_count").and_then(Value::as_u64), + Some(14) + ); + + let jobs = support::array_at(&report, "/jobs")?; + let job = support::find_by_field(jobs, "/job_id", "work-resume-stale-worktree-001")?; + + assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("work_resume")); + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(job.pointer("/latency_ms").and_then(Value::as_f64), Some(2.0)); + assert_eq!(job.pointer("/cost/amount").and_then(Value::as_f64), Some(0.0)); + + let expected_evidence = support::array_at(job, "/expected_evidence")?; + let produced_evidence = support::array_at(job, "/produced_evidence")?; + + assert_eq!(expected_evidence.len(), 2); + assert_eq!(produced_evidence.len(), 1); + assert_eq!(produced_evidence.first().and_then(Value::as_str), Some("xy844-current-worktree")); + + let suites = support::array_at(&report, "/suites")?; + let encoded_suite = support::find_by_field(suites, "/suite_id", "work_resume")?; + let capture_suite = support::find_by_field(suites, "/suite_id", "capture_integration")?; + let unencoded_suite = support::find_by_field(suites, "/suite_id", "retrieval")?; + + assert_eq!(encoded_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(encoded_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(capture_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(capture_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(1)); + assert_eq!(unencoded_suite.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + + let capture_fixture_backed = support::array_at(&report, "/capture_integration/fixture_backed")?; + + assert!(capture_fixture_backed.iter().any(|value| { + value.as_str().is_some_and(|item| item.contains("agentmemory-style hook capture")) + })); + + let capture_not_encoded = support::array_at(&report, 
"/capture_integration/not_encoded")?; + + assert!(capture_not_encoded.iter().any(|value| { + value.as_str().is_some_and(|item| item.contains("No live external hook ingestion")) + })); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary.rs index 47d6dd8e..dc6657c1 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary.rs @@ -1,377 +1,18 @@ +mod root_aggregate_summary_counts; +mod root_aggregate_summary_scoreboard; +mod root_aggregate_summary_suite_summaries; + use color_eyre::Result; use serde_json::Value; -use crate::support; - pub(crate) fn assert_root_aggregate_summary(report: &Value) -> Result<()> { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); - assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(19)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(75)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), - Some(0.0) - ); - assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); - 
assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(11) - ); - assert_eq!( - report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(16) - ); - assert_eq!( - report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), - Some(3) - ); - assert_eq!( - report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), - Some(3) - ); - assert_eq!( - report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(180) - ); - assert_eq!( - report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(180) - ); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!( - report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), - Some(0) - ); - - assert_root_scoreboard_summary(report)?; - - assert_eq!( - report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - 
report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/consolidation/proposal_unsupported_claim_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/memory_summary/job_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - - assert_root_knowledge_summary(report); - assert_root_proactive_brief_summary(report); - assert_root_scheduled_memory_summary(report); - assert_root_work_continuity_summary(report); - - Ok(()) -} - -fn assert_root_scoreboard_summary(report: &Value) -> Result<()> { - assert_eq!( - report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), - Some("typed_non_pass_present") - ); - assert_eq!( - report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), - Some("typed_non_pass_present") - ); - assert_eq!( - report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), - Some(7) - ); - assert_eq!( - report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), - Some(240) - ); - assert_eq!( - report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), - Some(247) - ); - assert_eq!( - report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), - Some(false) - ); - assert!(support::array_contains_str(report, "/scoreboard/result_states", "not_comparable")?); - assert_eq!( - report.pointer("/scoreboard/metric_basis").and_then(Value::as_str), - Some("produced_evidence_order") - ); - assert_eq!(report.pointer("/scoreboard/retrieval_k").and_then(Value::as_u64), Some(5)); - - assert_root_scoreboard_rows(report)?; - - for state in ["blocked", "incomplete", "not_encoded", "not_tested", 
"wrong_result"] { - assert!(support::array_contains_str( - report, - "/scoreboard/typed_non_pass_states_present", - state - )?); - } - - assert_eq!( - support::string_array_at(report, "/scoreboard/job_typed_non_pass_states_present")?, - ["blocked"].map(str::to_owned) - ); - - for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { - assert!(support::array_contains_str( - report, - "/scoreboard/external_adapter_typed_non_pass_states_present", - state - )?); - } - - Ok(()) -} - -fn assert_root_scoreboard_rows(report: &Value) -> Result<()> { - let rows = support::array_at(report, "/scoreboard/rows")?; - let elf = support::find_by_field(rows, "/product_id", "elf_current_report")?; - let qmd = support::find_by_field(rows, "/product_id", "qmd")?; - let graphify = support::find_by_field(rows, "/product_id", "graphify")?; - let pageindex = support::find_by_field(rows, "/product_id", "vectifyai_pageindex")?; - let openkb = support::find_by_field(rows, "/product_id", "vectifyai_openkb")?; - let honcho = support::find_by_field(rows, "/product_id", "plastic_labs_honcho")?; - - assert_eq!(rows.len(), 20); - assert_eq!(elf.pointer("/result_state").and_then(Value::as_str), Some("blocked")); - assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!(elf.pointer("/comparable").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); - assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); - assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64), Some(0.988)); - assert_eq!( - elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64), - Some(0.415) - ); - assert_eq!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64), Some(0.988)); - 
assert_eq!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64), Some(0.985)); - assert_eq!( - elf.pointer("/metrics/lifecycle/stale_suppression").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - elf.pointer("/metrics/lifecycle/update_correctness").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - elf.pointer("/metrics/lifecycle/delete_correctness").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - elf.pointer("/metrics/coverage/typed_non_pass_count").and_then(Value::as_u64), - Some(7) - ); - assert!(support::array_contains_str( - elf, - "/next_evidence", - "Run a Docker-contained product-runtime adapter for this row." - )?); - - for competitor in [qmd, graphify] { - assert_eq!( - competitor.pointer("/evidence_class").and_then(Value::as_str), - Some("live_real_world") - ); - assert_eq!( - competitor.pointer("/result_state").and_then(Value::as_str), - Some("wrong_result") - ); - assert_eq!(competitor.pointer("/product_runtime").and_then(Value::as_bool), Some(true)); - assert_eq!( - competitor.pointer("/container_digest_identified").and_then(Value::as_bool), - Some(false) - ); - assert!(competitor.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); - assert!(support::array_contains_str( - competitor, - "/next_evidence", - "Record container image digest evidence." 
- )?); - } - - crate::assert_tracked_external_blocker_row(pageindex, "VectifyAI PageIndex", true)?; - crate::assert_tracked_external_blocker_row(openkb, "VectifyAI OpenKB", true)?; - crate::assert_tracked_external_blocker_row(honcho, "plastic-labs Honcho", false)?; + root_aggregate_summary_counts::assert_root_summary_counts(report); + root_aggregate_summary_scoreboard::assert_root_scoreboard_summary(report)?; + root_aggregate_summary_counts::assert_root_consolidation_summary(report); + root_aggregate_summary_suite_summaries::assert_root_knowledge_summary(report); + root_aggregate_summary_suite_summaries::assert_root_proactive_brief_summary(report); + root_aggregate_summary_suite_summaries::assert_root_scheduled_memory_summary(report); + root_aggregate_summary_suite_summaries::assert_root_work_continuity_summary(report); Ok(()) } - -fn assert_root_knowledge_summary(report: &Value) { - assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(5)); - assert_eq!( - report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), - Some(0.979) - ); -} - -fn assert_root_proactive_brief_summary(report: &Value) { - assert_eq!( - report.pointer("/summary/proactive_brief/job_count").and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/proactive_brief/action_rationale_coverage") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/proactive_brief/invalid_current_suggestion_count") - .and_then(Value::as_u64), - Some(0) - 
); - assert_eq!( - report - .pointer("/summary/proactive_brief/tombstone_violation_count") - .and_then(Value::as_u64), - Some(0) - ); -} - -fn assert_root_scheduled_memory_summary(report: &Value) { - assert_eq!( - report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), - Some(4) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/scheduled_memory/action_rationale_coverage") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/scheduled_memory/invalid_current_output_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/scheduled_memory/tombstone_violation_count") - .and_then(Value::as_u64), - Some(0) - ); -} - -fn assert_root_work_continuity_summary(report: &Value) { - assert_eq!( - report.pointer("/summary/work_continuity/job_count").and_then(Value::as_u64), - Some(8) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/reset_resume_success_rate") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/decision_rationale_recall_rate") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/rejected_option_suppression_rate") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/inferred_step_instruction_count") - 
.and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/sensitive_marker_persistence_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/janitor_false_promotion_count") - .and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/work_continuity/journal_only_authority_claim_count") - .and_then(Value::as_u64), - Some(0) - ); -} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs new file mode 100644 index 00000000..afd26c4d --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs @@ -0,0 +1,95 @@ +use serde_json::Value; + +pub(crate) fn assert_root_summary_counts(report: &Value) { + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); + assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(19)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(75)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); + 
assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(16) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), + Some(3) + ); + assert_eq!( + report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), + Some(180) + ); + assert_eq!( + report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), + Some(180) + ); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(0) + ); +} + +pub(crate) fn assert_root_consolidation_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + 
report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/consolidation/proposal_unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/job_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/memory_summary/invalid_top_of_mind_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/memory_summary/source_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs new file mode 100644 index 00000000..c3735856 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs @@ -0,0 +1,136 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(crate) fn assert_root_scoreboard_summary(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/scoreboard/summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + report.pointer("/scoreboard/job_summary_claim").and_then(Value::as_str), + Some("typed_non_pass_present") + ); + assert_eq!( + report.pointer("/scoreboard/job_typed_non_pass_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report.pointer("/scoreboard/external_adapter_typed_non_pass_count").and_then(Value::as_u64), + Some(240) + ); + assert_eq!( + report.pointer("/scoreboard/typed_non_pass_count").and_then(Value::as_u64), + Some(247) + ); + assert_eq!( + report.pointer("/scoreboard/unqualified_win_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + assert!(support::array_contains_str(report, "/scoreboard/result_states", "not_comparable")?); + assert_eq!( + report.pointer("/scoreboard/metric_basis").and_then(Value::as_str), + 
Some("produced_evidence_order") + ); + assert_eq!(report.pointer("/scoreboard/retrieval_k").and_then(Value::as_u64), Some(5)); + + assert_root_scoreboard_rows(report)?; + + for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { + assert!(support::array_contains_str( + report, + "/scoreboard/typed_non_pass_states_present", + state + )?); + } + + assert_eq!( + support::string_array_at(report, "/scoreboard/job_typed_non_pass_states_present")?, + ["blocked"].map(str::to_owned) + ); + + for state in ["blocked", "incomplete", "not_encoded", "not_tested", "wrong_result"] { + assert!(support::array_contains_str( + report, + "/scoreboard/external_adapter_typed_non_pass_states_present", + state + )?); + } + + Ok(()) +} + +fn assert_root_scoreboard_rows(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/scoreboard/rows")?; + let elf = support::find_by_field(rows, "/product_id", "elf_current_report")?; + let qmd = support::find_by_field(rows, "/product_id", "qmd")?; + let graphify = support::find_by_field(rows, "/product_id", "graphify")?; + let pageindex = support::find_by_field(rows, "/product_id", "vectifyai_pageindex")?; + let openkb = support::find_by_field(rows, "/product_id", "vectifyai_openkb")?; + let honcho = support::find_by_field(rows, "/product_id", "plastic_labs_honcho")?; + + assert_eq!(rows.len(), 20); + assert_eq!(elf.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert_eq!(elf.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(elf.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); + assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); + assert_eq!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64), Some(0.988)); + 
assert_eq!( + elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64), + Some(0.415) + ); + assert_eq!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64), Some(0.988)); + assert_eq!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64), Some(0.985)); + assert_eq!( + elf.pointer("/metrics/lifecycle/stale_suppression").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/lifecycle/update_correctness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/lifecycle/delete_correctness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + elf.pointer("/metrics/coverage/typed_non_pass_count").and_then(Value::as_u64), + Some(7) + ); + assert!(support::array_contains_str( + elf, + "/next_evidence", + "Run a Docker-contained product-runtime adapter for this row." + )?); + + for competitor in [qmd, graphify] { + assert_eq!( + competitor.pointer("/evidence_class").and_then(Value::as_str), + Some("live_real_world") + ); + assert_eq!( + competitor.pointer("/result_state").and_then(Value::as_str), + Some("wrong_result") + ); + assert_eq!(competitor.pointer("/product_runtime").and_then(Value::as_bool), Some(true)); + assert_eq!( + competitor.pointer("/container_digest_identified").and_then(Value::as_bool), + Some(false) + ); + assert!(competitor.pointer("/metrics/retrieval/recall_at_k").is_some_and(Value::is_null)); + assert!(support::array_contains_str( + competitor, + "/next_evidence", + "Record container image digest evidence." 
+ )?); + } + + crate::assert_tracked_external_blocker_row(pageindex, "VectifyAI PageIndex", true)?; + crate::assert_tracked_external_blocker_row(openkb, "VectifyAI OpenKB", true)?; + crate::assert_tracked_external_blocker_row(honcho, "plastic-labs Honcho", false)?; + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_suite_summaries.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_suite_summaries.rs new file mode 100644 index 00000000..f9a188ca --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_suite_summaries.rs @@ -0,0 +1,141 @@ +use serde_json::Value; + +pub(crate) fn assert_root_knowledge_summary(report: &Value) { + assert_eq!(report.pointer("/summary/knowledge/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), + Some(0.979) + ); +} + +pub(crate) fn assert_root_proactive_brief_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/proactive_brief/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/suggestion_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/proactive_brief/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/invalid_current_suggestion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/proactive_brief/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); +} + +pub(crate) fn 
assert_root_scheduled_memory_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/scheduled_memory/job_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/task_run_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/output_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/scheduled_memory/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/invalid_current_output_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/scheduled_memory/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); +} + +pub(crate) fn assert_root_work_continuity_summary(report: &Value) { + assert_eq!( + report.pointer("/summary/work_continuity/job_count").and_then(Value::as_u64), + Some(8) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/reset_resume_success_rate") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/decision_rationale_recall_rate") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/rejected_option_suppression_rate") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/inferred_step_instruction_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/sensitive_marker_persistence_count") + 
.and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/janitor_false_promotion_count") + .and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/work_continuity/journal_only_authority_claim_count") + .and_then(Value::as_u64), + Some(0) + ); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/relation_context.rs b/packages/elf-service/tests/acceptance/chunk_search/relation_context.rs index 4027a3d3..1d056688 100644 --- a/packages/elf-service/tests/acceptance/chunk_search/relation_context.rs +++ b/packages/elf-service/tests/acceptance/chunk_search/relation_context.rs @@ -1,397 +1,4 @@ -use sqlx::PgExecutor; -use time::{Duration, OffsetDateTime}; -use uuid::Uuid; - -use crate::acceptance::{ - self, StubRerank, - chunk_search::tests_helpers::{self, TestContext}, -}; -use elf_service::{ElfService, Providers, RelationTemporalStatus, SearchRequest}; - -async fn insert_graph_entity<'e, E>( - executor: E, - entity_id: Uuid, - canonical: &str, - kind: Option<&str>, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO graph_entities ( - entity_id, - tenant_id, - project_id, - canonical, - canonical_norm, - kind -) -VALUES ($1, $2, $3, $4, $5, $6)", - ) - .bind(entity_id) - .bind("t") - .bind("p") - .bind(canonical) - .bind(canonical.to_lowercase()) - .bind(kind) - .execute(executor) - .await - .expect("Failed to insert graph entity."); -} - -async fn insert_graph_predicate<'e, E>(executor: E, predicate_id: Uuid, canonical: &str) -where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO graph_predicates ( - predicate_id, - scope_key, - tenant_id, - project_id, - canonical, - canonical_norm, - cardinality, - status -) -VALUES ($1, $2, $3, $4, $5, $6, 'single', 'active')", - ) - .bind(predicate_id) - .bind("__project__:p") - .bind("t") - .bind("p") - .bind(canonical) - .bind(canonical.to_lowercase()) - .execute(executor) - .await - .expect("Failed to insert graph 
predicate."); -} - -#[allow(clippy::too_many_arguments)] -async fn insert_graph_fact<'e, E>( - executor: E, - fact_id: Uuid, - subject_entity_id: Uuid, - predicate: &str, - predicate_id: Uuid, - object_value: &str, - valid_from: OffsetDateTime, - valid_to: Option, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO graph_facts ( - fact_id, - tenant_id, - project_id, - agent_id, - scope, - subject_entity_id, - predicate, - predicate_id, - object_entity_id, - object_value, - valid_from, - valid_to -) -VALUES ($1, $2, $3, $4, $5, $6, $7, $8, NULL, $9, $10, $11)", - ) - .bind(fact_id) - .bind("t") - .bind("p") - .bind("a") - .bind("agent_private") - .bind(subject_entity_id) - .bind(predicate) - .bind(predicate_id) - .bind(object_value) - .bind(valid_from) - .bind(valid_to) - .execute(executor) - .await - .expect("Failed to insert graph fact."); -} - -async fn insert_graph_fact_evidence<'e, E>( - executor: E, - fact_id: Uuid, - note_id: Uuid, - created_at: OffsetDateTime, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO graph_fact_evidence (evidence_id, fact_id, note_id, created_at) -VALUES ($1, $2, $3, $4)", - ) - .bind(Uuid::new_v4()) - .bind(fact_id) - .bind(note_id) - .bind(created_at) - .execute(executor) - .await - .expect("Failed to insert graph fact evidence."); -} - -async fn setup_graph_context_test( - test_name: &str, - providers: Providers, - max_facts_per_item: u32, - max_evidence_notes_per_fact: u32, -) -> Option { - let Some(test_db) = acceptance::test_db().await else { - eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); - - return None; - }; - let Some(qdrant_url) = acceptance::test_qdrant_url() else { - eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); - - return None; - }; - let collection = test_db.collection_name("elf_acceptance"); - let docs_collection = test_db.collection_name("elf_acceptance_docs"); - let mut cfg = acceptance::test_config( - test_db.dsn().to_string(), - 
qdrant_url, - 4_096, - collection, - docs_collection, - ); - - cfg.search.graph_context.enabled = true; - cfg.search.graph_context.max_facts_per_item = max_facts_per_item; - cfg.search.graph_context.max_evidence_notes_per_fact = max_evidence_notes_per_fact; - - let service = - acceptance::build_service(cfg, providers).await.expect("Failed to build service."); - - acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - tests_helpers::reset_collection(&service).await; - - let embedding_version = format!( - "{}:{}:{}", - service.cfg.providers.embedding.provider_id, - service.cfg.providers.embedding.model, - service.cfg.storage.qdrant.vector_dim - ); - - Some(TestContext { service, test_db, embedding_version }) -} - -async fn seed_relation_context_fixture( - service: &ElfService, - embedding_version: &str, -) -> (Uuid, Uuid, Uuid) { - let now = OffsetDateTime::now_utc(); - let note_id = Uuid::new_v4(); - let note_id_2 = Uuid::new_v4(); - let chunk_id = Uuid::new_v4(); - let chunk_text = "Alice mentors Bob about projects and priorities."; - let subject_id = Uuid::new_v4(); - let newer_fact_id = Uuid::new_v4(); - let predicate_id = Uuid::new_v4(); - let older_fact_id = Uuid::new_v4(); - let older_fact_valid_from = now - Duration::seconds(10); - let newer_fact_valid_from = now - Duration::seconds(5); - let note_1_evidence_created_at = now - Duration::seconds(30); - let note_2_evidence_created_at = now - Duration::seconds(10); - - tests_helpers::insert_note(&service.db.pool, note_id, chunk_text, embedding_version).await; - tests_helpers::insert_note( - &service.db.pool, - note_id_2, - "Second note for evidence ordering.", - embedding_version, - ) - .await; - tests_helpers::insert_chunk( - &service.db.pool, - chunk_id, - note_id, - 0, - 0, - chunk_text.len() as i32, - chunk_text, - embedding_version, - ) - .await; - tests_helpers::upsert_point( - service, - chunk_id, - note_id, - 0, - 0, - chunk_text.len() as i32, - chunk_text, - ) - .await; 
- - insert_graph_entity(&service.db.pool, subject_id, "Alice", Some("person")).await; - insert_graph_predicate(&service.db.pool, predicate_id, "mentors").await; - insert_graph_fact( - &service.db.pool, - older_fact_id, - subject_id, - "mentors", - predicate_id, - "Bob", - older_fact_valid_from, - Some(newer_fact_valid_from), - ) - .await; - insert_graph_fact_evidence( - &service.db.pool, - older_fact_id, - note_id, - note_1_evidence_created_at, - ) - .await; - insert_graph_fact( - &service.db.pool, - newer_fact_id, - subject_id, - "mentors", - predicate_id, - "Carol", - newer_fact_valid_from, - None, - ) - .await; - insert_graph_fact_evidence( - &service.db.pool, - newer_fact_id, - note_id, - note_1_evidence_created_at, - ) - .await; - insert_graph_fact_evidence( - &service.db.pool, - newer_fact_id, - note_id_2, - note_2_evidence_created_at, - ) - .await; - - (note_id, newer_fact_id, older_fact_id) -} - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_raw_quick_includes_relation_context_and_respects_fact_bounds() { - let providers = tests_helpers::build_providers(StubRerank); - let Some(context) = setup_graph_context_test( - "search_raw_quick_includes_relation_context_and_respects_fact_bounds", - providers, - 1, - 1, - ) - .await - else { - return; - }; - let fixture = seed_relation_context_fixture(&context.service, &context.embedding_version).await; - let note_id = fixture.0; - let newer_fact_id = fixture.1; - let response = context - .service - .search_raw_quick(SearchRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "Alice".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search failed."); - let item = 
response.items.first().expect("Expected search result."); - let relation_context = item - .explain - .relation_context - .as_ref() - .expect("Expected relation context in search explain."); - - assert_eq!(relation_context.len(), 1, "Expected relation context to be truncated to one fact."); - assert_eq!( - relation_context[0].fact_id, newer_fact_id, - "Expected the most recent fact after truncation." - ); - assert_eq!(relation_context[0].object.value.as_deref(), Some("Carol")); - assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); - assert!(relation_context[0].valid_to.is_none()); - assert_eq!(relation_context[0].evidence_note_ids.len(), 1); - assert_eq!(relation_context[0].evidence_note_ids[0], note_id); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_raw_quick_marks_historical_relation_context() { - let providers = tests_helpers::build_providers(StubRerank); - let Some(context) = setup_graph_context_test( - "search_raw_quick_marks_historical_relation_context", - providers, - 2, - 2, - ) - .await - else { - return; - }; - let fixture = seed_relation_context_fixture(&context.service, &context.embedding_version).await; - let older_fact_id = fixture.2; - let response = context - .service - .search_raw_quick(SearchRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "Alice".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); - let relation_context = item - .explain - .relation_context - .as_ref() - .expect("Expected relation context in search 
explain."); - - assert_eq!( - relation_context.len(), - 2, - "Expected current and historical relation facts in context.", - ); - assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); - - let historical = relation_context - .iter() - .find(|context| context.fact_id == older_fact_id) - .expect("Expected historical fact in relation context."); - - assert_eq!(historical.object.value.as_deref(), Some("Bob")); - assert_eq!(historical.temporal_status, RelationTemporalStatus::Historical); - assert!(historical.valid_to.is_some()); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} +mod fact_bounds; +mod fixture; +mod records; +mod temporal_status; diff --git a/packages/elf-service/tests/acceptance/chunk_search/relation_context/fact_bounds.rs b/packages/elf-service/tests/acceptance/chunk_search/relation_context/fact_bounds.rs new file mode 100644 index 00000000..f7a19863 --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/relation_context/fact_bounds.rs @@ -0,0 +1,60 @@ +use crate::acceptance::{ + StubRerank, + chunk_search::{relation_context::fixture, tests_helpers}, +}; +use elf_service::{RelationTemporalStatus, SearchRequest}; + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. 
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_raw_quick_includes_relation_context_and_respects_fact_bounds() { + let providers = tests_helpers::build_providers(StubRerank); + let Some(context) = fixture::setup_graph_context_test( + "search_raw_quick_includes_relation_context_and_respects_fact_bounds", + providers, + 1, + 1, + ) + .await + else { + return; + }; + let relation_fixture = + fixture::seed_relation_context_fixture(&context.service, &context.embedding_version).await; + let response = context + .service + .search_raw_quick(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Alice".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + let relation_context = item + .explain + .relation_context + .as_ref() + .expect("Expected relation context in search explain."); + + assert_eq!(relation_context.len(), 1, "Expected relation context to be truncated to one fact."); + assert_eq!( + relation_context[0].fact_id, relation_fixture.newer_fact_id, + "Expected the most recent fact after truncation." 
+ ); + assert_eq!(relation_context[0].object.value.as_deref(), Some("Carol")); + assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); + assert!(relation_context[0].valid_to.is_none()); + assert_eq!(relation_context[0].evidence_note_ids.len(), 1); + assert_eq!(relation_context[0].evidence_note_ids[0], relation_fixture.note_id); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/relation_context/fixture.rs b/packages/elf-service/tests/acceptance/chunk_search/relation_context/fixture.rs new file mode 100644 index 00000000..d6eb75f1 --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/relation_context/fixture.rs @@ -0,0 +1,159 @@ +use time::{Duration, OffsetDateTime}; +use uuid::Uuid; + +use crate::acceptance::{ + self, + chunk_search::{ + relation_context::records, + tests_helpers::{self, TestContext}, + }, +}; +use elf_service::{ElfService, Providers}; + +pub(super) struct RelationContextFixture { + pub(super) note_id: Uuid, + pub(super) newer_fact_id: Uuid, + pub(super) older_fact_id: Uuid, +} + +pub(super) async fn setup_graph_context_test( + test_name: &str, + providers: Providers, + max_facts_per_item: u32, + max_evidence_notes_per_fact: u32, +) -> Option<TestContext> { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let mut cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + + cfg.search.graph_context.enabled = true; + cfg.search.graph_context.max_facts_per_item =
max_facts_per_item; + cfg.search.graph_context.max_evidence_notes_per_fact = max_evidence_notes_per_fact; + + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + tests_helpers::reset_collection(&service).await; + + let embedding_version = format!( + "{}:{}:{}", + service.cfg.providers.embedding.provider_id, + service.cfg.providers.embedding.model, + service.cfg.storage.qdrant.vector_dim + ); + + Some(TestContext { service, test_db, embedding_version }) +} + +pub(super) async fn seed_relation_context_fixture( + service: &ElfService, + embedding_version: &str, +) -> RelationContextFixture { + let now = OffsetDateTime::now_utc(); + let note_id = Uuid::new_v4(); + let note_id_2 = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let chunk_text = "Alice mentors Bob about projects and priorities."; + let subject_id = Uuid::new_v4(); + let newer_fact_id = Uuid::new_v4(); + let predicate_id = Uuid::new_v4(); + let older_fact_id = Uuid::new_v4(); + let older_fact_valid_from = now - Duration::seconds(10); + let newer_fact_valid_from = now - Duration::seconds(5); + let note_1_evidence_created_at = now - Duration::seconds(30); + let note_2_evidence_created_at = now - Duration::seconds(10); + + tests_helpers::insert_note(&service.db.pool, note_id, chunk_text, embedding_version).await; + tests_helpers::insert_note( + &service.db.pool, + note_id_2, + "Second note for evidence ordering.", + embedding_version, + ) + .await; + tests_helpers::insert_chunk( + &service.db.pool, + chunk_id, + note_id, + 0, + 0, + chunk_text.len() as i32, + chunk_text, + embedding_version, + ) + .await; + tests_helpers::upsert_point( + service, + chunk_id, + note_id, + 0, + 0, + chunk_text.len() as i32, + chunk_text, + ) + .await; + records::insert_graph_entity(&service.db.pool, subject_id, "Alice", Some("person")).await; + 
records::insert_graph_predicate(&service.db.pool, predicate_id, "mentors").await; + records::insert_graph_fact( + &service.db.pool, + older_fact_id, + subject_id, + "mentors", + predicate_id, + "Bob", + older_fact_valid_from, + Some(newer_fact_valid_from), + ) + .await; + records::insert_graph_fact_evidence( + &service.db.pool, + older_fact_id, + note_id, + note_1_evidence_created_at, + ) + .await; + records::insert_graph_fact( + &service.db.pool, + newer_fact_id, + subject_id, + "mentors", + predicate_id, + "Carol", + newer_fact_valid_from, + None, + ) + .await; + records::insert_graph_fact_evidence( + &service.db.pool, + newer_fact_id, + note_id, + note_1_evidence_created_at, + ) + .await; + records::insert_graph_fact_evidence( + &service.db.pool, + newer_fact_id, + note_id_2, + note_2_evidence_created_at, + ) + .await; + + RelationContextFixture { note_id, newer_fact_id, older_fact_id } +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/relation_context/records.rs b/packages/elf-service/tests/acceptance/chunk_search/relation_context/records.rs new file mode 100644 index 00000000..3df39f0d --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/relation_context/records.rs @@ -0,0 +1,132 @@ +use sqlx::PgExecutor; +use time::OffsetDateTime; +use uuid::Uuid; + +pub(super) async fn insert_graph_entity<'e, E>( + executor: E, + entity_id: Uuid, + canonical: &str, + kind: Option<&str>, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_entities ( + entity_id, + tenant_id, + project_id, + canonical, + canonical_norm, + kind +) +VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(entity_id) + .bind("t") + .bind("p") + .bind(canonical) + .bind(canonical.to_lowercase()) + .bind(kind) + .execute(executor) + .await + .expect("Failed to insert graph entity."); +} + +pub(super) async fn insert_graph_predicate<'e, E>(executor: E, predicate_id: Uuid, canonical: &str) +where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO 
graph_predicates ( + predicate_id, + scope_key, + tenant_id, + project_id, + canonical, + canonical_norm, + cardinality, + status +) +VALUES ($1, $2, $3, $4, $5, $6, 'single', 'active')", + ) + .bind(predicate_id) + .bind("__project__:p") + .bind("t") + .bind("p") + .bind(canonical) + .bind(canonical.to_lowercase()) + .execute(executor) + .await + .expect("Failed to insert graph predicate."); +} + +#[allow(clippy::too_many_arguments)] +pub(super) async fn insert_graph_fact<'e, E>( + executor: E, + fact_id: Uuid, + subject_entity_id: Uuid, + predicate: &str, + predicate_id: Uuid, + object_value: &str, + valid_from: OffsetDateTime, + valid_to: Option<OffsetDateTime>, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_facts ( + fact_id, + tenant_id, + project_id, + agent_id, + scope, + subject_entity_id, + predicate, + predicate_id, + object_entity_id, + object_value, + valid_from, + valid_to +) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, NULL, $9, $10, $11)", + ) + .bind(fact_id) + .bind("t") + .bind("p") + .bind("a") + .bind("agent_private") + .bind(subject_entity_id) + .bind(predicate) + .bind(predicate_id) + .bind(object_value) + .bind(valid_from) + .bind(valid_to) + .execute(executor) + .await + .expect("Failed to insert graph fact."); +} + +pub(super) async fn insert_graph_fact_evidence<'e, E>( + executor: E, + fact_id: Uuid, + note_id: Uuid, + created_at: OffsetDateTime, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO graph_fact_evidence (evidence_id, fact_id, note_id, created_at) +VALUES ($1, $2, $3, $4)", + ) + .bind(Uuid::new_v4()) + .bind(fact_id) + .bind(note_id) + .bind(created_at) + .execute(executor) + .await + .expect("Failed to insert graph fact evidence."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/relation_context/temporal_status.rs b/packages/elf-service/tests/acceptance/chunk_search/relation_context/temporal_status.rs new file mode 100644 index 00000000..cce15a5d --- /dev/null +++
b/packages/elf-service/tests/acceptance/chunk_search/relation_context/temporal_status.rs @@ -0,0 +1,65 @@ +use crate::acceptance::{ + StubRerank, + chunk_search::{relation_context::fixture, tests_helpers}, +}; +use elf_service::{RelationTemporalStatus, SearchRequest}; + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_raw_quick_marks_historical_relation_context() { + let providers = tests_helpers::build_providers(StubRerank); + let Some(context) = fixture::setup_graph_context_test( + "search_raw_quick_marks_historical_relation_context", + providers, + 2, + 2, + ) + .await + else { + return; + }; + let relation_fixture = + fixture::seed_relation_context_fixture(&context.service, &context.embedding_version).await; + let response = context + .service + .search_raw_quick(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Alice".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + let relation_context = item + .explain + .relation_context + .as_ref() + .expect("Expected relation context in search explain."); + + assert_eq!( + relation_context.len(), + 2, + "Expected current and historical relation facts in context.", + ); + assert_eq!(relation_context[0].temporal_status, RelationTemporalStatus::Current); + + let historical = relation_context + .iter() + .find(|context| context.fact_id == relation_fixture.older_fact_id) + .expect("Expected historical fact in relation context."); + + assert_eq!(historical.object.value.as_deref(), Some("Bob")); + assert_eq!(historical.temporal_status, RelationTemporalStatus::Historical); + 
assert!(historical.valid_to.is_some()); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_core.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_core.rs index fec0e63b..170f27f7 100644 --- a/packages/elf-service/tests/acceptance/chunk_search/tests_core.rs +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_core.rs @@ -1,397 +1,3 @@ -use uuid::Uuid; - -use crate::acceptance::{ - StubRerank, - chunk_search::tests_helpers::{self, KeywordRerank}, -}; -use elf_service::{SearchDetailsRequest, SearchRequest, SearchTimelineRequest}; - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_returns_chunk_items() { - let providers = tests_helpers::build_providers(StubRerank); - let Some(context) = tests_helpers::setup_context("search_returns_chunk_items", providers).await - else { - return; - }; - let note_id = Uuid::new_v4(); - let chunk_id = Uuid::new_v4(); - let note_text = "First sentence. 
Second sentence."; - - tests_helpers::insert_note( - &context.service.db.pool, - note_id, - note_text, - &context.embedding_version, - ) - .await; - tests_helpers::insert_chunk( - &context.service.db.pool, - chunk_id, - note_id, - 0, - 0, - note_text.len() as i32, - note_text, - &context.embedding_version, - ) - .await; - tests_helpers::upsert_point( - &context.service, - chunk_id, - note_id, - 0, - 0, - note_text.len() as i32, - note_text, - ) - .await; - - let response = context - .service - .search_raw(SearchRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "First".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); - - assert_eq!(item.chunk_id, chunk_id); - assert!(!item.snippet.is_empty()); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_stitches_adjacent_chunks() { - let providers = tests_helpers::build_providers(StubRerank); - let Some(context) = - tests_helpers::setup_context("search_stitches_adjacent_chunks", providers).await - else { - return; - }; - let note_id = Uuid::new_v4(); - let chunk_texts = ["First sentence. ", "Second sentence.
", "Third sentence."]; - let note_text = chunk_texts.concat(); - - tests_helpers::insert_note( - &context.service.db.pool, - note_id, - &note_text, - &context.embedding_version, - ) - .await; - - let mut offset = 0_i32; - let mut chunk_ids = Vec::new(); - - for (index, chunk_text) in chunk_texts.iter().enumerate() { - let chunk_id = Uuid::new_v4(); - let start = offset; - let end = start + chunk_text.len() as i32; - - tests_helpers::insert_chunk( - &context.service.db.pool, - chunk_id, - note_id, - index as i32, - start, - end, - chunk_text, - &context.embedding_version, - ) - .await; - - chunk_ids.push((chunk_id, start, end, *chunk_text)); - - offset = end; - } - - let (chunk_id, start, end, text) = chunk_ids[1]; - - tests_helpers::upsert_point(&context.service, chunk_id, note_id, 1, start, end, text).await; - - let response = context - .service - .search_raw(SearchRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "Second".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); - - assert_eq!(item.chunk_id, chunk_id); - assert!(item.snippet.contains("First sentence.")); - assert!(item.snippet.contains("Second sentence.")); - assert!(item.snippet.contains("Third sentence.")); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant.
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_skips_missing_chunk_metadata() { - let providers = tests_helpers::build_providers(StubRerank); - let Some(context) = - tests_helpers::setup_context("search_skips_missing_chunk_metadata", providers).await - else { - return; - }; - let note_id = Uuid::new_v4(); - let chunk_id = Uuid::new_v4(); - let note_text = "Missing chunk metadata."; - - tests_helpers::insert_note( - &context.service.db.pool, - note_id, - note_text, - &context.embedding_version, - ) - .await; - tests_helpers::upsert_point( - &context.service, - chunk_id, - note_id, - 0, - 0, - note_text.len() as i32, - note_text, - ) - .await; - - let response = context - .service - .search_raw(SearchRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "Missing".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search failed."); - - assert!(response.items.is_empty()); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. 
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn progressive_search_returns_index_timeline_and_details() { - let providers = tests_helpers::build_providers(StubRerank); - let Some(context) = tests_helpers::setup_context( - "progressive_search_returns_index_timeline_and_details", - providers, - ) - .await - else { - return; - }; - let note_id = Uuid::new_v4(); - let chunk_id = Uuid::new_v4(); - let note_text = "Progressive retrieval works best with staged expansion."; - - tests_helpers::insert_note( - &context.service.db.pool, - note_id, - note_text, - &context.embedding_version, - ) - .await; - tests_helpers::insert_chunk( - &context.service.db.pool, - chunk_id, - note_id, - 0, - 0, - note_text.len() as i32, - note_text, - &context.embedding_version, - ) - .await; - tests_helpers::upsert_point( - &context.service, - chunk_id, - note_id, - 0, - 0, - note_text.len() as i32, - note_text, - ) - .await; - - let index = context - .service - .search(SearchRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "Progressive".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search index failed."); - - assert!(!index.items.is_empty()); - - let timeline = context - .service - .search_timeline(SearchTimelineRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - search_session_id: index.search_session_id, - payload_level: Default::default(), - group_by: None, - }) - .await - .expect("Search timeline failed."); - - assert!(!timeline.groups.is_empty()); - - let details = context - .service - .search_details(SearchDetailsRequest { - tenant_id: "t".to_string(), - project_id: "p".to_string(), - agent_id: "a".to_string(), - search_session_id: index.search_session_id, - payload_level: 
Default::default(), - note_ids: vec![note_id], - record_hits: Some(false), - }) - .await - .expect("Search details failed."); - let returned = details - .results - .first() - .and_then(|result| result.note.as_ref()) - .expect("Expected note details."); - - assert_eq!(returned.note_id, note_id); - assert_eq!(returned.text, note_text); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] -async fn search_dedupes_note_results() { - let providers = tests_helpers::build_providers(KeywordRerank { keyword: "preferred" }); - let Some(context) = - tests_helpers::setup_context("search_dedupes_note_results", providers).await - else { - return; - }; - let note_id = Uuid::new_v4(); - let chunk_texts = ["preferred alpha. ", "bridge chunk. ", "other alpha."]; - let note_text = chunk_texts.concat(); - - tests_helpers::insert_note( - &context.service.db.pool, - note_id, - &note_text, - &context.embedding_version, - ) - .await; - - let mut offset = 0_i32; - let mut chunk_ids = Vec::new(); - - for (index, chunk_text) in chunk_texts.iter().enumerate() { - let chunk_id = Uuid::new_v4(); - let start = offset; - let end = start + chunk_text.len() as i32; - - tests_helpers::insert_chunk( - &context.service.db.pool, - chunk_id, - note_id, - index as i32, - start, - end, - chunk_text, - &context.embedding_version, - ) - .await; - - chunk_ids.push((chunk_id, start, end, *chunk_text)); - - offset = end; - } - - let (chunk_id_a, start_a, end_a, text_a) = chunk_ids[0]; - let (chunk_id_c, start_c, end_c, text_c) = chunk_ids[2]; - - tests_helpers::upsert_point(&context.service, chunk_id_a, note_id, 0, start_a, end_a, text_a) - .await; - tests_helpers::upsert_point(&context.service, chunk_id_c, note_id, 2, start_c, end_c, text_c) - .await; - - let response = context - .service - .search_raw(SearchRequest { - tenant_id: "t".to_string(), - project_id:
"p".to_string(), - agent_id: "a".to_string(), - token_id: None, - read_profile: "private_only".to_string(), - payload_level: Default::default(), - query: "alpha".to_string(), - top_k: Some(5), - candidate_k: Some(10), - filter: None, - record_hits: Some(false), - ranking: None, - }) - .await - .expect("Search failed."); - let item = response.items.first().expect("Expected search result."); - - assert_eq!(response.items.len(), 1); - assert_eq!(item.note_id, note_id); - assert!( - item.chunk_id == chunk_id_a || item.chunk_id == chunk_id_c, - "Expected deduped result chunk_id to be one of the ingested chunks." - ); - - context.test_db.cleanup().await.expect("Failed to cleanup test database."); -} +mod basic_search; +mod dedupe; +mod progressive; diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_core/basic_search.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_core/basic_search.rs new file mode 100644 index 00000000..17ba45e0 --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_core/basic_search.rs @@ -0,0 +1,204 @@ +use uuid::Uuid; + +use crate::acceptance::{StubRerank, chunk_search::tests_helpers}; +use elf_service::SearchRequest; + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_returns_chunk_items() { + let providers = tests_helpers::build_providers(StubRerank); + let Some(context) = tests_helpers::setup_context("search_returns_chunk_items", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let note_text = "First sentence. 
Second sentence."; + + tests_helpers::insert_note( + &context.service.db.pool, + note_id, + note_text, + &context.embedding_version, + ) + .await; + tests_helpers::insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + &context.embedding_version, + ) + .await; + tests_helpers::upsert_point( + &context.service, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + ) + .await; + + let response = context + .service + .search_raw(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "First".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + + assert_eq!(item.chunk_id, chunk_id); + assert!(!item.snippet.is_empty()); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_stitches_adjacent_chunks() { + let providers = tests_helpers::build_providers(StubRerank); + let Some(context) = + tests_helpers::setup_context("search_stitches_adjacent_chunks", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_texts = ["First sentence. ", "Second sentence.
", "Third sentence."]; + let note_text = chunk_texts.concat(); + + tests_helpers::insert_note( + &context.service.db.pool, + note_id, + &note_text, + &context.embedding_version, + ) + .await; + + let mut offset = 0_i32; + let mut chunk_ids = Vec::new(); + + for (index, chunk_text) in chunk_texts.iter().enumerate() { + let chunk_id = Uuid::new_v4(); + let start = offset; + let end = start + chunk_text.len() as i32; + + tests_helpers::insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + index as i32, + start, + end, + chunk_text, + &context.embedding_version, + ) + .await; + + chunk_ids.push((chunk_id, start, end, *chunk_text)); + + offset = end; + } + + let (chunk_id, start, end, text) = chunk_ids[1]; + + tests_helpers::upsert_point(&context.service, chunk_id, note_id, 1, start, end, text).await; + + let response = context + .service + .search_raw(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Second".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + + assert_eq!(item.chunk_id, chunk_id); + assert!(item.snippet.contains("First sentence.")); + assert!(item.snippet.contains("Second sentence.")); + assert!(item.snippet.contains("Third sentence.")); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant.
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_skips_missing_chunk_metadata() { + let providers = tests_helpers::build_providers(StubRerank); + let Some(context) = + tests_helpers::setup_context("search_skips_missing_chunk_metadata", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let note_text = "Missing chunk metadata."; + + tests_helpers::insert_note( + &context.service.db.pool, + note_id, + note_text, + &context.embedding_version, + ) + .await; + tests_helpers::upsert_point( + &context.service, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + ) + .await; + + let response = context + .service + .search_raw(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Missing".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + + assert!(response.items.is_empty()); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_core/dedupe.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_core/dedupe.rs new file mode 100644 index 00000000..15799698 --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_core/dedupe.rs @@ -0,0 +1,88 @@ +use uuid::Uuid; + +use crate::acceptance::chunk_search::tests_helpers::{self, KeywordRerank}; +use elf_service::SearchRequest; + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. 
Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn search_dedupes_note_results() { + let providers = tests_helpers::build_providers(KeywordRerank { keyword: "preferred" }); + let Some(context) = + tests_helpers::setup_context("search_dedupes_note_results", providers).await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_texts = ["preferred alpha. ", "bridge chunk. ", "other alpha."]; + let note_text = chunk_texts.concat(); + + tests_helpers::insert_note( + &context.service.db.pool, + note_id, + &note_text, + &context.embedding_version, + ) + .await; + + let mut offset = 0_i32; + let mut chunk_ids = Vec::new(); + + for (index, chunk_text) in chunk_texts.iter().enumerate() { + let chunk_id = Uuid::new_v4(); + let start = offset; + let end = start + chunk_text.len() as i32; + + tests_helpers::insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + index as i32, + start, + end, + chunk_text, + &context.embedding_version, + ) + .await; + + chunk_ids.push((chunk_id, start, end, *chunk_text)); + + offset = end; + } + + let (chunk_id_a, start_a, end_a, text_a) = chunk_ids[0]; + let (chunk_id_c, start_c, end_c, text_c) = chunk_ids[2]; + + tests_helpers::upsert_point(&context.service, chunk_id_a, note_id, 0, start_a, end_a, text_a) + .await; + tests_helpers::upsert_point(&context.service, chunk_id_c, note_id, 2, start_c, end_c, text_c) + .await; + + let response = context + .service + .search_raw(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "alpha".to_string(), + top_k: Some(5), + candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search failed."); + let item = response.items.first().expect("Expected search result."); + + assert_eq!(response.items.len(), 1); + assert_eq!(item.note_id, note_id); + assert!( + item.chunk_id
== chunk_id_a || item.chunk_id == chunk_id_c, + "Expected deduped result chunk_id to be one of the ingested chunks." + ); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_core/progressive.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_core/progressive.rs new file mode 100644 index 00000000..63dac77f --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_core/progressive.rs @@ -0,0 +1,110 @@ +use uuid::Uuid; + +use crate::acceptance::{StubRerank, chunk_search::tests_helpers}; +use elf_service::{SearchDetailsRequest, SearchRequest, SearchTimelineRequest}; + +#[tokio::test] +#[ignore = "Requires external Postgres and Qdrant. Set ELF_PG_DSN and ELF_QDRANT_URL to run."] +async fn progressive_search_returns_index_timeline_and_details() { + let providers = tests_helpers::build_providers(StubRerank); + let Some(context) = tests_helpers::setup_context( + "progressive_search_returns_index_timeline_and_details", + providers, + ) + .await + else { + return; + }; + let note_id = Uuid::new_v4(); + let chunk_id = Uuid::new_v4(); + let note_text = "Progressive retrieval works best with staged expansion."; + + tests_helpers::insert_note( + &context.service.db.pool, + note_id, + note_text, + &context.embedding_version, + ) + .await; + tests_helpers::insert_chunk( + &context.service.db.pool, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + &context.embedding_version, + ) + .await; + tests_helpers::upsert_point( + &context.service, + chunk_id, + note_id, + 0, + 0, + note_text.len() as i32, + note_text, + ) + .await; + + let index = context + .service + .search(SearchRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + token_id: None, + read_profile: "private_only".to_string(), + payload_level: Default::default(), + query: "Progressive".to_string(), + top_k: Some(5), + 
candidate_k: Some(10), + filter: None, + record_hits: Some(false), + ranking: None, + }) + .await + .expect("Search index failed."); + + assert!(!index.items.is_empty()); + + let timeline = context + .service + .search_timeline(SearchTimelineRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + search_session_id: index.search_session_id, + payload_level: Default::default(), + group_by: None, + }) + .await + .expect("Search timeline failed."); + + assert!(!timeline.groups.is_empty()); + + let details = context + .service + .search_details(SearchDetailsRequest { + tenant_id: "t".to_string(), + project_id: "p".to_string(), + agent_id: "a".to_string(), + search_session_id: index.search_session_id, + payload_level: Default::default(), + note_ids: vec![note_id], + record_hits: Some(false), + }) + .await + .expect("Search details failed."); + let returned = details + .results + .first() + .and_then(|result| result.note.as_ref()) + .expect("Expected note details."); + + assert_eq!(returned.note_id, note_id); + assert_eq!(returned.text, note_text); + + context.test_db.cleanup().await.expect("Failed to cleanup test database."); +}