From f61bc036eea3277e17ea6d2b53e258f6ad4f3aca Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Sat, 27 Jun 2026 21:27:15 +0800 Subject: [PATCH 1/3] {"schema":"decodex/commit/1","summary":"Add authority recovery drill benchmark evidence","authority":"XY-1119","related":["XY-1098"]} --- .../authority_plane_recovery_drill.json | 556 ++++++++++++++++++ .../src/bin/real_world_job_benchmark.rs | 499 ++++++++++++++++ .../tests/real_world_job_benchmark.rs | 148 ++++- ...27-authority-recovery-drill-drift-audit.md | 90 +++ docs/evidence/index.md | 2 + docs/log.md | 4 + .../real_world_agent_memory_benchmark.md | 25 +- .../real_world_agent_memory_benchmark_v1.md | 44 +- 8 files changed, 1334 insertions(+), 34 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json create mode 100644 docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md diff --git a/apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json b/apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json new file mode 100644 index 00000000..4351a2da --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json @@ -0,0 +1,556 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "production-ops-authority-plane-recovery-001", + "suite": "production_ops", + "title": "Recover authority-plane records with degraded derived indexes labeled", + "corpus": { + "corpus_id": "real-world-memory-production-ops-2026-06-27", + "profile": "synthetic", + "items": [ + { + "evidence_id": "authority-recovery-topology", + "kind": "trace", + "text": "Authority-plane drill topology used one Postgres authority store with Qdrant memory and document indexes treated as rebuildable derived indexes; failover was not_encoded because no standby authority service was part of the drill.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": 
"real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-topology" + }, + "locator": { + "quote": "one Postgres authority store with Qdrant memory and document indexes treated as rebuildable derived indexes" + } + }, + "created_at": "2026-06-27T09:00:00Z" + }, + { + "evidence_id": "authority-recovery-backup-pitr", + "kind": "trace", + "text": "Backup/PITR drill restored the authority store from backup backup-20260627T090000Z to PITR target 2026-06-27T09:12:00Z before replaying pending outbox work.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-backup-pitr" + }, + "locator": { + "quote": "restored the authority store from backup backup-20260627T090000Z to PITR target 2026-06-27T09:12:00Z" + } + }, + "created_at": "2026-06-27T09:12:00Z" + }, + { + "evidence_id": "authority-recovery-counts", + "kind": "trace", + "text": "Before and after recovery, authority counts matched for source=3, journal=2, memory=4, knowledge=2, proposal=2, trace=3, and audit=5; source refs and lifecycle history were preserved on every plane.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-counts" + }, + "locator": { + "quote": "source refs and lifecycle history were preserved on every plane" + } + }, + "created_at": "2026-06-27T09:14:00Z" + }, + { + "evidence_id": "authority-recovery-degraded-read", + "kind": "trace", + "text": "During degraded read, unavailable Qdrant memory index, document index, and external adapter rows were labeled unavailable_derived_index or unavailable_adapter while Postgres source-of-truth records remained visible.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": 
"authority_plane_recovery_drill", + "evidence_id": "authority-recovery-degraded-read" + }, + "locator": { + "quote": "unavailable Qdrant memory index, document index, and external adapter rows were labeled" + } + }, + "created_at": "2026-06-27T09:15:00Z" + }, + { + "evidence_id": "authority-recovery-replay-rebuild", + "kind": "trace", + "text": "Idempotent outbox replay processed 6 pending jobs with zero duplicate writes; Qdrant rebuild returned rebuilt_count=9, missing_vector_count=0, error_count=0.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-replay-rebuild" + }, + "locator": { + "quote": "processed 6 pending jobs with zero duplicate writes" + } + }, + "created_at": "2026-06-27T09:16:00Z" + }, + { + "evidence_id": "authority-recovery-repair-dead-letter", + "kind": "trace", + "text": "Migration repair fixed 1 recoverable Work Journal source-ref shape without promoting journal-only content, and 2 dead-letter rows were handled without hiding the failed job history.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-repair-dead-letter" + }, + "locator": { + "quote": "2 dead-letter rows were handled without hiding the failed job history" + } + }, + "created_at": "2026-06-27T09:17:00Z" + }, + { + "evidence_id": "authority-recovery-rpo-rto", + "kind": "trace", + "text": "The drill reported RPO target 60 seconds with measured 12 seconds and RTO target 300 seconds with measured 184 seconds.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-rpo-rto" + }, + "locator": { + "quote": "RPO target 60 seconds with measured 12 seconds and RTO target 300 seconds with 
measured 184 seconds" + } + }, + "created_at": "2026-06-27T09:18:00Z" + }, + { + "evidence_id": "authority-recovery-failover-decoy", + "kind": "adapter_state", + "text": "Decoy: the drill proves multi-region failover with a standby authority service.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "authority_plane_recovery_drill", + "evidence_id": "authority-recovery-failover-decoy" + } + }, + "created_at": "2026-06-27T09:01:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_production_ops", + "answer": { + "content": "The authority-plane recovery drill restored the Postgres authority store from backup/PITR, preserved source refs and lifecycle history for source, journal, memory, knowledge, proposal, trace, and audit records, labeled unavailable derived indexes and adapters during degraded read, replayed outbox work idempotently, completed Qdrant rebuild with rebuilt_count=9, missing_vector_count=0, error_count=0, handled 2 dead-letter rows, applied 1 migration repair, and met RPO 12/60 seconds plus RTO 184/300 seconds. 
Failover remains not_encoded because no standby authority service was part of the drill.", + "claims": [ + { + "claim_id": "backup_pitr_restored", + "text": "The authority-plane recovery drill restored the Postgres authority store from backup/PITR.", + "evidence_ids": ["authority-recovery-backup-pitr"], + "confidence": "high" + }, + { + "claim_id": "authority_counts_preserved", + "text": "The authority-plane recovery drill preserved source refs and lifecycle history for source, journal, memory, knowledge, proposal, trace, and audit records.", + "evidence_ids": ["authority-recovery-counts"], + "confidence": "high" + }, + { + "claim_id": "degraded_read_labeled", + "text": "Unavailable derived indexes and adapters were labeled during degraded read while source-of-truth records remained visible.", + "evidence_ids": ["authority-recovery-degraded-read"], + "confidence": "high" + }, + { + "claim_id": "replay_rebuild_dead_letter", + "text": "Outbox replay was idempotent, Qdrant rebuild was complete, and 2 dead-letter rows were handled without hiding failure history.", + "evidence_ids": ["authority-recovery-replay-rebuild", "authority-recovery-repair-dead-letter"], + "confidence": "high" + }, + { + "claim_id": "rpo_rto_reported", + "text": "The drill met RPO 12/60 seconds and RTO 184/300 seconds.", + "evidence_ids": ["authority-recovery-rpo-rto"], + "confidence": "high" + }, + { + "claim_id": "failover_not_encoded", + "text": "Failover remains not_encoded because no standby authority service was part of the drill.", + "evidence_ids": ["authority-recovery-topology"], + "confidence": "high" + } + ], + "evidence_ids": [ + "authority-recovery-topology", + "authority-recovery-backup-pitr", + "authority-recovery-counts", + "authority-recovery-degraded-read", + "authority-recovery-replay-rebuild", + "authority-recovery-repair-dead-letter", + "authority-recovery-rpo-rto" + ], + "recovery_drills": [ + { + "drill_id": "authority-plane-drill-20260627", + "contract_schema": 
"elf.authority_recovery_drill/v1", + "generated_at": "2026-06-27T09:18:00Z", + "topology": { + "authority_store": "postgres", + "derived_indexes": ["qdrant_memory", "qdrant_docs"], + "adapters": ["external_adapter_probe"], + "failover": "not_encoded_no_standby_authority_service" + }, + "failure_injections": [ + { + "injection_id": "qdrant-memory-index-unavailable", + "target": "qdrant_memory", + "fault": "derived_index_unavailable", + "started_at": "2026-06-27T09:15:00Z", + "completed_at": "2026-06-27T09:16:00Z", + "evidence_refs": ["authority-recovery-degraded-read"] + }, + { + "injection_id": "outbox-replay-after-pitr", + "target": "indexing_outbox", + "fault": "pending_jobs_replayed_after_restore", + "started_at": "2026-06-27T09:16:00Z", + "completed_at": "2026-06-27T09:17:00Z", + "evidence_refs": ["authority-recovery-replay-rebuild"] + } + ], + "backup_pitr": { + "backup_ref": "backup-20260627T090000Z", + "pitr_target": "2026-06-27T09:12:00Z", + "restored": true, + "evidence_refs": ["authority-recovery-backup-pitr"] + }, + "degraded_read": { + "source_of_truth_visible": true, + "unavailable_derived_indexes": ["qdrant_memory", "qdrant_docs"], + "unavailable_adapters": ["external_adapter_probe"], + "unavailable_labels": ["unavailable_derived_index", "unavailable_adapter"], + "evidence_refs": ["authority-recovery-degraded-read"] + }, + "rpo": { + "target_seconds": 60.0, + "measured_seconds": 12.0, + "evidence_refs": ["authority-recovery-rpo-rto"] + }, + "rto": { + "target_seconds": 300.0, + "measured_seconds": 184.0, + "evidence_refs": ["authority-recovery-rpo-rto"] + }, + "authority_record_counts": [ + { + "plane": "source", + "before_count": 3, + "after_count": 3, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": ["authority-recovery-counts"] + }, + { + "plane": "journal", + "before_count": 2, + "after_count": 2, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": 
["authority-recovery-counts"] + }, + { + "plane": "memory", + "before_count": 4, + "after_count": 4, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": ["authority-recovery-counts"] + }, + { + "plane": "knowledge", + "before_count": 2, + "after_count": 2, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": ["authority-recovery-counts"] + }, + { + "plane": "proposal", + "before_count": 2, + "after_count": 2, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": ["authority-recovery-counts"] + }, + { + "plane": "trace", + "before_count": 3, + "after_count": 3, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": ["authority-recovery-counts"] + }, + { + "plane": "audit", + "before_count": 5, + "after_count": 5, + "source_refs_preserved": true, + "lifecycle_history_preserved": true, + "evidence_refs": ["authority-recovery-counts"] + } + ], + "outbox_replay": { + "idempotent": true, + "replayed_count": 6, + "duplicate_write_count": 0, + "evidence_refs": ["authority-recovery-replay-rebuild"] + }, + "qdrant_rebuild": { + "complete": true, + "rebuilt_count": 9, + "missing_vector_count": 0, + "error_count": 0, + "evidence_refs": ["authority-recovery-replay-rebuild"] + }, + "migration_repair": { + "applied": true, + "repaired_count": 1, + "evidence_refs": ["authority-recovery-repair-dead-letter"] + }, + "dead_letter": { + "dead_letter_count": 2, + "handled_count": 2, + "evidence_refs": ["authority-recovery-repair-dead-letter"] + } + } + ], + "latency_ms": 2.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "authority-drill-topology-recorded", + "ts": "2026-06-27T09:00:00Z", + "actor": "tool", + "action": "recorded_recovery_topology", + "evidence_ids": ["authority-recovery-topology"], + "summary": "The drill recorded authority 
store, derived index, adapter, and failover boundaries." + }, + { + "event_id": "authority-drill-pitr-restored", + "ts": "2026-06-27T09:12:00Z", + "actor": "tool", + "action": "restored_backup_pitr", + "evidence_ids": ["authority-recovery-backup-pitr"], + "summary": "The authority store was restored from the backup to the PITR target." + }, + { + "event_id": "authority-drill-counts-checked", + "ts": "2026-06-27T09:14:00Z", + "actor": "tool", + "action": "checked_authority_record_counts", + "evidence_ids": ["authority-recovery-counts"], + "summary": "Before and after counts matched for source, journal, memory, knowledge, proposal, trace, and audit records." + }, + { + "event_id": "authority-drill-degraded-read", + "ts": "2026-06-27T09:15:00Z", + "actor": "tool", + "action": "read_degraded_authority_records", + "evidence_ids": ["authority-recovery-degraded-read"], + "summary": "Derived index and adapter unavailability were labeled while source-of-truth records stayed visible." + }, + { + "event_id": "authority-drill-replay-rebuild", + "ts": "2026-06-27T09:16:00Z", + "actor": "tool", + "action": "replayed_outbox_and_rebuilt_qdrant", + "evidence_ids": ["authority-recovery-replay-rebuild"], + "summary": "Outbox replay was idempotent and Qdrant rebuild completed." + }, + { + "event_id": "authority-drill-repair-dead-letter", + "ts": "2026-06-27T09:17:00Z", + "actor": "tool", + "action": "repaired_migration_and_handled_dead_letters", + "evidence_ids": ["authority-recovery-repair-dead-letter"], + "summary": "Recoverable migration repair and dead-letter handling were recorded without hiding history." + }, + { + "event_id": "authority-drill-rpo-rto", + "ts": "2026-06-27T09:18:00Z", + "actor": "tool", + "action": "reported_rpo_rto", + "evidence_ids": ["authority-recovery-rpo-rto"], + "summary": "RPO and RTO targets and measurements were reported." 
+ } + ], + "prompt": { + "role": "user", + "content": "What does the authority-plane recovery drill prove, and what remains unencoded?", + "job_mode": "operate", + "constraints": [ + "cite_evidence", + "state_authority_planes", + "label_degraded_indexes", + "do_not_claim_failover_without_standby" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "backup_pitr_restored", + "text": "The authority-plane recovery drill restored the Postgres authority store from backup/PITR." + }, + { + "claim_id": "authority_counts_preserved", + "text": "The authority-plane recovery drill preserved source refs and lifecycle history for source, journal, memory, knowledge, proposal, trace, and audit records." + }, + { + "claim_id": "degraded_read_labeled", + "text": "Unavailable derived indexes and adapters were labeled during degraded read while source-of-truth records remained visible." + }, + { + "claim_id": "replay_rebuild_dead_letter", + "text": "Outbox replay was idempotent, Qdrant rebuild was complete, and 2 dead-letter rows were handled without hiding failure history." + }, + { + "claim_id": "rpo_rto_reported", + "text": "The drill met RPO 12/60 seconds and RTO 184/300 seconds." + }, + { + "claim_id": "failover_not_encoded", + "text": "Failover remains not_encoded because no standby authority service was part of the drill." 
+ } + ], + "must_not_include": [ + "proves multi-region failover", + "has a standby authority service" + ], + "evidence_links": { + "backup_pitr_restored": ["authority-recovery-backup-pitr"], + "authority_counts_preserved": ["authority-recovery-counts"], + "degraded_read_labeled": ["authority-recovery-degraded-read"], + "replay_rebuild_dead_letter": ["authority-recovery-replay-rebuild", "authority-recovery-repair-dead-letter"], + "rpo_rto_reported": ["authority-recovery-rpo-rto"], + "failover_not_encoded": ["authority-recovery-topology"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "authority-recovery-backup-pitr", + "claim_id": "backup_pitr_restored", + "requirement": "cite", + "quote": "Backup/PITR drill restored the authority store from backup backup-20260627T090000Z to PITR target 2026-06-27T09:12:00Z" + }, + { + "evidence_id": "authority-recovery-counts", + "claim_id": "authority_counts_preserved", + "requirement": "cite", + "quote": "source refs and lifecycle history were preserved on every plane" + }, + { + "evidence_id": "authority-recovery-degraded-read", + "claim_id": "degraded_read_labeled", + "requirement": "cite", + "quote": "unavailable Qdrant memory index, document index, and external adapter rows were labeled" + }, + { + "evidence_id": "authority-recovery-replay-rebuild", + "claim_id": "replay_rebuild_dead_letter", + "requirement": "cite", + "quote": "processed 6 pending jobs with zero duplicate writes" + }, + { + "evidence_id": "authority-recovery-repair-dead-letter", + "claim_id": "replay_rebuild_dead_letter", + "requirement": "cite", + "quote": "2 dead-letter rows were handled without hiding the failed job history" + }, + { + "evidence_id": "authority-recovery-rpo-rto", + "claim_id": "rpo_rto_reported", + "requirement": "cite", + "quote": "RPO target 60 seconds with measured 12 seconds and RTO target 300 seconds with 
measured 184 seconds" + }, + { + "evidence_id": "authority-recovery-topology", + "claim_id": "failover_not_encoded", + "requirement": "cite", + "quote": "failover was not_encoded because no standby authority service was part of the drill" + } + ], + "negative_traps": [ + { + "trap_id": "multi-region-failover-decoy", + "type": "unsupported_claim", + "evidence_ids": ["authority-recovery-failover-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "lifecycle_behavior": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Explains backup/PITR, idempotent replay, rebuild, repair, dead-letter, and authority history preservation." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites authority counts, degraded read, replay/rebuild, repair/dead-letter, RPO/RTO, and topology evidence." + }, + "uncertainty_handling": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Preserves the failover not_encoded caveat." + }, + "trap_avoidance": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Does not claim standby or multi-region failover." + } + }, + "pass_threshold": 0.85, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "Failover remains not_encoded because no standby authority service was part of the drill." 
+ ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "production_ops", + "authority_recovery", + "recovery_drill", + "backup_pitr", + "restore", + "qdrant_rebuild", + "degraded_read", + "outbox_replay", + "dead_letter", + "migration_repair", + "rpo_rto", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index b694ebb9..6deca7ba 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -22,6 +22,7 @@ const EXTERNAL_ADAPTER_MANIFEST_SCHEMA: &str = "elf.real_world_external_adapter_ const EXTERNAL_ADAPTER_REPORT_SCHEMA: &str = "elf.real_world_external_adapter_report/v1"; const SCOREBOARD_SCHEMA: &str = "elf.quality_scoreboard/v1"; const OPERATIONAL_EVIDENCE_SCHEMA: &str = "elf.operational_evidence_gates/v1"; +const AUTHORITY_RECOVERY_DRILL_SCHEMA: &str = "elf.authority_recovery_drill/v1"; const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_memory/work_resume"; const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; @@ -466,6 +467,8 @@ struct ProducedAnswer { scheduled_tasks: Vec, #[serde(default)] work_journal_readbacks: Vec, + #[serde(default)] + recovery_drills: Vec, #[serde(skip_serializing_if = "Option::is_none")] latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -818,6 +821,123 @@ struct WorkJournalJanitorCandidateArtifact { promoted_to_memory: bool, } +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AuthorityRecoveryDrillArtifact { + drill_id: String, + contract_schema: String, + generated_at: String, + topology: RecoveryDrillTopology, + #[serde(default)] + failure_injections: Vec, + backup_pitr: RecoveryBackupPitr, + degraded_read: RecoveryDegradedRead, + rpo: RecoveryMeasurement, + rto: RecoveryMeasurement, + 
#[serde(default)] + authority_record_counts: Vec, + outbox_replay: RecoveryOutboxReplay, + qdrant_rebuild: RecoveryQdrantRebuild, + migration_repair: RecoveryMigrationRepair, + dead_letter: RecoveryDeadLetterHandling, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryDrillTopology { + authority_store: String, + #[serde(default)] + derived_indexes: Vec, + #[serde(default)] + adapters: Vec, + failover: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryFailureInjection { + injection_id: String, + target: String, + fault: String, + started_at: String, + completed_at: String, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryBackupPitr { + backup_ref: String, + pitr_target: String, + restored: bool, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryDegradedRead { + source_of_truth_visible: bool, + #[serde(default)] + unavailable_derived_indexes: Vec, + #[serde(default)] + unavailable_adapters: Vec, + #[serde(default)] + unavailable_labels: Vec, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryMeasurement { + target_seconds: f64, + measured_seconds: f64, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct AuthorityRecordCount { + plane: String, + before_count: u64, + after_count: u64, + source_refs_preserved: bool, + lifecycle_history_preserved: bool, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryOutboxReplay { + idempotent: bool, + replayed_count: u64, + duplicate_write_count: u64, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryQdrantRebuild { + complete: bool, + rebuilt_count: u64, + missing_vector_count: u64, + error_count: u64, + #[serde(default)] + 
evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryMigrationRepair { + applied: bool, + repaired_count: u64, + #[serde(default)] + evidence_refs: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct RecoveryDeadLetterHandling { + dead_letter_count: u64, + handled_count: u64, + #[serde(default)] + evidence_refs: Vec, +} + #[derive(Clone, Debug, Deserialize)] struct ConsolidationFixture { #[serde(default)] @@ -1011,6 +1131,8 @@ struct OperationalEvidenceReport { cost: OperationalCostSummary, resource: OperationalResourceSummary, cold_start_restore_rebuild: OperationalColdStartRestoreRebuild, + #[serde(default)] + authority_recovery: OperationalAuthorityRecoveryReport, missing_private_provider_inputs_are_typed_blockers: bool, private_corpus_pass_claim_allowed: bool, provider_backed_pass_claim_allowed: bool, @@ -1080,6 +1202,29 @@ struct OperationalColdStartRestoreRebuild { job_ids: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +struct OperationalAuthorityRecoveryReport { + drill_count: usize, + drill_pass_count: usize, + topology_reported_count: usize, + failure_injection_count: usize, + degraded_read_labeled_count: usize, + source_of_truth_visible_count: usize, + rpo_target_count: usize, + rpo_met_count: usize, + rto_target_count: usize, + rto_met_count: usize, + authority_plane_count: usize, + source_ref_preserved_count: usize, + lifecycle_history_preserved_count: usize, + idempotent_outbox_replay_count: usize, + qdrant_rebuild_complete_count: usize, + migration_repair_count: usize, + dead_letter_handled_count: usize, + #[serde(default)] + job_ids: Vec, +} + #[derive(Debug, Deserialize, Serialize)] struct AdapterReport { adapter_id: String, @@ -1639,6 +1784,8 @@ struct JobReport { scheduled_memory: Option, #[serde(skip_serializing_if = "Option::is_none")] work_continuity: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + recovery_drills: Vec, trap_ids_used: Vec, 
dimension_scores: Vec, reason: String, @@ -2444,6 +2591,9 @@ fn validate_adapter_response(job: &RealWorldJob, path: &Path) -> Result<()> { for readback in &adapter_response.answer.work_journal_readbacks { validate_work_journal_readback_artifact(readback, path, &evidence_ids)?; } + for drill in &adapter_response.answer.recovery_drills { + validate_authority_recovery_drill_artifact(drill, path, &evidence_ids)?; + } if job.suite == "memory_summary" && adapter_response.answer.memory_summaries.is_empty() @@ -3140,6 +3290,234 @@ fn validate_work_journal_where_stopped( Ok(()) } +fn validate_authority_recovery_drill_artifact( + drill: &AuthorityRecoveryDrillArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if drill.drill_id.trim().is_empty() + || drill.contract_schema != AUTHORITY_RECOVERY_DRILL_SCHEMA + || drill.generated_at.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete authority recovery drill.", path.display())); + } + + validate_optional_rfc3339(&drill.generated_at, path, drill.drill_id.as_str())?; + validate_recovery_topology(&drill.topology, path, drill.drill_id.as_str())?; + validate_recovery_backup_pitr(&drill.backup_pitr, path, evidence_ids)?; + validate_recovery_degraded_read(&drill.degraded_read, path, evidence_ids)?; + validate_recovery_measurement("rpo", &drill.rpo, path, evidence_ids)?; + validate_recovery_measurement("rto", &drill.rto, path, evidence_ids)?; + validate_recovery_authority_record_counts(drill, path, evidence_ids)?; + validate_recovery_outbox_replay(&drill.outbox_replay, path, evidence_ids)?; + validate_recovery_qdrant_rebuild(&drill.qdrant_rebuild, path, evidence_ids)?; + validate_recovery_migration_repair(&drill.migration_repair, path, evidence_ids)?; + validate_recovery_dead_letter(&drill.dead_letter, path, evidence_ids)?; + + for injection in &drill.failure_injections { + if injection.injection_id.trim().is_empty() + || injection.target.trim().is_empty() + || injection.fault.trim().is_empty() + 
|| injection.started_at.trim().is_empty() + || injection.completed_at.trim().is_empty() + || injection.evidence_refs.is_empty() + { + return Err(eyre::eyre!( + "{} authority recovery drill {} has an incomplete failure injection.", + path.display(), + drill.drill_id + )); + } + + validate_optional_rfc3339(&injection.started_at, path, injection.injection_id.as_str())?; + validate_optional_rfc3339(&injection.completed_at, path, injection.injection_id.as_str())?; + ensure_known_evidence_refs(path, evidence_ids, &injection.evidence_refs)?; + } + + if drill.failure_injections.is_empty() { + return Err(eyre::eyre!( + "{} authority recovery drill {} must include failure injection evidence.", + path.display(), + drill.drill_id + )); + } + + Ok(()) +} + +fn validate_recovery_topology( + topology: &RecoveryDrillTopology, + path: &Path, + drill_id: &str, +) -> Result<()> { + if topology.authority_store.trim().is_empty() + || topology.derived_indexes.is_empty() + || topology.failover.trim().is_empty() + { + return Err(eyre::eyre!( + "{} authority recovery drill {} has incomplete topology.", + path.display(), + drill_id + )); + } + + Ok(()) +} + +fn validate_recovery_backup_pitr( + backup_pitr: &RecoveryBackupPitr, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if backup_pitr.backup_ref.trim().is_empty() + || backup_pitr.pitr_target.trim().is_empty() + || backup_pitr.evidence_refs.is_empty() + { + return Err(eyre::eyre!("{} has incomplete backup/PITR drill evidence.", path.display())); + } + + validate_optional_rfc3339(&backup_pitr.pitr_target, path, backup_pitr.backup_ref.as_str())?; + + ensure_known_evidence_refs(path, evidence_ids, &backup_pitr.evidence_refs) +} + +fn validate_recovery_degraded_read( + degraded_read: &RecoveryDegradedRead, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if degraded_read.unavailable_labels.is_empty() || degraded_read.evidence_refs.is_empty() { + return Err(eyre::eyre!("{} has incomplete degraded-read drill 
evidence.", path.display())); + } + + ensure_known_evidence_refs(path, evidence_ids, &degraded_read.evidence_refs) +} + +fn validate_recovery_measurement( + label: &str, + measurement: &RecoveryMeasurement, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if !measurement.target_seconds.is_finite() + || !measurement.measured_seconds.is_finite() + || measurement.target_seconds < 0.0 + || measurement.measured_seconds < 0.0 + || measurement.evidence_refs.is_empty() + { + return Err(eyre::eyre!("{} has invalid {label} recovery measurement.", path.display())); + } + + ensure_known_evidence_refs(path, evidence_ids, &measurement.evidence_refs) +} + +fn validate_recovery_authority_record_counts( + drill: &AuthorityRecoveryDrillArtifact, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + let required_planes = + ["source", "journal", "memory", "knowledge", "proposal", "trace", "audit"]; + let present_planes = drill + .authority_record_counts + .iter() + .map(|count| count.plane.as_str()) + .collect::>(); + + for plane in required_planes { + if !present_planes.contains(plane) { + return Err(eyre::eyre!( + "{} authority recovery drill {} is missing {} authority counts.", + path.display(), + drill.drill_id, + plane + )); + } + } + for count in &drill.authority_record_counts { + if count.plane.trim().is_empty() || count.evidence_refs.is_empty() { + return Err(eyre::eyre!( + "{} authority recovery drill {} has incomplete authority record counts.", + path.display(), + drill.drill_id + )); + } + + ensure_known_evidence_refs(path, evidence_ids, &count.evidence_refs)?; + } + + Ok(()) +} + +fn validate_recovery_outbox_replay( + replay: &RecoveryOutboxReplay, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if replay.evidence_refs.is_empty() { + return Err(eyre::eyre!("{} has incomplete outbox replay drill evidence.", path.display())); + } + + ensure_known_evidence_refs(path, evidence_ids, &replay.evidence_refs) +} + +fn
validate_recovery_qdrant_rebuild( + rebuild: &RecoveryQdrantRebuild, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if rebuild.evidence_refs.is_empty() { + return Err(eyre::eyre!( + "{} has incomplete Qdrant rebuild drill evidence.", + path.display() + )); + } + + ensure_known_evidence_refs(path, evidence_ids, &rebuild.evidence_refs) +} + +fn validate_recovery_migration_repair( + repair: &RecoveryMigrationRepair, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if repair.evidence_refs.is_empty() { + return Err(eyre::eyre!( + "{} has incomplete migration repair drill evidence.", + path.display() + )); + } + + ensure_known_evidence_refs(path, evidence_ids, &repair.evidence_refs) +} + +fn validate_recovery_dead_letter( + dead_letter: &RecoveryDeadLetterHandling, + path: &Path, + evidence_ids: &BTreeSet, +) -> Result<()> { + if dead_letter.evidence_refs.is_empty() { + return Err(eyre::eyre!( + "{} has incomplete dead-letter handling drill evidence.", + path.display() + )); + } + + ensure_known_evidence_refs(path, evidence_ids, &dead_letter.evidence_refs) +} + +fn ensure_known_evidence_refs( + path: &Path, + evidence_ids: &BTreeSet, + refs: &[String], +) -> Result<()> { + for evidence_ref in refs { + ensure_known_evidence(path, evidence_ids, evidence_ref)?; + } + + Ok(()) +} + fn validate_optional_summary_time(path: &Path, value: Option<&str>, id: &str) -> Result<()> { if let Some(value) = value { validate_optional_rfc3339(value, path, id)?; @@ -4074,6 +4452,7 @@ fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { proactive_briefs: Vec::new(), scheduled_tasks: Vec::new(), work_journal_readbacks: Vec::new(), + recovery_drills: Vec::new(), latency_ms: None, cost: None, trace_explainability: None, @@ -4117,6 +4496,23 @@ fn produced_evidence_ids(answer: &ProducedAnswer) -> BTreeSet { evidence.extend(candidate.evidence_refs.iter().cloned()); } } + for drill in &answer.recovery_drills { + 
evidence.extend(drill.backup_pitr.evidence_refs.iter().cloned()); + evidence.extend(drill.degraded_read.evidence_refs.iter().cloned()); + evidence.extend(drill.rpo.evidence_refs.iter().cloned()); + evidence.extend(drill.rto.evidence_refs.iter().cloned()); + evidence.extend(drill.outbox_replay.evidence_refs.iter().cloned()); + evidence.extend(drill.qdrant_rebuild.evidence_refs.iter().cloned()); + evidence.extend(drill.migration_repair.evidence_refs.iter().cloned()); + evidence.extend(drill.dead_letter.evidence_refs.iter().cloned()); + + for injection in &drill.failure_injections { + evidence.extend(injection.evidence_refs.iter().cloned()); + } + for count in &drill.authority_record_counts { + evidence.extend(count.evidence_refs.iter().cloned()); + } + } evidence } @@ -5860,6 +6256,7 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { proactive_brief: scoring.proactive_brief, scheduled_memory: scoring.scheduled_memory, work_continuity: scoring.work_continuity, + recovery_drills: answer.recovery_drills.clone(), trap_ids_used: scoring.trap_ids_used, dimension_scores: scoring.dimension_scores, reason: scoring.reason, @@ -6553,6 +6950,7 @@ fn operational_evidence_report( cost: operational_cost_summary(reports), resource: operational_resource_summary(paired.as_slice()), cold_start_restore_rebuild: operational_cold_start_restore_rebuild(paired.as_slice()), + authority_recovery: operational_authority_recovery(reports), missing_private_provider_inputs_are_typed_blockers, private_corpus_pass_claim_allowed, provider_backed_pass_claim_allowed, @@ -6723,6 +7121,86 @@ fn operational_cold_start_restore_rebuild( } } +fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthorityRecoveryReport { + let recovery_jobs = + reports.iter().filter(|report| !report.recovery_drills.is_empty()).collect::>(); + let drills = + recovery_jobs.iter().flat_map(|report| report.recovery_drills.iter()).collect::>(); + let authority_counts = + 
drills.iter().flat_map(|drill| drill.authority_record_counts.iter()).collect::>(); + let mut job_ids = recovery_jobs + .iter() + .map(|report| report.job_id.clone()) + .collect::>() + .into_iter() + .collect::>(); + + job_ids.sort(); + OperationalAuthorityRecoveryReport { + drill_count: drills.len(), + drill_pass_count: recovery_jobs + .iter() + .filter(|report| report.status == TypedStatus::Pass) + .map(|report| report.recovery_drills.len()) + .sum(), + topology_reported_count: drills + .iter() + .filter(|drill| !drill.topology.authority_store.trim().is_empty()) + .count(), + failure_injection_count: drills.iter().map(|drill| drill.failure_injections.len()).sum(), + degraded_read_labeled_count: drills + .iter() + .filter(|drill| !drill.degraded_read.unavailable_labels.is_empty()) + .count(), + source_of_truth_visible_count: drills + .iter() + .filter(|drill| drill.degraded_read.source_of_truth_visible) + .count(), + rpo_target_count: drills.len(), + rpo_met_count: drills + .iter() + .filter(|drill| drill.rpo.measured_seconds <= drill.rpo.target_seconds) + .count(), + rto_target_count: drills.len(), + rto_met_count: drills + .iter() + .filter(|drill| drill.rto.measured_seconds <= drill.rto.target_seconds) + .count(), + authority_plane_count: authority_counts.len(), + source_ref_preserved_count: authority_counts + .iter() + .filter(|count| count.source_refs_preserved) + .count(), + lifecycle_history_preserved_count: authority_counts + .iter() + .filter(|count| count.lifecycle_history_preserved) + .count(), + idempotent_outbox_replay_count: drills + .iter() + .filter(|drill| { + drill.outbox_replay.idempotent && drill.outbox_replay.duplicate_write_count == 0 + }) + .count(), + qdrant_rebuild_complete_count: drills + .iter() + .filter(|drill| { + drill.qdrant_rebuild.complete + && drill.qdrant_rebuild.missing_vector_count == 0 + && drill.qdrant_rebuild.error_count == 0 + }) + .count(), + migration_repair_count: drills + .iter() + .filter(|drill| 
drill.migration_repair.applied) + .count(), + dead_letter_handled_count: drills + .iter() + .filter(|drill| drill.dead_letter.handled_count >= drill.dead_letter.dead_letter_count) + .count(), + job_ids, + } +} + fn operational_evidence_tier(job: &RealWorldJob) -> &'static str { if job_has_tag(job, "provider_backed") { "provider_backed" @@ -8089,6 +8567,27 @@ fn render_markdown_operational_evidence(out: &mut String, report: &RealWorldRepo evidence.cold_start_restore_rebuild.qdrant_rebuild_pass_count, evidence.cold_start_restore_rebuild.qdrant_rebuild_job_count )); + out.push_str(&format!( + "- Authority recovery drills: `{}`/`{}` pass, topology `{}`, failure injections `{}`, degraded reads labeled `{}`, source-of-truth visible `{}`, RPO `{}`/`{}` met, RTO `{}`/`{}` met, source refs `{}`/`{}` preserved, lifecycle histories `{}`/`{}` preserved, idempotent replay `{}`, complete Qdrant rebuild `{}`, migration repair `{}`, dead-letter handled `{}`\n\n", + evidence.authority_recovery.drill_pass_count, + evidence.authority_recovery.drill_count, + evidence.authority_recovery.topology_reported_count, + evidence.authority_recovery.failure_injection_count, + evidence.authority_recovery.degraded_read_labeled_count, + evidence.authority_recovery.source_of_truth_visible_count, + evidence.authority_recovery.rpo_met_count, + evidence.authority_recovery.rpo_target_count, + evidence.authority_recovery.rto_met_count, + evidence.authority_recovery.rto_target_count, + evidence.authority_recovery.source_ref_preserved_count, + evidence.authority_recovery.authority_plane_count, + evidence.authority_recovery.lifecycle_history_preserved_count, + evidence.authority_recovery.authority_plane_count, + evidence.authority_recovery.idempotent_outbox_replay_count, + evidence.authority_recovery.qdrant_rebuild_complete_count, + evidence.authority_recovery.migration_repair_count, + evidence.authority_recovery.dead_letter_handled_count + )); out.push_str("| Evidence Tier | Status | Jobs | Pass | Blocked 
| Incomplete | Not Encoded | Mean Latency | Cost | Resource | Cold Start | Restore | Qdrant Rebuild | Pass Claim |\n"); out.push_str("| --- | --- | ---: | ---: | ---: | ---: | ---: | --- | --- | ---: | ---: | ---: | ---: | --- |\n"); diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 4015ab59..23106d28 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -2911,7 +2911,7 @@ fn assert_live_sweep_record(adapter: &Value, production_ops_status: &str) -> Res fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(81)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); Ok(()) } @@ -7880,28 +7880,42 @@ fn assert_hard_fail_hit(job: &Value, expected_hit: &str) { fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = run_json_report_from(production_ops_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(7)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_production_ops_summary(&report)?; + assert_production_ops_jobs(&report)?; + assert_production_ops_operational_evidence(&report)?; + + Ok(()) +} + +fn assert_production_ops_summary(report: &Value) -> Result<()> { + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); 
assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), - Some(1) + Some(2) ); assert_eq!( report.pointer("/private_corpus_redaction/private_fixture_count").and_then(Value::as_u64), Some(1) ); - let suites = array_at(&report, "/suites")?; + let suites = array_at(report, "/suites")?; let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(7)); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(8)); - let jobs = array_at(&report, "/jobs")?; + Ok(()) +} + +fn assert_production_ops_jobs(report: &Value) -> Result<()> { + let jobs = array_at(report, "/jobs")?; + let authority_recovery = + find_by_field(jobs, "/job_id", "production-ops-authority-plane-recovery-001")?; let backfill = find_by_field(jobs, "/job_id", "production-ops-backfill-resume-001")?; let restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; let public_proxy = find_by_field(jobs, "/job_id", "production-ops-public-proxy-addendum-001")?; @@ -7910,6 +7924,9 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let credentials = find_by_field(jobs, "/job_id", "production-ops-credential-boundary-001")?; let dependency = find_by_field(jobs, "/job_id", "production-ops-cold-start-dependency-001")?; + assert_authority_recovery_job(authority_recovery)?; + + assert_eq!(authority_recovery.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(backfill.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(restore.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); @@ -7929,6 +7946,23 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { 
Some("provider_backed") ); assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("pass")); + + Ok(()) +} + +fn assert_authority_recovery_job(job: &Value) -> Result<()> { + assert_eq!(job.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!(job.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); + assert_eq!( + job.pointer("/recovery_drills/0/contract_schema").and_then(Value::as_str), + Some("elf.authority_recovery_drill/v1") + ); + assert!(array_at(job, "/hard_fail_hits")?.is_empty()); + + Ok(()) +} + +fn assert_production_ops_operational_evidence(report: &Value) -> Result<()> { assert_eq!( report.pointer("/operational_evidence/schema").and_then(Value::as_str), Some("elf.operational_evidence_gates/v1") @@ -7953,11 +7987,11 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { ); assert_eq!( report.pointer("/operational_evidence/latency/measured_job_count").and_then(Value::as_u64), - Some(7) + Some(8) ); assert_eq!( report.pointer("/operational_evidence/cost/jobs_with_cost_report").and_then(Value::as_u64), - Some(7) + Some(8) ); assert_eq!( report @@ -7969,17 +8003,19 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { report .pointer("/operational_evidence/cold_start_restore_rebuild/qdrant_rebuild_pass_count") .and_then(Value::as_u64), - Some(1) + Some(2) ); - let tiers = array_at(&report, "/operational_evidence/tiers")?; + assert_authority_recovery_operational_evidence(report); + + let tiers = array_at(report, "/operational_evidence/tiers")?; let local_fixture = find_by_field(tiers, "/tier", "local_fixture")?; let public_proxy_tier = find_by_field(tiers, "/tier", "public_proxy")?; let private_corpus = find_by_field(tiers, "/tier", "private_corpus")?; let provider_backed = find_by_field(tiers, "/tier", "provider_backed")?; assert_eq!(local_fixture.pointer("/status").and_then(Value::as_str), Some("pass")); - 
assert_eq!(local_fixture.pointer("/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(local_fixture.pointer("/job_count").and_then(Value::as_u64), Some(5)); assert_eq!(public_proxy_tier.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(public_proxy_tier.pointer("/job_count").and_then(Value::as_u64), Some(1)); assert_eq!(private_corpus.pointer("/status").and_then(Value::as_str), Some("blocked")); @@ -7990,6 +8026,69 @@ fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { Ok(()) } +fn assert_authority_recovery_operational_evidence(report: &Value) { + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/drill_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/authority_plane_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/source_ref_preserved_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/lifecycle_history_preserved_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/rpo_met_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/rto_met_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/idempotent_outbox_replay_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/qdrant_rebuild_complete_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/migration_repair_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + 
.pointer("/operational_evidence/authority_recovery/dead_letter_handled_count") + .and_then(Value::as_u64), + Some(1) + ); +} + #[test] fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { let report = run_json_report_from(core_archival_memory_fixture_dir())?; @@ -8215,9 +8314,9 @@ fn assert_root_knowledge_summary(report: &Value) { } fn assert_root_aggregate_summary(report: &Value) -> Result<()> { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(81)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(19)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(74)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(75)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); @@ -8252,19 +8351,19 @@ fn assert_root_aggregate_summary(report: &Value) -> Result<()> { assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), - Some(2) + Some(3) ); assert_eq!( report.pointer("/summary/qdrant_rebuild_pass_count").and_then(Value::as_u64), - Some(2) + Some(3) ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(173) + Some(180) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(173) + Some(180) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); @@ -8545,7 +8644,7 @@ fn 
assert_root_aggregate_suites(report: &Value) -> Result<()> { let production_ops = find_by_field(suites, "/suite_id", "production_ops")?; assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(7)); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(8)); let proactive = find_by_field(suites, "/suite_id", "proactive_brief")?; @@ -8585,6 +8684,8 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; let production_restore = find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let production_authority = + find_by_field(jobs, "/job_id", "production-ops-authority-plane-recovery-001")?; let core_fallback = find_by_field(jobs, "/job_id", "core-archival-archival-fallback-001")?; let stale_core = find_by_field(jobs, "/job_id", "core-archival-stale-core-detection-001")?; let scheduled_weekly = @@ -8595,6 +8696,15 @@ fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { production_restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true) ); + assert_eq!( + production_authority.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), + Some(true) + ); + assert_eq!(production_authority.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + production_authority.pointer("/recovery_drills/0/contract_schema").and_then(Value::as_str), + Some("elf.authority_recovery_drill/v1") + ); assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); diff --git a/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md 
b/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md new file mode 100644 index 00000000..a923a760 --- /dev/null +++ b/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md @@ -0,0 +1,90 @@ +--- +type: Drift Audit +title: "Authority Recovery Drill Drift Audit" +description: "Drift audit for production-ops authority recovery drill benchmark artifacts and reports." +resource: docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md +status: active +authority: evidence +owner: docs +last_verified: 2026-06-27 +tags: + - docs + - evidence + - benchmarking + - production-ops +source_refs: + - https://linear.app/hackink/issue/XY-1119 +code_refs: + - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json + - docs/spec/real_world_agent_memory_benchmark_v1.md + - docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +related: + - docs/spec/real_world_agent_memory_benchmark_v1.md + - docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +drift_watch: + - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/fixtures/real_world_memory/production_ops/ + - docs/spec/real_world_agent_memory_benchmark_v1.md +--- +# Authority Recovery Drill Drift Audit + +Purpose: Anchor the production-ops authority recovery drill report contract to the +runner, fixture, and documentation surfaces. +Read this when: You need evidence for backup/PITR, idempotent outbox replay, Qdrant +rebuild completeness, degraded read, migration repair, dead-letter handling, and +RPO/RTO reporting in the real-world memory benchmark. +Not this document: Live production restore proof, private-corpus quality, hosted HA, +or multi-region failover evidence. + +## Watched Claims + +- `elf.authority_recovery_drill/v1` is a benchmark artifact under + `adapter_response.answer.recovery_drills[]`. 
+- The runner validates drill topology, failure injections, backup/PITR evidence, + degraded-read labels, RPO/RTO measurements, authority record counts for source, + journal, memory, knowledge, proposal, trace, and audit planes, idempotent outbox + replay, Qdrant rebuild completeness, migration repair, and dead-letter handling. +- Reports expose those drill counts through + `operational_evidence.authority_recovery`. +- The checked-in fixture is local synthetic evidence only. It does not prove private + corpus quality, provider-backed behavior, hosted HA, standby failover, or + multi-region SLA. + +## Evidence Anchors + +- `apps/elf-eval/src/bin/real_world_job_benchmark.rs` defines and validates + `AuthorityRecoveryDrillArtifact` and aggregates + `OperationalAuthorityRecoveryReport`. +- `apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json` + encodes one production-ops job with topology, degraded-read labels, RPO/RTO, + before/after authority record counts, replay, rebuild, migration repair, and + dead-letter evidence. +- `docs/spec/real_world_agent_memory_benchmark_v1.md` defines the artifact schema and + production-ops/report semantics. +- `docs/runbook/benchmarking/real_world_agent_memory_benchmark.md` routes operators to + the production-ops command and describes the authority recovery drill coverage. + +## Reverse Checks + +- Run `cargo make real-world-memory-production-ops` to parse the fixture and render + the production-ops report. +- Run `cargo make check-docs` after docs changes. + +## Verdict + +pass + +## Required Updates + +- If recovery drill fields change, update the runner structs, fixture, benchmark + spec, runbook, and this audit together. +- If a live Docker recovery drill is added later, preserve the fixture/local evidence + boundary and add separate live evidence instead of reclassifying this fixture. 
+ +## Citations + +- `apps/elf-eval/src/bin/real_world_job_benchmark.rs` +- `apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json` +- `docs/spec/real_world_agent_memory_benchmark_v1.md` +- `docs/runbook/benchmarking/real_world_agent_memory_benchmark.md` diff --git a/docs/evidence/index.md b/docs/evidence/index.md index bc0a38ab..3d211753 100644 --- a/docs/evidence/index.md +++ b/docs/evidence/index.md @@ -27,5 +27,7 @@ Routes to: Drift audits and evidence concepts under `docs/evidence/`. suppression boundaries. - `2026-06-27-work-journal-drift-audit.md`: Drift audit for Work Journal source-adjacent capture, readback, redaction, and promotion-boundary behavior. +- `2026-06-27-authority-recovery-drill-drift-audit.md`: Drift audit for + production-ops authority recovery drill benchmark artifacts and reports. - `external_memory_pattern_radar_latest.md`: Latest weekly external memory pattern radar summary. diff --git a/docs/log.md b/docs/log.md index c7e08500..4a9337dc 100644 --- a/docs/log.md +++ b/docs/log.md @@ -140,3 +140,7 @@ logs. Work Journal oracle fields, report rates, and hard-fail counters for redaction, rejected-option, inferred-step, journal-authority, and janitor false-promotion boundaries. +- Added the XY-1119 authority recovery drill production-ops slice, defining + `elf.authority_recovery_drill/v1` report artifacts, validating topology, degraded + reads, RPO/RTO, authority record counts, idempotent outbox replay, Qdrant rebuild, + migration repair, and dead-letter handling, and linking the drift audit. 
diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index 94c16659..16c883f3 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -6,7 +6,7 @@ resource: docs/runbook/benchmarking/real_world_agent_memory_benchmark.md status: active authority: procedural owner: runbook -last_verified: 2026-06-23 +last_verified: 2026-06-27 tags: - docs - runbook @@ -192,10 +192,12 @@ including the retrieval-quality slice below. The suite currently encodes: source-id preservation, evidence binding, no secret leakage, and fixture-backed capture/integration boundary classification. - `production_ops`: interrupted generated backfill resume, backup/restore plus - cold-start readback, resource-envelope interpretation, public-proxy - production-private addendum readback, pinned OpenViking local embedding - runtime/wrong-result classification, missing private manifest `blocked` - classification, and provider credential boundary `blocked` classification. + cold-start readback, recoverable authority-plane drill evidence over source, + journal, memory, knowledge, proposal, trace, and audit records, + resource-envelope interpretation, public-proxy production-private addendum readback, + pinned OpenViking local embedding runtime/wrong-result classification, missing + private manifest `blocked` classification, and provider credential boundary + `blocked` classification. - `personalization`: scoped stable preference correction without temporary or cross-project preference leakage. - `core_archival_memory`: core block attachment, scope, provenance, stale-core @@ -705,10 +707,15 @@ The production-ops fixtures live under `apps/elf-eval/fixtures/real_world_memory/production_ops/`. 
They encode user-job readback over existing public benchmark and restore evidence: interrupted backfill resume from checkpoint, clean-run comparison, backup/restore readback, Qdrant rebuild -from Postgres-held vectors, cold-start search recovery, and resource-envelope -interpretation. The P4 slice also encodes the operator-approved public-proxy -production-private addendum and emits `elf.operational_evidence_gates/v1` so local -fixture, public-proxy, private-corpus, and provider-backed evidence remain separate. +from Postgres-held vectors, cold-start search recovery, recoverable authority-plane +drills, and resource-envelope interpretation. Authority recovery drills use +`elf.authority_recovery_drill/v1` under `adapter_response.answer.recovery_drills[]` +to report topology, failure injection, backup/PITR, degraded-read labels, RPO/RTO +targets and measurements, before/after authority record counts, idempotent outbox +replay, Qdrant rebuild completeness, migration repair, and dead-letter handling. The +P4 slice also encodes the operator-approved public-proxy production-private addendum +and emits `elf.operational_evidence_gates/v1` so local fixture, public-proxy, +private-corpus, and provider-backed evidence remain separate. The same slice deliberately keeps non-pass boundaries typed. 
A missing private production manifest is `blocked`, unavailable provider credentials are `blocked`, and diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 8a415752..be6ab55c 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -6,7 +6,7 @@ resource: docs/spec/real_world_agent_memory_benchmark_v1.md status: active authority: normative owner: spec -last_verified: 2026-06-23 +last_verified: 2026-06-27 tags: - docs - spec @@ -14,7 +14,7 @@ source_refs: [] code_refs: - Makefile.toml - apps/elf-eval/src/bin/real_world_job_benchmark.rs - - apps/elf-eval/fixtures/real_world_memory/ + - apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json related: [] drift_watch: - docs/spec/real_world_agent_memory_benchmark_v1.md @@ -451,6 +451,34 @@ untraced section count. Rebuild results are acceptable only when repeated output deterministic enough for regression comparison or every allowed variance is explicitly reported. +### Optional `adapter_response.answer.recovery_drills` + +Production-ops fixtures MAY include authority recovery drill artifacts in +`corpus.adapter_response.answer.recovery_drills[]`. These artifacts use schema +`elf.authority_recovery_drill/v1` and are fixture/report evidence, not proof of a +multi-region or hosted HA topology. 
+ +Each recovery drill MUST include: + +- `drill_id`, `contract_schema`, and `generated_at`; +- `topology` with the authority store, derived indexes, adapters, and failover + boundary; +- one or more `failure_injections` with target, fault, timestamps, and evidence refs; +- `backup_pitr` with backup reference, PITR target, restored flag, and evidence refs; +- `degraded_read` with unavailable derived indexes or adapters labeled separately + from visible source-of-truth records; +- `rpo` and `rto` targets and measured seconds with evidence refs; +- `authority_record_counts` for `source`, `journal`, `memory`, `knowledge`, + `proposal`, `trace`, and `audit`, including before/after counts plus source-ref + and lifecycle-history preservation booleans; +- `outbox_replay`, `qdrant_rebuild`, `migration_repair`, and `dead_letter` sections + with evidence refs. + +A recovery drill MUST NOT claim failover unless a standby or replacement authority +service is actually part of the topology. Qdrant and document indexes remain derived +and rebuildable; degraded read must label unavailable derived indexes or adapters +without hiding Postgres source-of-truth records. + ### `negative_traps` Negative traps MUST be explicit so systems are tested against realistic memory failure @@ -638,7 +666,7 @@ Suite ids are stable public names. Each suite MUST contain at least one | `source_library` | Preserve long-form source records and citable excerpts without silently promoting them to memory. | Capture a long document; hydrate a source_ref excerpt; preserve a social/thread source boundary. | Source ids, canonical source metadata, source_ref hydration pointers, verified excerpts, explicit no-autopromotion boundary. | answer_correctness, evidence_grounding, lifecycle_behavior, trap_avoidance. | PageIndex, ELF. | | `operator_debugging_ux` | Show whether a wrong or ambiguous memory result can be debugged without raw store spelunking. 
| Explain why a result ranked first; inspect a trace; identify which stage dropped expected evidence. | Trace bundle, retrieval trajectory, candidate metrics, viewer or CLI readback. | debuggability, evidence_grounding, workflow_helpfulness, answer_correctness. | claude-mem, qmd, agentmemory, ELF. | | `capture_integration` | Evaluate how accurately work observations become usable memory across agents and tools. | Capture a session decision; exclude private spans; import external agent observations. | Hook/import logs, write policy audits, excluded spans, resulting note ids. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior. | agentmemory, claude-mem, memsearch, mem0. | -| `production_ops` | Prove safe operation under backup, restore, backfill, cold start, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence. | lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | +| `production_ops` | Prove safe operation under backup, restore, backfill, cold start, authority recovery, resource, and credential boundaries. | Resume interrupted import; restore from backup; report missing private manifest as bounded caveat; report authority-plane degraded read and replay drills. | Command/report artifacts, resource envelope, checkpoint state, failure guard evidence, authority record counts, RPO/RTO measurements, degraded-read labels. | lifecycle_behavior, latency_resource, uncertainty_handling, evidence_grounding. | ELF, qmd, memsearch, LangGraph. | | `personalization` | Apply user/project preferences correctly without leaking across scopes or overfitting stale preferences. | Remember preferred response style; avoid using another project tenant's note; update a preference. 
| Scoped memory ids, preference versions, tenant/project/agent context, negative cross-scope traps. | personalization_fit, trap_avoidance, evidence_grounding, answer_correctness. | mem0, Letta, agentmemory, ELF. | | `core_archival_memory` | Verify always-loaded core memory behavior separately from archival note search and derived retrieval indexes. | Read an attached core block; enforce core block scope; detect stale core state from archival evidence; fall back to archival notes; recover a decision from core routing plus archival rationale. | Core block ids, attachment ids, read_profile/scope metadata, source_ref and audit history, archival note evidence ids, stale-core traps, and explicit no-Qdrant-core-block boundary evidence. | answer_correctness, evidence_grounding, trap_avoidance, lifecycle_behavior, workflow_helpfulness. | Letta, ELF. | | `context_trajectory` | Measure staged context trajectory, hierarchy selection, and recursive/context expansion without converting setup or retrieval preconditions into trajectory wins. | Explain whether a staged trajectory can be scored; identify selected hierarchy nodes; report recursive expansion paths and pruned branches. | Same-corpus expected evidence ids, matched/missing evidence ids, stage artifacts, selected hierarchy nodes, rejected siblings or decoys, expansion paths, pruned branches, comparable ELF trace/session artifacts when a comparison is claimed. | answer_correctness, evidence_grounding, trap_avoidance, debuggability, workflow_helpfulness. | OpenViking, ELF, qmd. | @@ -690,9 +718,13 @@ Reports MUST include: separating `local_fixture`, `public_proxy`, `private_corpus`, and `provider_backed` tiers. The gates MUST report tier status, job counts, pass and typed non-pass counts, mean latency, cost summary, resource-envelope counts, - cold-start/restore/Qdrant-rebuild counts, typed blocker reasons, and explicit - booleans for whether private-corpus or provider-backed pass claims are allowed. 
- Local fixture and public-proxy passes MUST NOT satisfy private-corpus or + cold-start/restore/Qdrant-rebuild counts, authority recovery drill counts, + topology coverage, failure-injection counts, degraded-read label counts, visible + source-of-truth counts, RPO/RTO target and met counts, source-ref and lifecycle + preservation counts, idempotent replay counts, complete Qdrant rebuild counts, + migration repair counts, dead-letter handling counts, typed blocker reasons, and + explicit booleans for whether private-corpus or provider-backed pass claims are + allowed. Local fixture and public-proxy passes MUST NOT satisfy private-corpus or provider-backed proof. - run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; From 25637affa0c9f34029a8dac9f66f49aa5fa2944b Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Sat, 27 Jun 2026 21:44:14 +0800 Subject: [PATCH 2/3] {"schema":"decodex/commit/1","summary":"Tighten authority recovery drill preservation checks","authority":"XY-1119","related":["XY-1098"]} --- .../src/bin/real_world_job_benchmark.rs | 24 +++++- .../tests/real_world_job_benchmark.rs | 83 +++++++++++++++++++ ...27-authority-recovery-drill-drift-audit.md | 16 ++-- .../real_world_agent_memory_benchmark.md | 12 +-- .../real_world_agent_memory_benchmark_v1.md | 20 +++-- 5 files changed, 133 insertions(+), 22 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index 6deca7ba..ebc24436 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -1210,11 +1210,13 @@ struct OperationalAuthorityRecoveryReport { failure_injection_count: usize, degraded_read_labeled_count: usize, source_of_truth_visible_count: usize, + backup_pitr_restored_count: usize, rpo_target_count: usize, rpo_met_count: usize, 
rto_target_count: usize, rto_met_count: usize, authority_plane_count: usize, + record_count_preserved_count: usize, source_ref_preserved_count: usize, lifecycle_history_preserved_count: usize, idempotent_outbox_replay_count: usize, @@ -3372,6 +3374,7 @@ fn validate_recovery_backup_pitr( if backup_pitr.backup_ref.trim().is_empty() || backup_pitr.pitr_target.trim().is_empty() || backup_pitr.evidence_refs.is_empty() + || !backup_pitr.restored { return Err(eyre::eyre!("{} has incomplete backup/PITR drill evidence.", path.display())); } @@ -3442,6 +3445,14 @@ fn validate_recovery_authority_record_counts( drill.drill_id )); } + if count.before_count != count.after_count { + return Err(eyre::eyre!( + "{} authority recovery drill {} lost or gained {} authority records.", + path.display(), + drill.drill_id, + count.plane + )); + } ensure_known_evidence_refs(path, evidence_ids, &count.evidence_refs)?; } @@ -7156,6 +7167,10 @@ fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthority .iter() .filter(|drill| drill.degraded_read.source_of_truth_visible) .count(), + backup_pitr_restored_count: drills + .iter() + .filter(|drill| drill.backup_pitr.restored) + .count(), rpo_target_count: drills.len(), rpo_met_count: drills .iter() @@ -7167,6 +7182,10 @@ fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthority .filter(|drill| drill.rto.measured_seconds <= drill.rto.target_seconds) .count(), authority_plane_count: authority_counts.len(), + record_count_preserved_count: authority_counts + .iter() + .filter(|count| count.before_count == count.after_count) + .count(), source_ref_preserved_count: authority_counts .iter() .filter(|count| count.source_refs_preserved) @@ -8568,17 +8587,20 @@ fn render_markdown_operational_evidence(out: &mut String, report: &RealWorldRepo evidence.cold_start_restore_rebuild.qdrant_rebuild_job_count )); out.push_str(&format!( - "- Authority recovery drills: `{}`/`{}` pass, topology `{}`, failure injections `{}`, 
degraded reads labeled `{}`, source-of-truth visible `{}`, RPO `{}`/`{}` met, RTO `{}`/`{}` met, source refs `{}`/`{}` preserved, lifecycle histories `{}`/`{}` preserved, idempotent replay `{}`, complete Qdrant rebuild `{}`, migration repair `{}`, dead-letter handled `{}`\n\n", + "- Authority recovery drills: `{}`/`{}` pass, topology `{}`, failure injections `{}`, backup/PITR restored `{}`, degraded reads labeled `{}`, source-of-truth visible `{}`, RPO `{}`/`{}` met, RTO `{}`/`{}` met, record counts `{}`/`{}` preserved, source refs `{}`/`{}` preserved, lifecycle histories `{}`/`{}` preserved, idempotent replay `{}`, complete Qdrant rebuild `{}`, migration repair `{}`, dead-letter handled `{}`\n\n", evidence.authority_recovery.drill_pass_count, evidence.authority_recovery.drill_count, evidence.authority_recovery.topology_reported_count, evidence.authority_recovery.failure_injection_count, + evidence.authority_recovery.backup_pitr_restored_count, evidence.authority_recovery.degraded_read_labeled_count, evidence.authority_recovery.source_of_truth_visible_count, evidence.authority_recovery.rpo_met_count, evidence.authority_recovery.rpo_target_count, evidence.authority_recovery.rto_met_count, evidence.authority_recovery.rto_target_count, + evidence.authority_recovery.record_count_preserved_count, + evidence.authority_recovery.authority_plane_count, evidence.authority_recovery.source_ref_preserved_count, evidence.authority_recovery.authority_plane_count, evidence.authority_recovery.lifecycle_history_preserved_count, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 23106d28..eee42734 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -484,6 +484,22 @@ fn run_json_report_from(fixtures: PathBuf) -> Result { Ok(serde_json::from_slice(&output.stdout)?) 
} +fn run_json_report_from_failure(fixtures: PathBuf) -> Result { + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(fixtures) + .output()?; + + assert!( + !output.status.success(), + "real_world_job runner unexpectedly passed: {}", + String::from_utf8_lossy(&output.stdout), + ); + + Ok(String::from_utf8_lossy(&output.stderr).to_string()) +} + fn run_json_report() -> Result { run_json_report_from(fixture_dir()) } @@ -8039,6 +8055,18 @@ fn assert_authority_recovery_operational_evidence(report: &Value) { .and_then(Value::as_u64), Some(7) ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/backup_pitr_restored_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/record_count_preserved_count") + .and_then(Value::as_u64), + Some(7) + ); assert_eq!( report .pointer("/operational_evidence/authority_recovery/source_ref_preserved_count") @@ -8089,6 +8117,61 @@ fn assert_authority_recovery_operational_evidence(report: &Value) { ); } +#[test] +fn authority_recovery_fixture_rejects_unrestored_backup_or_record_count_loss() -> Result<()> { + assert_authority_recovery_fixture_failure( + "unrestored-backup", + |fixture| { + set_json_pointer( + fixture, + "/corpus/adapter_response/answer/recovery_drills/0/backup_pitr/restored", + serde_json::json!(false), + ) + }, + "incomplete backup/PITR drill evidence", + )?; + + assert_authority_recovery_fixture_failure( + "record-count-loss", + |fixture| { + set_json_pointer( + fixture, + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/after_count", + serde_json::json!(2), + ) + }, + "lost or gained source authority records", + ) +} + +fn assert_authority_recovery_fixture_failure( + slug: &str, + mutate: F, + expected_error: &str, +) -> Result<()> +where + F: FnOnce(&mut Value) -> Result<()>, +{ + let fixture_path = 
production_ops_fixture_dir().join("authority_plane_recovery_drill.json"); + let mut fixture = load_json(&fixture_path)?; + + mutate(&mut fixture)?; + + let temp_dir = env::temp_dir().join(format!("elf-authority-recovery-{slug}-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("fixture.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let stderr = run_json_report_from_failure(temp_dir)?; + + assert!( + stderr.contains(expected_error), + "missing expected error `{expected_error}` in stderr: {stderr}", + ); + + Ok(()) +} + #[test] fn core_archival_memory_fixtures_score_separate_core_and_archival_jobs() -> Result<()> { let report = run_json_report_from(core_archival_memory_fixture_dir())?; diff --git a/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md b/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md index a923a760..0677d278 100644 --- a/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md +++ b/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md @@ -41,12 +41,14 @@ or multi-region failover evidence. - `elf.authority_recovery_drill/v1` is a benchmark artifact under `adapter_response.answer.recovery_drills[]`. -- The runner validates drill topology, failure injections, backup/PITR evidence, - degraded-read labels, RPO/RTO measurements, authority record counts for source, - journal, memory, knowledge, proposal, trace, and audit planes, idempotent outbox - replay, Qdrant rebuild completeness, migration repair, and dead-letter handling. +- The runner validates drill topology, failure injections, backup/PITR restored + evidence, degraded-read labels, RPO/RTO measurements, matching authority record + counts for source, journal, memory, knowledge, proposal, trace, and audit planes, + idempotent outbox replay, Qdrant rebuild completeness, migration repair, and + dead-letter handling. - Reports expose those drill counts through - `operational_evidence.authority_recovery`. 
+ `operational_evidence.authority_recovery`, including backup/PITR restored and + record-count preservation counters. - The checked-in fixture is local synthetic evidence only. It does not prove private corpus quality, provider-backed behavior, hosted HA, standby failover, or multi-region SLA. @@ -58,8 +60,8 @@ or multi-region failover evidence. `OperationalAuthorityRecoveryReport`. - `apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json` encodes one production-ops job with topology, degraded-read labels, RPO/RTO, - before/after authority record counts, replay, rebuild, migration repair, and - dead-letter evidence. + matching before/after authority record counts, replay, rebuild, migration repair, + and dead-letter evidence. - `docs/spec/real_world_agent_memory_benchmark_v1.md` defines the artifact schema and production-ops/report semantics. - `docs/runbook/benchmarking/real_world_agent_memory_benchmark.md` routes operators to diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index 16c883f3..70c1cc98 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -711,11 +711,13 @@ from Postgres-held vectors, cold-start search recovery, recoverable authority-pl drills, and resource-envelope interpretation. Authority recovery drills use `elf.authority_recovery_drill/v1` under `adapter_response.answer.recovery_drills[]` to report topology, failure injection, backup/PITR, degraded-read labels, RPO/RTO -targets and measurements, before/after authority record counts, idempotent outbox -replay, Qdrant rebuild completeness, migration repair, and dead-letter handling. 
The -P4 slice also encodes the operator-approved public-proxy production-private addendum -and emits `elf.operational_evidence_gates/v1` so local fixture, public-proxy, -private-corpus, and provider-backed evidence remain separate. +targets and measurements, matching before/after authority record counts, idempotent +outbox replay, Qdrant rebuild completeness, migration repair, and dead-letter +handling. The generated `operational_evidence.authority_recovery` report includes +backup/PITR restored and record-count preservation counters. The P4 slice also +encodes the operator-approved public-proxy production-private addendum and emits +`elf.operational_evidence_gates/v1` so local fixture, public-proxy, private-corpus, +and provider-backed evidence remain separate. The same slice deliberately keeps non-pass boundaries typed. A missing private production manifest is `blocked`, unavailable provider credentials are `blocked`, and diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index be6ab55c..cec4cc59 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -464,13 +464,14 @@ Each recovery drill MUST include: - `topology` with the authority store, derived indexes, adapters, and failover boundary; - one or more `failure_injections` with target, fault, timestamps, and evidence refs; -- `backup_pitr` with backup reference, PITR target, restored flag, and evidence refs; +- `backup_pitr` with backup reference, PITR target, `restored = true`, and evidence + refs; - `degraded_read` with unavailable derived indexes or adapters labeled separately from visible source-of-truth records; - `rpo` and `rto` targets and measured seconds with evidence refs; - `authority_record_counts` for `source`, `journal`, `memory`, `knowledge`, - `proposal`, `trace`, and `audit`, including before/after counts plus source-ref - and lifecycle-history preservation booleans; 
+ `proposal`, `trace`, and `audit`, including matching before/after counts plus + source-ref and lifecycle-history preservation booleans; - `outbox_replay`, `qdrant_rebuild`, `migration_repair`, and `dead_letter` sections with evidence refs. @@ -720,12 +721,13 @@ Reports MUST include: typed non-pass counts, mean latency, cost summary, resource-envelope counts, cold-start/restore/Qdrant-rebuild counts, authority recovery drill counts, topology coverage, failure-injection counts, degraded-read label counts, visible - source-of-truth counts, RPO/RTO target and met counts, source-ref and lifecycle - preservation counts, idempotent replay counts, complete Qdrant rebuild counts, - migration repair counts, dead-letter handling counts, typed blocker reasons, and - explicit booleans for whether private-corpus or provider-backed pass claims are - allowed. Local fixture and public-proxy passes MUST NOT satisfy private-corpus or - provider-backed proof. + source-of-truth counts, backup/PITR restored counts, RPO/RTO target and met counts, + authority record-count preservation counts, source-ref and lifecycle preservation + counts, idempotent replay counts, complete Qdrant rebuild counts, migration repair + counts, dead-letter handling counts, typed blocker reasons, and explicit booleans + for whether private-corpus or provider-backed pass claims are allowed. Local + fixture and public-proxy passes MUST NOT satisfy private-corpus or provider-backed + proof. 
- run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; - per-job `answer_type`, required caveat/refusal flags, and whether an unknown answer From eb377395081f63b8355de27d2b5ef319301aa967 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Sat, 27 Jun 2026 21:59:32 +0800 Subject: [PATCH 3/3] {"schema":"decodex/commit/1","summary":"Gate authority recovery drill passes on recovery predicates","authority":"XY-1119","related":["XY-1098"]} --- .../src/bin/real_world_job_benchmark.rs | 361 +++++++++++------- .../tests/real_world_job_benchmark.rs | 97 +++-- ...27-authority-recovery-drill-drift-audit.md | 14 +- .../real_world_agent_memory_benchmark.md | 11 +- .../real_world_agent_memory_benchmark_v1.md | 18 +- 5 files changed, 317 insertions(+), 184 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index ebc24436..3101070a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -79,6 +79,8 @@ const SCOREBOARD_EVIDENCE_CLASSES: &[&str] = &["fixture_backed", "live_baseline", "live_real_world", "research_gate"]; const OPERATIONAL_EVIDENCE_TIERS: &[&str] = &["local_fixture", "public_proxy", "private_corpus", "provider_backed"]; +const REQUIRED_AUTHORITY_PLANES: [&str; 7] = + ["source", "journal", "memory", "knowledge", "proposal", "trace", "audit"]; #[derive(Debug, Parser)] #[command( @@ -91,15 +93,6 @@ struct Args { command: Command, } -#[derive(Debug, Subcommand)] -#[command(rename_all = "kebab")] -enum Command { - /// Parse and score real_world_job fixtures, then emit a JSON report. - Run(RunArgs), - /// Render Markdown from a generated real_world_job JSON report. - Publish(PublishArgs), -} - #[derive(Debug, Parser)] struct RunArgs { /// Fixture file or directory containing real_world_job JSON fixtures. 
@@ -188,25 +181,6 @@ struct Corpus { adapter_response: Option, } -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -enum CorpusProfile { - Synthetic, - PrivateSanitized, - GeneratedPublic, - ExternalAdapter, -} -impl CorpusProfile { - fn as_str(&self) -> &'static str { - match self { - Self::Synthetic => "synthetic", - Self::PrivateSanitized => "private_sanitized", - Self::GeneratedPublic => "generated_public", - Self::ExternalAdapter => "external_adapter", - } - } -} - #[derive(Debug, Deserialize)] struct CorpusItem { evidence_id: String, @@ -258,43 +232,6 @@ struct ExpectedAnswer { requires_refusal: bool, } -#[derive(Clone, Debug, Deserialize)] -#[serde(untagged)] -enum ExpectedClaim { - Text(String), - Object { claim_id: Option, text: String }, -} -impl ExpectedClaim { - fn claim_id(&self) -> Option<&str> { - match self { - Self::Text(_) => None, - Self::Object { claim_id, .. } => claim_id.as_deref(), - } - } - - fn text(&self) -> &str { - match self { - Self::Text(text) => text, - Self::Object { text, .. 
} => text, - } - } -} - -#[derive(Clone, Debug, Deserialize)] -#[serde(untagged)] -enum EvidenceLink { - One(String), - Many(Vec), -} -impl EvidenceLink { - fn ids(&self) -> BTreeSet { - match self { - Self::One(id) => BTreeSet::from([id.clone()]), - Self::Many(ids) => ids.iter().cloned().collect(), - } - } -} - #[derive(Debug, Deserialize)] struct RequiredEvidence { evidence_id: String, @@ -968,14 +905,6 @@ struct ConsolidationProposalFixture { diff: Value, } -#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -enum ConsolidationReviewAction { - Apply, - Discard, - Defer, -} - #[derive(Clone, Debug, Deserialize)] struct ConsolidationExecutableGap { primitive: String, @@ -1065,18 +994,6 @@ struct TraceStageExplainability { notes: Option, } -#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -enum TypedStatus { - Pass, - WrongResult, - LifecycleFail, - Incomplete, - Blocked, - NotEncoded, - UnsupportedClaim, -} - #[derive(Debug, Deserialize, Serialize)] struct RealWorldReport { schema: String, @@ -1237,40 +1154,6 @@ struct AdapterReport { notes: String, } -#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -enum AdapterCoverageStatus { - Real, - Mocked, - Unsupported, - Blocked, - Incomplete, - WrongResult, - LifecycleFail, - Pass, - NotEncoded, -} - -#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -enum ElfScenarioPosition { - Wins, - Ties, - Loses, - Untested, -} - -#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -enum ScenarioComparisonOutcome { - Win, - Tie, - Loss, - NotTested, - Blocked, - NonGoal, -} - #[derive(Debug, Deserialize)] struct ExternalAdapterManifest { schema: String, @@ -2196,6 +2079,125 @@ struct 
JobMetrics { qdrant_rebuild_case: bool, } +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum Command { + /// Parse and score real_world_job fixtures, then emit a JSON report. + Run(RunArgs), + /// Render Markdown from a generated real_world_job JSON report. + Publish(PublishArgs), +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum CorpusProfile { + Synthetic, + PrivateSanitized, + GeneratedPublic, + ExternalAdapter, +} +impl CorpusProfile { + fn as_str(&self) -> &'static str { + match self { + Self::Synthetic => "synthetic", + Self::PrivateSanitized => "private_sanitized", + Self::GeneratedPublic => "generated_public", + Self::ExternalAdapter => "external_adapter", + } + } +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +enum ExpectedClaim { + Text(String), + Object { claim_id: Option, text: String }, +} +impl ExpectedClaim { + fn claim_id(&self) -> Option<&str> { + match self { + Self::Text(_) => None, + Self::Object { claim_id, .. } => claim_id.as_deref(), + } + } + + fn text(&self) -> &str { + match self { + Self::Text(text) => text, + Self::Object { text, .. 
} => text, + } + } +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +enum EvidenceLink { + One(String), + Many(Vec), +} +impl EvidenceLink { + fn ids(&self) -> BTreeSet { + match self { + Self::One(id) => BTreeSet::from([id.clone()]), + Self::Many(ids) => ids.iter().cloned().collect(), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ConsolidationReviewAction { + Apply, + Discard, + Defer, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum TypedStatus { + Pass, + WrongResult, + LifecycleFail, + Incomplete, + Blocked, + NotEncoded, + UnsupportedClaim, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum AdapterCoverageStatus { + Real, + Mocked, + Unsupported, + Blocked, + Incomplete, + WrongResult, + LifecycleFail, + Pass, + NotEncoded, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ElfScenarioPosition { + Wins, + Ties, + Loses, + Untested, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum ScenarioComparisonOutcome { + Win, + Tie, + Loss, + NotTested, + Blocked, + NonGoal, +} + fn main() -> Result<()> { color_eyre::install()?; @@ -3392,6 +3394,12 @@ fn validate_recovery_degraded_read( if degraded_read.unavailable_labels.is_empty() || degraded_read.evidence_refs.is_empty() { return Err(eyre::eyre!("{} has incomplete degraded-read drill evidence.", path.display())); } + if !degraded_read.source_of_truth_visible { + return Err(eyre::eyre!( + "{} has hidden source-of-truth records during degraded read.", + path.display() + )); + } ensure_known_evidence_refs(path, evidence_ids, &degraded_read.evidence_refs) } @@ -3410,6 +3418,9 @@ fn
validate_recovery_measurement( { return Err(eyre::eyre!("{} has invalid {label} recovery measurement.", path.display())); } + if !recovery_measurement_met(measurement) { + return Err(eyre::eyre!("{} exceeded {label} recovery target.", path.display())); + } ensure_known_evidence_refs(path, evidence_ids, &measurement.evidence_refs) } @@ -3419,15 +3430,13 @@ fn validate_recovery_authority_record_counts( path: &Path, evidence_ids: &BTreeSet, ) -> Result<()> { - let required_planes = - ["source", "journal", "memory", "knowledge", "proposal", "trace", "audit"]; let present_planes = drill .authority_record_counts .iter() .map(|count| count.plane.as_str()) .collect::>(); - for plane in required_planes { + for plane in REQUIRED_AUTHORITY_PLANES { if !present_planes.contains(plane) { return Err(eyre::eyre!( "{} authority recovery drill {} is missing {} authority counts.", @@ -3453,6 +3462,22 @@ fn validate_recovery_authority_record_counts( count.plane )); } + if !count.source_refs_preserved { + return Err(eyre::eyre!( + "{} authority recovery drill {} did not preserve {} authority source refs.", + path.display(), + drill.drill_id, + count.plane + )); + } + if !count.lifecycle_history_preserved { + return Err(eyre::eyre!( + "{} authority recovery drill {} did not preserve {} authority lifecycle history.", + path.display(), + drill.drill_id, + count.plane + )); + } ensure_known_evidence_refs(path, evidence_ids, &count.evidence_refs)?; } @@ -3465,7 +3490,7 @@ fn validate_recovery_outbox_replay( path: &Path, evidence_ids: &BTreeSet, ) -> Result<()> { - if replay.evidence_refs.is_empty() { + if replay.evidence_refs.is_empty() || !recovery_outbox_replay_succeeded(replay) { return Err(eyre::eyre!("{} has incomplete outbox replay drill evidence.", path.display())); } @@ -3477,7 +3502,7 @@ fn validate_recovery_qdrant_rebuild( path: &Path, evidence_ids: &BTreeSet, ) -> Result<()> { - if rebuild.evidence_refs.is_empty() { + if rebuild.evidence_refs.is_empty() || 
!recovery_qdrant_rebuild_succeeded(rebuild) { return Err(eyre::eyre!( "{} has incomplete Qdrant rebuild drill evidence.", path.display() @@ -3492,7 +3517,7 @@ fn validate_recovery_migration_repair( path: &Path, evidence_ids: &BTreeSet, ) -> Result<()> { - if repair.evidence_refs.is_empty() { + if repair.evidence_refs.is_empty() || !recovery_migration_repair_succeeded(repair) { return Err(eyre::eyre!( "{} has incomplete migration repair drill evidence.", path.display() @@ -3507,7 +3532,7 @@ fn validate_recovery_dead_letter( path: &Path, evidence_ids: &BTreeSet, ) -> Result<()> { - if dead_letter.evidence_refs.is_empty() { + if dead_letter.evidence_refs.is_empty() || !recovery_dead_letter_succeeded(dead_letter) { return Err(eyre::eyre!( "{} has incomplete dead-letter handling drill evidence.", path.display() @@ -3517,6 +3542,59 @@ fn validate_recovery_dead_letter( ensure_known_evidence_refs(path, evidence_ids, &dead_letter.evidence_refs) } +fn recovery_drill_succeeded(drill: &AuthorityRecoveryDrillArtifact) -> bool { + drill.backup_pitr.restored + && drill.degraded_read.source_of_truth_visible + && recovery_measurement_met(&drill.rpo) + && recovery_measurement_met(&drill.rto) + && recovery_authority_record_counts_succeeded(drill) + && recovery_outbox_replay_succeeded(&drill.outbox_replay) + && recovery_qdrant_rebuild_succeeded(&drill.qdrant_rebuild) + && recovery_migration_repair_succeeded(&drill.migration_repair) + && recovery_dead_letter_succeeded(&drill.dead_letter) +} + +fn recovery_measurement_met(measurement: &RecoveryMeasurement) -> bool { + measurement.measured_seconds <= measurement.target_seconds +} + +fn recovery_authority_record_counts_succeeded(drill: &AuthorityRecoveryDrillArtifact) -> bool { + let present_planes = drill + .authority_record_counts + .iter() + .map(|count| count.plane.as_str()) + .collect::>(); + + REQUIRED_AUTHORITY_PLANES.iter().all(|plane| present_planes.contains(*plane)) + && 
drill.authority_record_counts.iter().all(authority_record_count_succeeded) +} + +fn authority_record_count_succeeded(count: &AuthorityRecordCount) -> bool { + authority_record_count_balanced(count) + && count.source_refs_preserved + && count.lifecycle_history_preserved +} + +fn authority_record_count_balanced(count: &AuthorityRecordCount) -> bool { + count.before_count == count.after_count +} + +fn recovery_outbox_replay_succeeded(replay: &RecoveryOutboxReplay) -> bool { + replay.idempotent && replay.duplicate_write_count == 0 +} + +fn recovery_qdrant_rebuild_succeeded(rebuild: &RecoveryQdrantRebuild) -> bool { + rebuild.complete && rebuild.missing_vector_count == 0 && rebuild.error_count == 0 +} + +fn recovery_migration_repair_succeeded(repair: &RecoveryMigrationRepair) -> bool { + repair.applied +} + +fn recovery_dead_letter_succeeded(dead_letter: &RecoveryDeadLetterHandling) -> bool { + dead_letter.handled_count >= dead_letter.dead_letter_count +} + fn ensure_known_evidence_refs( path: &Path, evidence_ids: &BTreeSet, @@ -7152,8 +7230,9 @@ fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthority drill_pass_count: recovery_jobs .iter() .filter(|report| report.status == TypedStatus::Pass) - .map(|report| report.recovery_drills.len()) - .sum(), + .flat_map(|report| report.recovery_drills.iter()) + .filter(|drill| recovery_drill_succeeded(drill)) + .count(), topology_reported_count: drills .iter() .filter(|drill| !drill.topology.authority_store.trim().is_empty()) @@ -7172,19 +7251,13 @@ fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthority .filter(|drill| drill.backup_pitr.restored) .count(), rpo_target_count: drills.len(), - rpo_met_count: drills - .iter() - .filter(|drill| drill.rpo.measured_seconds <= drill.rpo.target_seconds) - .count(), + rpo_met_count: drills.iter().filter(|drill| recovery_measurement_met(&drill.rpo)).count(), rto_target_count: drills.len(), - rto_met_count: drills - .iter() - .filter(|drill| 
drill.rto.measured_seconds <= drill.rto.target_seconds) - .count(), + rto_met_count: drills.iter().filter(|drill| recovery_measurement_met(&drill.rto)).count(), authority_plane_count: authority_counts.len(), record_count_preserved_count: authority_counts .iter() - .filter(|count| count.before_count == count.after_count) + .filter(|count| authority_record_count_balanced(count)) .count(), source_ref_preserved_count: authority_counts .iter() @@ -7196,25 +7269,19 @@ fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthority .count(), idempotent_outbox_replay_count: drills .iter() - .filter(|drill| { - drill.outbox_replay.idempotent && drill.outbox_replay.duplicate_write_count == 0 - }) + .filter(|drill| recovery_outbox_replay_succeeded(&drill.outbox_replay)) .count(), qdrant_rebuild_complete_count: drills .iter() - .filter(|drill| { - drill.qdrant_rebuild.complete - && drill.qdrant_rebuild.missing_vector_count == 0 - && drill.qdrant_rebuild.error_count == 0 - }) + .filter(|drill| recovery_qdrant_rebuild_succeeded(&drill.qdrant_rebuild)) .count(), migration_repair_count: drills .iter() - .filter(|drill| drill.migration_repair.applied) + .filter(|drill| recovery_migration_repair_succeeded(&drill.migration_repair)) .count(), dead_letter_handled_count: drills .iter() - .filter(|drill| drill.dead_letter.handled_count >= drill.dead_letter.dead_letter_count) + .filter(|drill| recovery_dead_letter_succeeded(&drill.dead_letter)) .count(), job_ids, } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index eee42734..712965b8 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -8118,30 +8118,81 @@ fn assert_authority_recovery_operational_evidence(report: &Value) { } #[test] -fn authority_recovery_fixture_rejects_unrestored_backup_or_record_count_loss() -> Result<()> { - assert_authority_recovery_fixture_failure( - "unrestored-backup", 
- |fixture| { - set_json_pointer( - fixture, - "/corpus/adapter_response/answer/recovery_drills/0/backup_pitr/restored", - serde_json::json!(false), - ) - }, - "incomplete backup/PITR drill evidence", - )?; +fn authority_recovery_fixture_rejects_incomplete_recovery_predicates() -> Result<()> { + for (slug, pointer, replacement, expected_error) in authority_recovery_failure_cases() { + assert_authority_recovery_fixture_failure( + slug, + |fixture| set_json_pointer(fixture, pointer, replacement), + expected_error, + )?; + } - assert_authority_recovery_fixture_failure( - "record-count-loss", - |fixture| { - set_json_pointer( - fixture, - "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/after_count", - serde_json::json!(2), - ) - }, - "lost or gained source authority records", - ) + Ok(()) +} + +fn authority_recovery_failure_cases() -> Vec<(&'static str, &'static str, Value, &'static str)> { + vec![ + ( + "unrestored-backup", + "/corpus/adapter_response/answer/recovery_drills/0/backup_pitr/restored", + serde_json::json!(false), + "incomplete backup/PITR drill evidence", + ), + ( + "record-count-loss", + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/after_count", + serde_json::json!(2), + "lost or gained source authority records", + ), + ( + "source-ref-loss", + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/source_refs_preserved", + serde_json::json!(false), + "did not preserve source authority source refs", + ), + ( + "lifecycle-history-loss", + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/lifecycle_history_preserved", + serde_json::json!(false), + "did not preserve source authority lifecycle history", + ), + ( + "hidden-source-of-truth", + "/corpus/adapter_response/answer/recovery_drills/0/degraded_read/source_of_truth_visible", + serde_json::json!(false), + "hidden source-of-truth records during degraded read", + ), + ( + "rpo-miss", + 
"/corpus/adapter_response/answer/recovery_drills/0/rpo/measured_seconds", + serde_json::json!(61.0), + "exceeded rpo recovery target", + ), + ( + "non-idempotent-outbox", + "/corpus/adapter_response/answer/recovery_drills/0/outbox_replay/duplicate_write_count", + serde_json::json!(1), + "incomplete outbox replay drill evidence", + ), + ( + "incomplete-qdrant-rebuild", + "/corpus/adapter_response/answer/recovery_drills/0/qdrant_rebuild/complete", + serde_json::json!(false), + "incomplete Qdrant rebuild drill evidence", + ), + ( + "missing-migration-repair", + "/corpus/adapter_response/answer/recovery_drills/0/migration_repair/applied", + serde_json::json!(false), + "incomplete migration repair drill evidence", + ), + ( + "dead-letter-underhandled", + "/corpus/adapter_response/answer/recovery_drills/0/dead_letter/handled_count", + serde_json::json!(1), + "incomplete dead-letter handling drill evidence", + ), + ] } fn assert_authority_recovery_fixture_failure( diff --git a/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md b/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md index 0677d278..e9abfb1b 100644 --- a/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md +++ b/docs/evidence/2026-06-27-authority-recovery-drill-drift-audit.md @@ -42,13 +42,15 @@ or multi-region failover evidence. - `elf.authority_recovery_drill/v1` is a benchmark artifact under `adapter_response.answer.recovery_drills[]`. - The runner validates drill topology, failure injections, backup/PITR restored - evidence, degraded-read labels, RPO/RTO measurements, matching authority record - counts for source, journal, memory, knowledge, proposal, trace, and audit planes, - idempotent outbox replay, Qdrant rebuild completeness, migration repair, and - dead-letter handling. 
+ evidence, degraded-read labels with visible source-of-truth records, RPO/RTO + measurements that meet targets, matching authority record counts for source, + journal, memory, knowledge, proposal, trace, and audit planes, preserved source + refs and lifecycle history, idempotent outbox replay without duplicate writes, + Qdrant rebuild completeness without missing vectors or errors, applied migration + repair, and dead-letter handling. - Reports expose those drill counts through - `operational_evidence.authority_recovery`, including backup/PITR restored and - record-count preservation counters. + `operational_evidence.authority_recovery`, including backup/PITR restored, + record-count preservation, and predicate-gated drill pass counters. - The checked-in fixture is local synthetic evidence only. It does not prove private corpus quality, provider-backed behavior, hosted HA, standby failover, or multi-region SLA. diff --git a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md index 70c1cc98..a83c33f1 100644 --- a/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/runbook/benchmarking/real_world_agent_memory_benchmark.md @@ -713,8 +713,15 @@ drills, and resource-envelope interpretation. Authority recovery drills use to report topology, failure injection, backup/PITR, degraded-read labels, RPO/RTO targets and measurements, matching before/after authority record counts, idempotent outbox replay, Qdrant rebuild completeness, migration repair, and dead-letter -handling. The generated `operational_evidence.authority_recovery` report includes -backup/PITR restored and record-count preservation counters. The P4 slice also +handling. 
The runner fails drills whose predicates are false: backup/PITR must be +restored, source-of-truth records must stay visible during degraded reads, RPO/RTO +measurements must meet targets, authority counts/source refs/lifecycle history must +be preserved, outbox replay must be idempotent without duplicate writes, Qdrant +rebuilds must complete without missing vectors or errors, migration repair must be +applied, and dead-letter rows must be handled. The generated +`operational_evidence.authority_recovery` report includes backup/PITR restored, +record-count preservation, and per-predicate recovery counters; drill pass counts +require both a passing job and successful recovery predicates. The P4 slice also encodes the operator-approved public-proxy production-private addendum and emits `elf.operational_evidence_gates/v1` so local fixture, public-proxy, private-corpus, and provider-backed evidence remain separate. diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index cec4cc59..120c7ee8 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -467,13 +467,18 @@ Each recovery drill MUST include: - `backup_pitr` with backup reference, PITR target, `restored = true`, and evidence refs; - `degraded_read` with unavailable derived indexes or adapters labeled separately - from visible source-of-truth records; -- `rpo` and `rto` targets and measured seconds with evidence refs; + from visible source-of-truth records, and `source_of_truth_visible = true`; +- `rpo` and `rto` targets and measured seconds with evidence refs, where measured + seconds are less than or equal to the target seconds; - `authority_record_counts` for `source`, `journal`, `memory`, `knowledge`, `proposal`, `trace`, and `audit`, including matching before/after counts plus - source-ref and lifecycle-history preservation booleans; -- `outbox_replay`, `qdrant_rebuild`, 
`migration_repair`, and `dead_letter` sections - with evidence refs. + `source_refs_preserved = true` and `lifecycle_history_preserved = true`; +- `outbox_replay` with `idempotent = true`, zero duplicate writes, and evidence refs; +- `qdrant_rebuild` with `complete = true`, zero missing vectors, zero errors, and + evidence refs; +- `migration_repair` with `applied = true` and evidence refs; +- `dead_letter` with handled count greater than or equal to dead-letter count and + evidence refs. A recovery drill MUST NOT claim failover unless a standby or replacement authority service is actually part of the topology. Qdrant and document indexes remain derived @@ -719,7 +724,8 @@ Reports MUST include: separating `local_fixture`, `public_proxy`, `private_corpus`, and `provider_backed` tiers. The gates MUST report tier status, job counts, pass and typed non-pass counts, mean latency, cost summary, resource-envelope counts, - cold-start/restore/Qdrant-rebuild counts, authority recovery drill counts, + cold-start/restore/Qdrant-rebuild counts, authority recovery drill counts where a + pass requires the job to pass and every drill predicate above to succeed, topology coverage, failure-injection counts, degraded-read label counts, visible source-of-truth counts, backup/PITR restored counts, RPO/RTO target and met counts, authority record-count preservation counts, source-ref and lifecycle preservation