From fb6464bd412df1cfa73335a6cb2ffa67b268f11b Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 30 Jun 2026 11:43:22 +0800 Subject: [PATCH 1/4] {"schema":"decodex/commit/1","summary":"Split job report contract modules","authority":"manual"} --- .../job_report_core.rs | 124 +++++ .../job_report_domain_metrics.rs | 157 ++++++ .../job_report_evolution.rs | 39 ++ .../job_report_misc.rs | 15 + .../job_report_scoring.rs | 109 +++++ .../real_world_job_benchmark/job_reports.rs | 446 +----------------- 6 files changed, 461 insertions(+), 429 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/job_report_domain_metrics.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/job_report_evolution.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/job_report_misc.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs new file mode 100644 index 00000000..d119db7f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs @@ -0,0 +1,124 @@ +use crate::{ + AuthorityRecoveryDrillArtifact, CostReport, Deserialize, OperatorDebugEvidence, Serialize, + TraceExplainability, TypedStatus, +}; + +use super::{ + ConsolidationJobReport, EvolutionJobReport, KnowledgeJobMetrics, MemorySummaryJobMetrics, + ProactiveBriefJobMetrics, ScheduledMemoryJobMetrics, WorkContinuityJobMetrics, +}; + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct JobReport { + pub(crate) suite_id: String, + pub(crate) job_id: String, + pub(crate) title: String, + pub(crate) status: TypedStatus, + pub(crate) operational_evidence_tier: String, + pub(crate) answer_type: String, + pub(crate) requires_caveat: bool, + pub(crate) requires_refusal: bool, + pub(crate) can_answer_unknown: bool, + pub(crate) normalized_score: f64, + pub(crate) hard_fail_hits: Vec, + pub(crate) expected_evidence: Vec, + pub(crate) produced_answer: String, + pub(crate) produced_evidence: Vec, + pub(crate) unsupported_claim_count: usize, + pub(crate) wrong_result_count: usize, + #[serde(default)] + pub(crate) stale_answer_count: usize, + #[serde(default)] + pub(crate) conflict_detection_count: usize, + #[serde(default)] + pub(crate) update_rationale_available: bool, + #[serde(default)] + pub(crate) temporal_validity_not_encoded: bool, + #[serde(default)] + pub(crate) history_readback_encoded: bool, + pub(crate) retrieval_quality: RetrievalQualityReport, + pub(crate) latency_ms: Option, + pub(crate) cost: Option, + pub(crate) trace_explainability: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) knowledge: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) memory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) proactive_brief: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) scheduled_memory: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) work_continuity: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub(crate) recovery_drills: Vec, + pub(crate) trap_ids_used: Vec, + pub(crate) dimension_scores: Vec, + pub(crate) reason: String, + #[serde(default)] + pub(crate) evidence_required_count: usize, + #[serde(default)] + pub(crate) evidence_covered_count: usize, + #[serde(default)] + pub(crate) source_ref_required_count: usize, + #[serde(default)] + pub(crate) source_ref_covered_count: usize, + #[serde(default)] + pub(crate) quote_required_count: usize, + #[serde(default)] + pub(crate) quote_covered_count: usize, + #[serde(default)] + pub(crate) stale_retrieval_count: usize, + #[serde(default)] + pub(crate) scope_check_count: usize, + #[serde(default)] + pub(crate) scope_correct_count: usize, + #[serde(default)] + pub(crate) scope_violation_count: usize, + #[serde(default)] + pub(crate) redaction_leak_count: usize, + #[serde(default)] + pub(crate) qdrant_rebuild_case: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) operator_debug: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) evolution: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) consolidation: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct ExpectedEvidenceReport { + pub(crate) evidence_id: String, + pub(crate) claim_id: String, + pub(crate) requirement: String, +} + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct DimensionScoreReport { + pub(crate) dimension: String, + pub(crate) score: f64, + pub(crate) max_points: f64, + pub(crate) weight: f64, +} + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct RetrievalQualityReport { + pub(crate) expected_evidence_total: usize, + pub(crate) expected_evidence_matched: usize, + pub(crate) expected_evidence_recall: f64, + pub(crate) produced_evidence_total: usize, + pub(crate) irrelevant_context_count: usize, + pub(crate) irrelevant_context_ratio: f64, + pub(crate) trap_context_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct UnsupportedClaimReport { + pub(crate) suite_id: String, + pub(crate) job_id: String, + pub(crate) claim_id: Option, + pub(crate) claim_text: String, + pub(crate) reason: String, + pub(crate) evidence_ids: Vec, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_domain_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_domain_metrics.rs new file mode 100644 index 00000000..f6f2ba8f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_domain_metrics.rs @@ -0,0 +1,157 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct KnowledgeJobMetrics { + pub(crate) page_count: usize, + pub(crate) section_count: usize, + pub(crate) traced_section_count: usize, + pub(crate) flagged_unsupported_section_count: usize, + pub(crate) untraced_section_count: usize, + pub(crate) unsupported_summary_count: usize, + pub(crate) backlink_count: usize, + pub(crate) pages_with_backlinks: usize, + pub(crate) pages_with_version_diff: usize, + pub(crate) stale_trap_count: usize, + pub(crate) stale_traps_detected: usize, + pub(crate) rebuild_page_count: usize, + pub(crate) deterministic_rebuild_count: usize, + pub(crate) rebuild_failure_count: usize, + pub(crate) allowed_variance_count: usize, + pub(crate) citation_coverage: f64, + pub(crate) stale_claim_detection: f64, + pub(crate) rebuild_determinism: f64, + pub(crate) backlink_coverage: f64, + pub(crate) version_diff_coverage: f64, + pub(crate) page_usefulness: f64, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct MemorySummaryJobMetrics { + pub(crate) summary_count: usize, + pub(crate) entry_count: usize, + pub(crate) required_category_count: usize, + pub(crate) covered_required_category_count: usize, + pub(crate) missing_required_category_count: usize, + pub(crate) top_of_mind_count: usize, + pub(crate) background_count: usize, + pub(crate) stale_count: usize, + pub(crate) superseded_count: usize, + pub(crate) tombstone_count: usize, + pub(crate) derived_project_profile_count: usize, + pub(crate) source_ref_required_count: usize, + pub(crate) source_ref_entry_count: usize, + pub(crate) source_ref_coverage: f64, + pub(crate) freshness_marker_count: usize, + pub(crate) freshness_coverage: f64, + pub(crate) rationale_count: usize, + pub(crate) rationale_coverage: f64, + pub(crate) invalid_top_of_mind_count: usize, + pub(crate) untraced_entry_count: usize, + pub(crate) derived_with_source_or_unsupported_count: usize, + pub(crate) derived_missing_source_or_unsupported_count: usize, + pub(crate) unsupported_derived_entry_count: usize, + pub(crate) unsupported_current_entry_count: usize, + pub(crate) tombstone_ref_count: usize, + pub(crate) source_trace_selected_count: usize, + pub(crate) source_trace_dropped_count: usize, + pub(crate) source_trace_stale_count: usize, + pub(crate) source_trace_superseded_count: usize, + pub(crate) source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ProactiveBriefJobMetrics { + pub(crate) brief_count: usize, + pub(crate) suggestion_count: usize, + pub(crate) required_suggestion_kind_count: usize, + pub(crate) covered_required_suggestion_kind_count: usize, + pub(crate) missing_required_suggestion_kind_count: usize, + pub(crate) evidence_ref_required_count: usize, + pub(crate) evidence_ref_suggestion_count: usize, + pub(crate) evidence_ref_coverage: f64, + pub(crate) freshness_marker_count: usize, + pub(crate) freshness_coverage: f64, + pub(crate) action_rationale_count: usize, + pub(crate) action_rationale_coverage: f64, + pub(crate) recommended_count: usize, + pub(crate) deferred_count: usize, + pub(crate) rejected_count: usize, + pub(crate) current_suggestion_count: usize, + pub(crate) non_current_suggestion_count: usize, + pub(crate) stale_warning_count: usize, + pub(crate) invalid_current_suggestion_count: usize, + pub(crate) untraced_suggestion_count: usize, + pub(crate) unsupported_current_suggestion_count: usize, + pub(crate) tombstone_violation_count: usize, + pub(crate) source_trace_selected_count: usize, + pub(crate) source_trace_dropped_count: usize, + pub(crate) source_trace_stale_count: usize, + pub(crate) source_trace_superseded_count: usize, + pub(crate) source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ScheduledMemoryJobMetrics { + pub(crate) task_run_count: usize, + pub(crate) output_count: usize, + pub(crate) required_task_kind_count: usize, + pub(crate) covered_required_task_kind_count: usize, + pub(crate) missing_required_task_kind_count: usize, + pub(crate) evidence_ref_required_count: usize, + pub(crate) evidence_ref_output_count: usize, + pub(crate) evidence_ref_coverage: f64, + pub(crate) freshness_marker_count: usize, + pub(crate) freshness_coverage: f64, + pub(crate) action_rationale_count: usize, + pub(crate) action_rationale_coverage: f64, + pub(crate) trace_required_count: usize, + pub(crate) trace_complete_count: usize, + pub(crate) trace_coverage: f64, + pub(crate) source_mutation_count: usize, + pub(crate) current_output_count: usize, + pub(crate) non_current_output_count: usize, + pub(crate) invalid_current_output_count: usize, + pub(crate) untraced_output_count: usize, + pub(crate) unsupported_current_output_count: usize, + pub(crate) tombstone_violation_count: usize, + pub(crate) source_trace_selected_count: usize, + pub(crate) source_trace_dropped_count: usize, + pub(crate) source_trace_stale_count: usize, + pub(crate) source_trace_superseded_count: usize, + pub(crate) source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct WorkContinuityJobMetrics { + pub(crate) readback_count: usize, + pub(crate) entry_count: usize, + pub(crate) reset_resume_required_count: usize, + pub(crate) reset_resume_success_count: usize, + pub(crate) reset_resume_success_rate: f64, + pub(crate) decision_rationale_required_count: usize, + pub(crate) decision_rationale_recalled_count: usize, + pub(crate) decision_rationale_recall_rate: f64, + pub(crate) rejected_option_required_count: usize, + pub(crate) rejected_option_suppressed_count: usize, + pub(crate) rejected_option_resurrection_count: usize, + pub(crate) rejected_option_suppression_rate: f64, + pub(crate) explicit_next_step_required_count: usize, + pub(crate) explicit_next_step_returned_count: usize, + pub(crate) explicit_next_step_correct_count: usize, + pub(crate) explicit_next_step_precision: f64, + pub(crate) inferred_next_step_required_count: usize, + pub(crate) inferred_next_step_labeled_count: usize, + pub(crate) inferred_step_instruction_count: usize, + pub(crate) inferred_next_step_labeling_rate: f64, + pub(crate) handoff_source_ref_required_count: usize, + pub(crate) handoff_source_ref_covered_count: usize, + pub(crate) handoff_source_ref_coverage: f64, + pub(crate) redaction_required_count: usize, + pub(crate) redaction_applied_count: usize, + pub(crate) sensitive_marker_persistence_count: usize, + pub(crate) redaction_rate: f64, + pub(crate) janitor_candidate_count: usize, + pub(crate) janitor_false_promotion_count: usize, + pub(crate) janitor_false_promotion_rate: f64, + pub(crate) journal_only_authority_claim_count: usize, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_evolution.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_evolution.rs new file mode 100644 index 00000000..6945daa6 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_evolution.rs @@ -0,0 +1,39 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct EvolutionSummary { + pub(crate) stale_answer_count: usize, + pub(crate) conflict_detection_count: usize, + pub(crate) update_rationale_available_count: usize, + pub(crate) temporal_validity_not_encoded_count: usize, + pub(crate) history_readback_encoded_count: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct EvolutionJobReport { + pub(crate) current_evidence: Vec, + pub(crate) historical_evidence: Vec, + pub(crate) tombstone_evidence: Vec, + pub(crate) invalidation_evidence: Vec, + pub(crate) selected_current_evidence: Vec, + pub(crate) selected_historical_evidence: Vec, + pub(crate) selected_rationale_evidence: Vec, + pub(crate) selected_tombstone_evidence: Vec, + pub(crate) selected_invalidation_evidence: Vec, + pub(crate) conflict_candidate_evidence: Vec, + pub(crate) retrieved_but_dropped_evidence: Vec, + pub(crate) selected_but_not_narrated_evidence: Vec, + pub(crate) stale_trap_ids_used: Vec, + pub(crate) stale_answer_count: usize, + pub(crate) conflict_count: usize, + pub(crate) conflict_detection_count: usize, + pub(crate) update_rationale_available: bool, + pub(crate) temporal_validity_required: bool, + pub(crate) temporal_validity_encoded: bool, + pub(crate) temporal_validity_not_encoded: bool, + pub(crate) history_readback_encoded: bool, + pub(crate) history_event_types: Vec, + pub(crate) history_requires_note_version_links: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) follow_up: Option, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_misc.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_misc.rs new file mode 100644 index 00000000..398460d3 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_misc.rs @@ -0,0 +1,15 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct FollowUpReport { + pub(crate) suite_id: String, + pub(crate) job_id: String, + pub(crate) title: String, + pub(crate) reason: String, +} + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct PrivateCorpusRedaction { + pub(crate) policy: String, + pub(crate) private_fixture_count: usize, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs new file mode 100644 index 00000000..c56e5cf4 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs @@ -0,0 +1,109 @@ +use crate::TypedStatus; + +use super::{ + ConsolidationJobReport, DimensionScoreReport, EvolutionJobReport, KnowledgeJobMetrics, + MemorySummaryJobMetrics, ProactiveBriefJobMetrics, ScheduledMemoryJobMetrics, + UnsupportedClaimReport, WorkContinuityJobMetrics, +}; + +#[derive(Debug)] +pub(crate) struct JobScoring { + pub(crate) status: TypedStatus, + pub(crate) normalized_score: f64, + pub(crate) hard_fail_hits: Vec, + pub(crate) unsupported_claims: Vec, + pub(crate) wrong_result_count: usize, + pub(crate) knowledge: Option, + pub(crate) trap_ids_used: Vec, + pub(crate) dimension_scores: Vec, + pub(crate) reason: String, + pub(crate) evolution: Option, + pub(crate) consolidation: Option, + pub(crate) memory_summary: Option, + pub(crate) proactive_brief: Option, + pub(crate) scheduled_memory: Option, + pub(crate) work_continuity: Option, +} + +#[derive(Debug, Default)] +pub(crate) struct FailureCounts { + pub(crate) missing_claims: usize, + pub(crate) forbidden_claims: usize, + pub(crate) missing_evidence: usize, + pub(crate) trap_uses: usize, + pub(crate) unsupported_claims: usize, + pub(crate) operator_debug_missing: usize, + pub(crate) operator_debug_raw_sql: usize, + pub(crate) operator_debug_trace_gaps: usize, + pub(crate) operator_debug_repair_unclear: usize, + pub(crate) stale_answers: usize, + pub(crate) conflict_detection_missing: usize, + pub(crate) update_rationale_missing: usize, + pub(crate) latency_violations: usize, + pub(crate) proposal_usefulness_failures: usize, + pub(crate) lineage_failures: usize, + pub(crate) review_action_failures: usize, + pub(crate) source_mutations: usize, + pub(crate) blocking_executable_gaps: usize, + pub(crate) memory_summary_invalid_current_entries: usize, + pub(crate) memory_summary_untraced_entries: usize, + pub(crate) memory_summary_missing_freshness: usize, + pub(crate) memory_summary_missing_rationale: usize, + pub(crate) memory_summary_missing_categories: usize, + pub(crate) memory_summary_unsupported_current_entries: usize, + pub(crate) proactive_brief_invalid_current_suggestions: usize, + pub(crate) proactive_brief_untraced_suggestions: usize, + pub(crate) proactive_brief_missing_freshness: usize, + pub(crate) proactive_brief_missing_action_rationale: usize, + pub(crate) proactive_brief_missing_kinds: usize, + pub(crate) proactive_brief_unsupported_current_suggestions: usize, + pub(crate) proactive_brief_tombstone_violations: usize, + pub(crate) scheduled_memory_invalid_current_outputs: usize, + pub(crate) scheduled_memory_untraced_outputs: usize, + pub(crate) scheduled_memory_missing_freshness: usize, + pub(crate) scheduled_memory_missing_action_rationale: usize, + pub(crate) scheduled_memory_missing_task_kinds: usize, + pub(crate) scheduled_memory_unsupported_current_outputs: usize, + pub(crate) scheduled_memory_tombstone_violations: usize, + pub(crate) scheduled_memory_missing_trace: usize, + pub(crate) work_continuity_reset_resume_missing: usize, + pub(crate) work_continuity_decision_rationale_missing: usize, + pub(crate) work_continuity_rejected_option_unsuppressed: usize, + pub(crate) work_continuity_rejected_option_resurrection: usize, + pub(crate) work_continuity_explicit_next_step_missing: usize, + pub(crate) work_continuity_explicit_next_step_extra: usize, + pub(crate) work_continuity_inferred_step_unlabeled: usize, + pub(crate) work_continuity_inferred_step_as_instruction: usize, + pub(crate) work_continuity_handoff_source_ref_missing: usize, + pub(crate) work_continuity_redaction_missing: usize, + pub(crate) work_continuity_sensitive_marker_persistence: usize, + pub(crate) work_continuity_janitor_false_promotion: usize, + pub(crate) work_continuity_journal_only_authority_claim: usize, + pub(crate) untraced_page_sections: usize, + pub(crate) missed_stale_findings: usize, + pub(crate) rebuild_failures: usize, + pub(crate) page_usefulness_failures: usize, +} + +#[derive(Debug, Default)] +pub(crate) struct JobMetrics { + pub(crate) evidence_required_count: usize, + pub(crate) evidence_covered_count: usize, + pub(crate) source_ref_required_count: usize, + pub(crate) source_ref_covered_count: usize, + pub(crate) quote_required_count: usize, + pub(crate) quote_covered_count: usize, + pub(crate) stale_retrieval_count: usize, + pub(crate) scope_check_count: usize, + pub(crate) scope_correct_count: usize, + pub(crate) scope_violation_count: usize, + pub(crate) redaction_leak_count: usize, + pub(crate) qdrant_rebuild_case: bool, +} + +pub(crate) struct ScoreboardRankedMetrics { + pub(crate) relevant_at_k: usize, + pub(crate) precision_denominator_at_k: usize, + pub(crate) reciprocal_rank: f64, + pub(crate) ndcg: f64, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs index bf5aafcc..91b4af55 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs @@ -1,435 +1,23 @@ mod consolidation_reports; +mod job_report_core; +mod job_report_domain_metrics; +mod job_report_evolution; +mod job_report_misc; +mod job_report_scoring; pub(super) use consolidation_reports::{ ConsolidationExecutableGapReport, ConsolidationJobReport, ConsolidationProposalReport, }; - -use crate::{ - AuthorityRecoveryDrillArtifact, CostReport, Deserialize, OperatorDebugEvidence, Serialize, - TraceExplainability, TypedStatus, +pub(super) use job_report_core::{ + DimensionScoreReport, ExpectedEvidenceReport, JobReport, RetrievalQualityReport, + UnsupportedClaimReport, +}; +pub(super) use job_report_domain_metrics::{ + KnowledgeJobMetrics, MemorySummaryJobMetrics, ProactiveBriefJobMetrics, + ScheduledMemoryJobMetrics, WorkContinuityJobMetrics, +}; +pub(super) use job_report_evolution::{EvolutionJobReport, EvolutionSummary}; +pub(super) use job_report_misc::{FollowUpReport, PrivateCorpusRedaction}; +pub(super) use job_report_scoring::{ + FailureCounts, JobMetrics, JobScoring, ScoreboardRankedMetrics, }; - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct JobReport { - pub(super) suite_id: String, - pub(super) job_id: String, - pub(super) title: String, - pub(super) status: TypedStatus, - pub(super) operational_evidence_tier: String, - pub(super) answer_type: String, - pub(super) requires_caveat: bool, - pub(super) requires_refusal: bool, - pub(super) can_answer_unknown: bool, - pub(super) normalized_score: f64, - pub(super) hard_fail_hits: Vec, - pub(super) expected_evidence: Vec, - pub(super) produced_answer: String, - pub(super) produced_evidence: Vec, - pub(super) unsupported_claim_count: usize, - pub(super) wrong_result_count: usize, - #[serde(default)] - pub(super) stale_answer_count: usize, - #[serde(default)] - pub(super) conflict_detection_count: usize, - #[serde(default)] - pub(super) update_rationale_available: bool, - #[serde(default)] - pub(super) temporal_validity_not_encoded: bool, - #[serde(default)] - pub(super) history_readback_encoded: bool, - pub(super) retrieval_quality: RetrievalQualityReport, - pub(super) latency_ms: Option, - pub(super) cost: Option, - pub(super) trace_explainability: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) knowledge: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) memory_summary: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) proactive_brief: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) scheduled_memory: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) work_continuity: Option, - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub(super) recovery_drills: Vec, - pub(super) trap_ids_used: Vec, - pub(super) dimension_scores: Vec, - pub(super) reason: String, - #[serde(default)] - pub(super) evidence_required_count: usize, - #[serde(default)] - pub(super) evidence_covered_count: usize, - #[serde(default)] - pub(super) source_ref_required_count: usize, - #[serde(default)] - pub(super) source_ref_covered_count: usize, - #[serde(default)] - pub(super) quote_required_count: usize, - #[serde(default)] - pub(super) quote_covered_count: usize, - #[serde(default)] - pub(super) stale_retrieval_count: usize, - #[serde(default)] - pub(super) scope_check_count: usize, - #[serde(default)] - pub(super) scope_correct_count: usize, - #[serde(default)] - pub(super) scope_violation_count: usize, - #[serde(default)] - pub(super) redaction_leak_count: usize, - #[serde(default)] - pub(super) qdrant_rebuild_case: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) operator_debug: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) evolution: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) consolidation: Option, -} - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct ExpectedEvidenceReport { - pub(super) evidence_id: String, - pub(super) claim_id: String, - pub(super) requirement: String, -} - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct DimensionScoreReport { - pub(super) dimension: String, - pub(super) score: f64, - pub(super) max_points: f64, - pub(super) weight: f64, -} - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct RetrievalQualityReport { - pub(super) expected_evidence_total: usize, - pub(super) expected_evidence_matched: usize, - pub(super) expected_evidence_recall: f64, - pub(super) produced_evidence_total: usize, - pub(super) irrelevant_context_count: usize, - pub(super) irrelevant_context_ratio: f64, - pub(super) trap_context_count: usize, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct UnsupportedClaimReport { - pub(super) suite_id: String, - pub(super) job_id: String, - pub(super) claim_id: Option, - pub(super) claim_text: String, - pub(super) reason: String, - pub(super) evidence_ids: Vec, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct KnowledgeJobMetrics { - pub(super) page_count: usize, - pub(super) section_count: usize, - pub(super) traced_section_count: usize, - pub(super) flagged_unsupported_section_count: usize, - pub(super) untraced_section_count: usize, - pub(super) unsupported_summary_count: usize, - pub(super) backlink_count: usize, - pub(super) pages_with_backlinks: usize, - pub(super) pages_with_version_diff: usize, - pub(super) stale_trap_count: usize, - pub(super) stale_traps_detected: usize, - pub(super) rebuild_page_count: usize, - pub(super) deterministic_rebuild_count: usize, - pub(super) rebuild_failure_count: usize, - pub(super) allowed_variance_count: usize, - pub(super) citation_coverage: f64, - pub(super) stale_claim_detection: f64, - pub(super) rebuild_determinism: f64, - pub(super) backlink_coverage: f64, - pub(super) version_diff_coverage: f64, - pub(super) page_usefulness: f64, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct MemorySummaryJobMetrics { - pub(super) summary_count: usize, - pub(super) entry_count: usize, - pub(super) required_category_count: usize, - pub(super) covered_required_category_count: usize, - pub(super) missing_required_category_count: usize, - pub(super) top_of_mind_count: usize, - pub(super) background_count: usize, - pub(super) stale_count: usize, - pub(super) superseded_count: usize, - pub(super) tombstone_count: usize, - pub(super) derived_project_profile_count: usize, - pub(super) source_ref_required_count: usize, - pub(super) source_ref_entry_count: usize, - pub(super) source_ref_coverage: f64, - pub(super) freshness_marker_count: usize, - pub(super) freshness_coverage: f64, - pub(super) rationale_count: usize, - pub(super) rationale_coverage: f64, - pub(super) invalid_top_of_mind_count: usize, - pub(super) untraced_entry_count: usize, - pub(super) derived_with_source_or_unsupported_count: usize, - pub(super) derived_missing_source_or_unsupported_count: usize, - pub(super) unsupported_derived_entry_count: usize, - pub(super) unsupported_current_entry_count: usize, - pub(super) tombstone_ref_count: usize, - pub(super) source_trace_selected_count: usize, - pub(super) source_trace_dropped_count: usize, - pub(super) source_trace_stale_count: usize, - pub(super) source_trace_superseded_count: usize, - pub(super) source_trace_tombstone_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ProactiveBriefJobMetrics { - pub(super) brief_count: usize, - pub(super) suggestion_count: usize, - pub(super) required_suggestion_kind_count: usize, - pub(super) covered_required_suggestion_kind_count: usize, - pub(super) missing_required_suggestion_kind_count: usize, - pub(super) evidence_ref_required_count: usize, - pub(super) evidence_ref_suggestion_count: usize, - pub(super) evidence_ref_coverage: f64, - pub(super) freshness_marker_count: usize, - pub(super) freshness_coverage: f64, - pub(super) action_rationale_count: usize, - pub(super) action_rationale_coverage: f64, - pub(super) recommended_count: usize, - pub(super) deferred_count: usize, - pub(super) rejected_count: usize, - pub(super) current_suggestion_count: usize, - pub(super) non_current_suggestion_count: usize, - pub(super) stale_warning_count: usize, - pub(super) invalid_current_suggestion_count: usize, - pub(super) untraced_suggestion_count: usize, - pub(super) unsupported_current_suggestion_count: usize, - pub(super) tombstone_violation_count: usize, - pub(super) source_trace_selected_count: usize, - pub(super) source_trace_dropped_count: usize, - pub(super) source_trace_stale_count: usize, - pub(super) source_trace_superseded_count: usize, - pub(super) source_trace_tombstone_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ScheduledMemoryJobMetrics { - pub(super) task_run_count: usize, - pub(super) output_count: usize, - pub(super) required_task_kind_count: usize, - pub(super) covered_required_task_kind_count: usize, - pub(super) missing_required_task_kind_count: usize, - pub(super) evidence_ref_required_count: usize, - pub(super) evidence_ref_output_count: usize, - pub(super) evidence_ref_coverage: f64, - pub(super) freshness_marker_count: usize, - pub(super) freshness_coverage: f64, - pub(super) action_rationale_count: usize, - pub(super) action_rationale_coverage: f64, - pub(super) trace_required_count: usize, - pub(super) trace_complete_count: usize, - pub(super) trace_coverage: f64, - pub(super) source_mutation_count: usize, - pub(super) current_output_count: usize, - pub(super) non_current_output_count: usize, - pub(super) invalid_current_output_count: usize, - pub(super) untraced_output_count: usize, - pub(super) unsupported_current_output_count: usize, - pub(super) tombstone_violation_count: usize, - pub(super) source_trace_selected_count: usize, - pub(super) source_trace_dropped_count: usize, - pub(super) source_trace_stale_count: usize, - pub(super) source_trace_superseded_count: usize, - pub(super) source_trace_tombstone_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct WorkContinuityJobMetrics { - pub(super) readback_count: usize, - pub(super) entry_count: usize, - pub(super) reset_resume_required_count: usize, - pub(super) reset_resume_success_count: usize, - pub(super) reset_resume_success_rate: f64, - pub(super) decision_rationale_required_count: usize, - pub(super) decision_rationale_recalled_count: usize, - pub(super) decision_rationale_recall_rate: f64, - pub(super) rejected_option_required_count: usize, - pub(super) rejected_option_suppressed_count: usize, - pub(super) rejected_option_resurrection_count: usize, - pub(super) rejected_option_suppression_rate: f64, - pub(super) explicit_next_step_required_count: usize, - pub(super) explicit_next_step_returned_count: usize, - pub(super) explicit_next_step_correct_count: usize, - pub(super) explicit_next_step_precision: f64, - pub(super) inferred_next_step_required_count: usize, - pub(super) inferred_next_step_labeled_count: usize, - pub(super) inferred_step_instruction_count: usize, - pub(super) inferred_next_step_labeling_rate: f64, - pub(super) handoff_source_ref_required_count: usize, - pub(super) handoff_source_ref_covered_count: usize, - pub(super) handoff_source_ref_coverage: f64, - pub(super) redaction_required_count: usize, - pub(super) redaction_applied_count: usize, - pub(super) sensitive_marker_persistence_count: usize, - pub(super) redaction_rate: f64, - pub(super) janitor_candidate_count: usize, - pub(super) janitor_false_promotion_count: usize, - pub(super) janitor_false_promotion_rate: f64, - pub(super) journal_only_authority_claim_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct EvolutionSummary { - pub(super) stale_answer_count: usize, - pub(super) conflict_detection_count: usize, - pub(super) update_rationale_available_count: usize, - pub(super) temporal_validity_not_encoded_count: usize, - pub(super) history_readback_encoded_count: usize, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct EvolutionJobReport { - pub(super) current_evidence: Vec, - pub(super) historical_evidence: Vec, - pub(super) tombstone_evidence: Vec, - pub(super) invalidation_evidence: Vec, - pub(super) selected_current_evidence: Vec, - pub(super) selected_historical_evidence: Vec, - pub(super) selected_rationale_evidence: Vec, - pub(super) selected_tombstone_evidence: Vec, - pub(super) selected_invalidation_evidence: Vec, - pub(super) conflict_candidate_evidence: Vec, - pub(super) retrieved_but_dropped_evidence: Vec, - pub(super) selected_but_not_narrated_evidence: Vec, - pub(super) stale_trap_ids_used: Vec, - pub(super) stale_answer_count: usize, - pub(super) conflict_count: usize, - pub(super) conflict_detection_count: usize, - pub(super) update_rationale_available: bool, - pub(super) temporal_validity_required: bool, - pub(super) temporal_validity_encoded: bool, - pub(super) temporal_validity_not_encoded: bool, - pub(super) history_readback_encoded: bool, - pub(super) history_event_types: Vec, - pub(super) history_requires_note_version_links: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) follow_up: Option, -} - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct FollowUpReport { - pub(super) suite_id: String, - pub(super) job_id: String, - pub(super) title: String, - pub(super) reason: String, -} - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct PrivateCorpusRedaction { - pub(super) policy: String, - pub(super) private_fixture_count: usize, -} - -#[derive(Debug)] -pub(super) struct JobScoring { - pub(super) status: TypedStatus, - pub(super) normalized_score: f64, - pub(super) hard_fail_hits: Vec, - pub(super) unsupported_claims: Vec, - pub(super) wrong_result_count: usize, - pub(super) knowledge: Option, - pub(super) trap_ids_used: Vec, - pub(super) dimension_scores: Vec, - pub(super) reason: String, - pub(super) evolution: Option, - pub(super) consolidation: Option, - pub(super) memory_summary: Option, - pub(super) proactive_brief: Option, - pub(super) scheduled_memory: Option, - pub(super) work_continuity: Option, -} - -#[derive(Debug, Default)] -pub(super) struct FailureCounts { - pub(super) missing_claims: usize, - pub(super) forbidden_claims: usize, - pub(super) missing_evidence: usize, - pub(super) trap_uses: usize, - pub(super) unsupported_claims: usize, - pub(super) operator_debug_missing: usize, - pub(super) operator_debug_raw_sql: usize, - pub(super) operator_debug_trace_gaps: usize, - pub(super) operator_debug_repair_unclear: usize, - pub(super) stale_answers: usize, - pub(super) conflict_detection_missing: usize, - pub(super) update_rationale_missing: usize, - pub(super) latency_violations: usize, - pub(super) proposal_usefulness_failures: usize, - pub(super) lineage_failures: usize, - pub(super) review_action_failures: usize, - pub(super) source_mutations: usize, - pub(super) blocking_executable_gaps: usize, - pub(super) memory_summary_invalid_current_entries: usize, - pub(super) memory_summary_untraced_entries: usize, - pub(super) memory_summary_missing_freshness: usize, - pub(super) memory_summary_missing_rationale: usize, - pub(super) memory_summary_missing_categories: usize, - pub(super) memory_summary_unsupported_current_entries: usize, - pub(super) proactive_brief_invalid_current_suggestions: usize, - pub(super) proactive_brief_untraced_suggestions: usize, - pub(super) proactive_brief_missing_freshness: usize, - pub(super) proactive_brief_missing_action_rationale: usize, - pub(super) proactive_brief_missing_kinds: usize, - pub(super) proactive_brief_unsupported_current_suggestions: usize, - pub(super) proactive_brief_tombstone_violations: usize, - pub(super) scheduled_memory_invalid_current_outputs: usize, - pub(super) scheduled_memory_untraced_outputs: usize, - pub(super) scheduled_memory_missing_freshness: usize, - pub(super) scheduled_memory_missing_action_rationale: usize, - pub(super) scheduled_memory_missing_task_kinds: usize, - pub(super) scheduled_memory_unsupported_current_outputs: usize, - pub(super) scheduled_memory_tombstone_violations: usize, - pub(super) scheduled_memory_missing_trace: usize, - pub(super) work_continuity_reset_resume_missing: usize, - pub(super) work_continuity_decision_rationale_missing: usize, - pub(super) work_continuity_rejected_option_unsuppressed: usize, - pub(super) work_continuity_rejected_option_resurrection: usize, - pub(super) work_continuity_explicit_next_step_missing: usize, - pub(super) work_continuity_explicit_next_step_extra: usize, - pub(super) work_continuity_inferred_step_unlabeled: usize, - pub(super) work_continuity_inferred_step_as_instruction: usize, - pub(super) work_continuity_handoff_source_ref_missing: usize, - pub(super) work_continuity_redaction_missing: usize, - pub(super) work_continuity_sensitive_marker_persistence: usize, - pub(super) work_continuity_janitor_false_promotion: usize, - pub(super) work_continuity_journal_only_authority_claim: usize, - pub(super) untraced_page_sections: usize, - pub(super) missed_stale_findings: usize, - pub(super) rebuild_failures: usize, - pub(super) page_usefulness_failures: usize, -} - -#[derive(Debug, Default)] -pub(super) struct JobMetrics { - pub(super) evidence_required_count: usize, - pub(super) evidence_covered_count: usize, - pub(super) source_ref_required_count: usize, - pub(super) source_ref_covered_count: usize, - pub(super) quote_required_count: usize, - pub(super) quote_covered_count: usize, - pub(super) stale_retrieval_count: usize, - pub(super) scope_check_count: usize, - pub(super) scope_correct_count: usize, - pub(super) scope_violation_count: usize, - pub(super) redaction_leak_count: usize, - pub(super) qdrant_rebuild_case: bool, -} - -pub(super) struct ScoreboardRankedMetrics { - pub(super) relevant_at_k: usize, - pub(super) precision_denominator_at_k: usize, - pub(super) reciprocal_rank: f64, - pub(super) ndcg: f64, -} From 223f56a695bb504972b4195df4f5a4a3d9af0c10 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 30 Jun 2026 11:53:37 +0800 Subject: [PATCH 2/4] {"schema":"decodex/commit/1","summary":"Split summary report contract modules","authority":"manual"} --- .../summary_report_core.rs | 106 +++++++ .../summary_report_domain.rs | 155 ++++++++++ .../summary_report_suite.rs | 25 ++ .../summary_reports.rs | 289 +----------------- 4 files changed, 296 insertions(+), 279 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_domain.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_suite.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs new file mode 100644 index 00000000..b5776d11 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs @@ -0,0 +1,106 @@ +use crate::{CostReport, Deserialize, Serialize}; + +use super::{ + KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, + ScheduledMemorySummaryReport, WorkContinuitySummaryReport, +}; + +#[derive(Debug, Default, Deserialize, Serialize)] +pub(crate) struct ReportSummary { + pub(crate) job_count: usize, + pub(crate) encoded_suite_count: usize, + pub(crate) pass: usize, + pub(crate) wrong_result: usize, + pub(crate) lifecycle_fail: usize, + pub(crate) incomplete: usize, + pub(crate) blocked: usize, + pub(crate) not_encoded: usize, + pub(crate) unsupported_claim: usize, + pub(crate) unsupported_claim_count: usize, + pub(crate) wrong_result_count: usize, + #[serde(default)] + pub(crate) stale_answer_count: usize, + #[serde(default)] + pub(crate) conflict_detection_count: usize, + #[serde(default)] + pub(crate) update_rationale_available_count: usize, + #[serde(default)] + pub(crate) temporal_validity_not_encoded_count: usize, + #[serde(default)] + pub(crate) history_readback_encoded_count: usize, + pub(crate) expected_evidence_total: usize, + pub(crate) expected_evidence_matched: usize, + pub(crate) expected_evidence_recall: f64, + pub(crate) irrelevant_context_count: usize, + pub(crate) irrelevant_context_ratio: f64, + pub(crate) trace_explainability_count: usize, + pub(crate) wrong_result_stage_attribution_count: usize, + pub(crate) mean_score: f64, + pub(crate) mean_latency_ms: Option, + pub(crate) total_cost: Option, + #[serde(default)] + pub(crate) evidence_required_count: usize, + #[serde(default)] + pub(crate) evidence_covered_count: usize, + #[serde(default)] + pub(crate) evidence_coverage: f64, + #[serde(default)] + pub(crate) source_ref_required_count: usize, + #[serde(default)] + pub(crate) source_ref_covered_count: usize, + #[serde(default)] + pub(crate) source_ref_coverage: f64, + #[serde(default)] + pub(crate) quote_required_count: usize, + #[serde(default)] + pub(crate) quote_covered_count: usize, + #[serde(default)] + pub(crate) quote_coverage: f64, + #[serde(default)] + pub(crate) stale_retrieval_count: usize, + #[serde(default)] + pub(crate) scope_check_count: usize, + #[serde(default)] + pub(crate) scope_correct_count: usize, + #[serde(default)] + pub(crate) scope_correctness: f64, + #[serde(default)] + pub(crate) scope_violation_count: usize, + #[serde(default)] + pub(crate) redaction_leak_count: usize, + #[serde(default)] + pub(crate) qdrant_rebuild_case_count: usize, + #[serde(default)] + pub(crate) qdrant_rebuild_pass_count: usize, + #[serde(default)] + pub(crate) operator_debug_job_count: usize, + #[serde(default)] + pub(crate) raw_sql_needed_count: usize, + #[serde(default)] + pub(crate) trace_incomplete_count: usize, + #[serde(default)] + pub(crate) operator_ux_gap_count: usize, + #[serde(default)] + pub(crate) consolidation: ConsolidationSummaryReport, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) memory_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) proactive_brief: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) scheduled_memory: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) work_continuity: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) knowledge: Option, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +pub(crate) struct ConsolidationSummaryReport { + pub(crate) proposal_count: usize, + pub(crate) proposal_usefulness: Option, + pub(crate) lineage_completeness: Option, + pub(crate) review_action_correctness: Option, + pub(crate) source_mutation_count: usize, + pub(crate) proposal_unsupported_claim_count: usize, + pub(crate) executable_gap_count: usize, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_domain.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_domain.rs new file mode 100644 index 00000000..1557d3c9 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_domain.rs @@ -0,0 +1,155 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct MemorySummaryReport { + pub(crate) job_count: usize, + pub(crate) summary_count: usize, + pub(crate) entry_count: usize, + pub(crate) required_category_count: usize, + pub(crate) covered_required_category_count: usize, + pub(crate) missing_required_category_count: usize, + pub(crate) top_of_mind_count: usize, + pub(crate) background_count: usize, + pub(crate) stale_count: usize, + pub(crate) superseded_count: usize, + pub(crate) tombstone_count: usize, + pub(crate) derived_project_profile_count: usize, + pub(crate) source_ref_required_count: usize, + pub(crate) source_ref_entry_count: usize, + pub(crate) source_ref_coverage: f64, + pub(crate) freshness_marker_count: usize, + pub(crate) freshness_coverage: f64, + pub(crate) rationale_count: usize, + pub(crate) rationale_coverage: f64, + pub(crate) invalid_top_of_mind_count: usize, + pub(crate) untraced_entry_count: usize, + pub(crate) derived_with_source_or_unsupported_count: usize, + pub(crate) derived_missing_source_or_unsupported_count: usize, + pub(crate) unsupported_derived_entry_count: usize, + pub(crate) unsupported_current_entry_count: usize, + pub(crate) tombstone_ref_count: usize, + pub(crate) source_trace_selected_count: usize, + pub(crate) source_trace_dropped_count: usize, + pub(crate) source_trace_stale_count: usize, + pub(crate) source_trace_superseded_count: usize, + pub(crate) source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ProactiveBriefSummaryReport { + pub(crate) job_count: usize, + pub(crate) brief_count: usize, + pub(crate) suggestion_count: usize, + pub(crate) required_suggestion_kind_count: usize, + pub(crate) covered_required_suggestion_kind_count: usize, + pub(crate) missing_required_suggestion_kind_count: usize, + pub(crate) evidence_ref_required_count: usize, + pub(crate) evidence_ref_suggestion_count: usize, + pub(crate) evidence_ref_coverage: f64, + pub(crate) freshness_marker_count: usize, + pub(crate) freshness_coverage: f64, + pub(crate) action_rationale_count: usize, + pub(crate) action_rationale_coverage: f64, + pub(crate) recommended_count: usize, + pub(crate) deferred_count: usize, + pub(crate) rejected_count: usize, + pub(crate) current_suggestion_count: usize, + pub(crate) non_current_suggestion_count: usize, + pub(crate) stale_warning_count: usize, + pub(crate) invalid_current_suggestion_count: usize, + pub(crate) untraced_suggestion_count: usize, + pub(crate) unsupported_current_suggestion_count: usize, + pub(crate) tombstone_violation_count: usize, + pub(crate) source_trace_selected_count: usize, + pub(crate) source_trace_dropped_count: usize, + pub(crate) source_trace_stale_count: usize, + pub(crate) source_trace_superseded_count: usize, + pub(crate) source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ScheduledMemorySummaryReport { + pub(crate) job_count: usize, + pub(crate) task_run_count: usize, + pub(crate) output_count: usize, + pub(crate) required_task_kind_count: usize, + pub(crate) covered_required_task_kind_count: usize, + pub(crate) missing_required_task_kind_count: usize, + pub(crate) evidence_ref_required_count: usize, + pub(crate) evidence_ref_output_count: usize, + pub(crate) evidence_ref_coverage: f64, + pub(crate) freshness_marker_count: usize, + pub(crate) freshness_coverage: f64, + pub(crate) action_rationale_count: usize, + pub(crate) action_rationale_coverage: f64, + pub(crate) trace_required_count: usize, + pub(crate) trace_complete_count: usize, + pub(crate) trace_coverage: f64, + pub(crate) source_mutation_count: usize, + pub(crate) current_output_count: usize, + pub(crate) non_current_output_count: usize, + pub(crate) invalid_current_output_count: usize, + pub(crate) untraced_output_count: usize, + pub(crate) unsupported_current_output_count: usize, + pub(crate) tombstone_violation_count: usize, + pub(crate) source_trace_selected_count: usize, + pub(crate) source_trace_dropped_count: usize, + pub(crate) source_trace_stale_count: usize, + pub(crate) source_trace_superseded_count: usize, + pub(crate) source_trace_tombstone_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct WorkContinuitySummaryReport { + pub(crate) job_count: usize, + pub(crate) readback_count: usize, + pub(crate) entry_count: usize, + pub(crate) reset_resume_required_count: usize, + pub(crate) reset_resume_success_count: usize, + pub(crate) reset_resume_success_rate: f64, + pub(crate) decision_rationale_required_count: usize, + pub(crate) decision_rationale_recalled_count: usize, + pub(crate) decision_rationale_recall_rate: f64, + pub(crate) rejected_option_required_count: usize, + pub(crate) rejected_option_suppressed_count: usize, + pub(crate) rejected_option_resurrection_count: usize, + pub(crate) rejected_option_suppression_rate: f64, + pub(crate) explicit_next_step_required_count: usize, + pub(crate) explicit_next_step_returned_count: usize, + pub(crate) explicit_next_step_correct_count: usize, + pub(crate) explicit_next_step_precision: f64, + pub(crate) inferred_next_step_required_count: usize, + pub(crate) inferred_next_step_labeled_count: usize, + pub(crate) inferred_step_instruction_count: usize, + pub(crate) inferred_next_step_labeling_rate: f64, + pub(crate) handoff_source_ref_required_count: usize, + pub(crate) handoff_source_ref_covered_count: usize, + pub(crate) handoff_source_ref_coverage: f64, + pub(crate) redaction_required_count: usize, + pub(crate) redaction_applied_count: usize, + pub(crate) sensitive_marker_persistence_count: usize, + pub(crate) redaction_rate: f64, + pub(crate) janitor_candidate_count: usize, + pub(crate) janitor_false_promotion_count: usize, + pub(crate) janitor_false_promotion_rate: f64, + pub(crate) journal_only_authority_claim_count: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct KnowledgeSummary { + pub(crate) job_count: usize, + pub(crate) page_count: usize, + pub(crate) section_count: usize, + pub(crate) backlink_count: usize, + pub(crate) pages_with_backlinks: usize, + pub(crate) pages_with_version_diff: usize, + pub(crate) citation_coverage: f64, + pub(crate) stale_claim_detection: f64, + pub(crate) rebuild_determinism: f64, + pub(crate) backlink_coverage: f64, + pub(crate) version_diff_coverage: f64, + pub(crate) page_usefulness: f64, + pub(crate) unsupported_summary_count: usize, + pub(crate) untraced_section_count: usize, + pub(crate) allowed_variance_count: usize, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_suite.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_suite.rs new file mode 100644 index 00000000..165771c7 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_suite.rs @@ -0,0 +1,25 @@ +use crate::{Deserialize, Serialize, TypedStatus}; + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct SuiteReport { + pub(crate) suite_id: String, + pub(crate) status: TypedStatus, + pub(crate) encoded_job_count: usize, + pub(crate) score_mean: Option, + pub(crate) unsupported_claim_count: usize, + pub(crate) wrong_result_count: usize, + #[serde(default)] + pub(crate) stale_answer_count: usize, + #[serde(default)] + pub(crate) conflict_detection_count: usize, + #[serde(default)] + pub(crate) update_rationale_available_count: usize, + #[serde(default)] + pub(crate) temporal_validity_not_encoded_count: usize, + #[serde(default)] + pub(crate) history_readback_encoded_count: usize, + pub(crate) expected_evidence_recall: Option, + pub(crate) irrelevant_context_ratio: Option, + pub(crate) trace_explainability_count: usize, + pub(crate) reason: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs index b750ebba..9f5ee6a0 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs @@ -1,279 +1,10 @@ -use crate::{CostReport, Deserialize, Serialize, TypedStatus}; - -#[derive(Debug, Default, Deserialize, Serialize)] -pub(super) struct ReportSummary { - pub(super) job_count: usize, - pub(super) encoded_suite_count: usize, - pub(super) pass: usize, - pub(super) wrong_result: usize, - pub(super) lifecycle_fail: usize, - pub(super) incomplete: usize, - pub(super) blocked: usize, - pub(super) not_encoded: usize, - pub(super) unsupported_claim: usize, - pub(super) unsupported_claim_count: usize, - pub(super) wrong_result_count: usize, - #[serde(default)] - pub(super) stale_answer_count: usize, - #[serde(default)] - pub(super) conflict_detection_count: usize, - #[serde(default)] - pub(super) update_rationale_available_count: usize, - #[serde(default)] - pub(super) temporal_validity_not_encoded_count: usize, - #[serde(default)] - pub(super) history_readback_encoded_count: usize, - pub(super) expected_evidence_total: usize, - pub(super) expected_evidence_matched: usize, - pub(super) expected_evidence_recall: f64, - pub(super) irrelevant_context_count: usize, - pub(super) irrelevant_context_ratio: f64, - pub(super) trace_explainability_count: usize, - pub(super) wrong_result_stage_attribution_count: usize, - pub(super) mean_score: f64, - pub(super) mean_latency_ms: Option, - pub(super) total_cost: Option, - #[serde(default)] - pub(super) evidence_required_count: usize, - #[serde(default)] - pub(super) evidence_covered_count: usize, - #[serde(default)] - pub(super) evidence_coverage: f64, - #[serde(default)] - pub(super) source_ref_required_count: usize, - #[serde(default)] - pub(super) source_ref_covered_count: usize, - #[serde(default)] - pub(super) source_ref_coverage: f64, - #[serde(default)] - pub(super) quote_required_count: usize, - #[serde(default)] - pub(super) quote_covered_count: usize, - #[serde(default)] - pub(super) quote_coverage: f64, - #[serde(default)] - pub(super) stale_retrieval_count: usize, - #[serde(default)] - pub(super) scope_check_count: usize, - #[serde(default)] - pub(super) scope_correct_count: usize, - #[serde(default)] - pub(super) scope_correctness: f64, - #[serde(default)] - pub(super) scope_violation_count: usize, - #[serde(default)] - pub(super) redaction_leak_count: usize, - #[serde(default)] - pub(super) qdrant_rebuild_case_count: usize, - #[serde(default)] - pub(super) qdrant_rebuild_pass_count: usize, - #[serde(default)] - pub(super) operator_debug_job_count: usize, - #[serde(default)] - pub(super) raw_sql_needed_count: usize, - #[serde(default)] - pub(super) trace_incomplete_count: usize, - #[serde(default)] - pub(super) operator_ux_gap_count: usize, - #[serde(default)] - pub(super) consolidation: ConsolidationSummaryReport, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) memory_summary: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) proactive_brief: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) scheduled_memory: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) work_continuity: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) knowledge: Option, -} - -#[derive(Debug, Default, Deserialize, Serialize)] -pub(super) struct ConsolidationSummaryReport { - pub(super) proposal_count: usize, - pub(super) proposal_usefulness: Option, - pub(super) lineage_completeness: Option, - pub(super) review_action_correctness: Option, - pub(super) source_mutation_count: usize, - pub(super) proposal_unsupported_claim_count: usize, - pub(super) executable_gap_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct MemorySummaryReport { - pub(super) job_count: usize, - pub(super) summary_count: usize, - pub(super) entry_count: usize, - pub(super) required_category_count: usize, - pub(super) covered_required_category_count: usize, - pub(super) missing_required_category_count: usize, - pub(super) top_of_mind_count: usize, - pub(super) background_count: usize, - pub(super) stale_count: usize, - pub(super) superseded_count: usize, - pub(super) tombstone_count: usize, - pub(super) derived_project_profile_count: usize, - pub(super) source_ref_required_count: usize, - pub(super) source_ref_entry_count: usize, - pub(super) source_ref_coverage: f64, - pub(super) freshness_marker_count: usize, - pub(super) freshness_coverage: f64, - pub(super) rationale_count: usize, - pub(super) rationale_coverage: f64, - pub(super) invalid_top_of_mind_count: usize, - pub(super) untraced_entry_count: usize, - pub(super) derived_with_source_or_unsupported_count: usize, - pub(super) derived_missing_source_or_unsupported_count: usize, - pub(super) unsupported_derived_entry_count: usize, - pub(super) unsupported_current_entry_count: usize, - pub(super) tombstone_ref_count: usize, - pub(super) source_trace_selected_count: usize, - pub(super) source_trace_dropped_count: usize, - pub(super) source_trace_stale_count: usize, - pub(super) source_trace_superseded_count: usize, - pub(super) source_trace_tombstone_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ProactiveBriefSummaryReport { - pub(super) job_count: usize, - pub(super) brief_count: usize, - pub(super) suggestion_count: usize, - pub(super) required_suggestion_kind_count: usize, - pub(super) covered_required_suggestion_kind_count: usize, - pub(super) missing_required_suggestion_kind_count: usize, - pub(super) evidence_ref_required_count: usize, - pub(super) evidence_ref_suggestion_count: usize, - pub(super) evidence_ref_coverage: f64, - pub(super) freshness_marker_count: usize, - pub(super) freshness_coverage: f64, - pub(super) action_rationale_count: usize, - pub(super) action_rationale_coverage: f64, - pub(super) recommended_count: usize, - pub(super) deferred_count: usize, - pub(super) rejected_count: usize, - pub(super) current_suggestion_count: usize, - pub(super) non_current_suggestion_count: usize, - pub(super) stale_warning_count: usize, - pub(super) invalid_current_suggestion_count: usize, - pub(super) untraced_suggestion_count: usize, - pub(super) unsupported_current_suggestion_count: usize, - pub(super) tombstone_violation_count: usize, - pub(super) source_trace_selected_count: usize, - pub(super) source_trace_dropped_count: usize, - pub(super) source_trace_stale_count: usize, - pub(super) source_trace_superseded_count: usize, - pub(super) source_trace_tombstone_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ScheduledMemorySummaryReport { - pub(super) job_count: usize, - pub(super) task_run_count: usize, - pub(super) output_count: usize, - pub(super) required_task_kind_count: usize, - pub(super) covered_required_task_kind_count: usize, - pub(super) missing_required_task_kind_count: usize, - pub(super) evidence_ref_required_count: usize, - pub(super) evidence_ref_output_count: usize, - pub(super) evidence_ref_coverage: f64, - pub(super) freshness_marker_count: usize, - pub(super) freshness_coverage: f64, - pub(super) action_rationale_count: usize, - pub(super) action_rationale_coverage: f64, - pub(super) trace_required_count: usize, - pub(super) trace_complete_count: usize, - pub(super) trace_coverage: f64, - pub(super) source_mutation_count: usize, - pub(super) current_output_count: usize, - pub(super) non_current_output_count: usize, - pub(super) invalid_current_output_count: usize, - pub(super) untraced_output_count: usize, - pub(super) unsupported_current_output_count: usize, - pub(super) tombstone_violation_count: usize, - pub(super) source_trace_selected_count: usize, - pub(super) source_trace_dropped_count: usize, - pub(super) source_trace_stale_count: usize, - pub(super) source_trace_superseded_count: usize, - pub(super) source_trace_tombstone_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct WorkContinuitySummaryReport { - pub(super) job_count: usize, - pub(super) readback_count: usize, - pub(super) entry_count: usize, - pub(super) reset_resume_required_count: usize, - pub(super) reset_resume_success_count: usize, - pub(super) reset_resume_success_rate: f64, - pub(super) decision_rationale_required_count: usize, - pub(super) decision_rationale_recalled_count: usize, - pub(super) decision_rationale_recall_rate: f64, - pub(super) rejected_option_required_count: usize, - pub(super) rejected_option_suppressed_count: usize, - pub(super) rejected_option_resurrection_count: usize, - pub(super) rejected_option_suppression_rate: f64, - pub(super) explicit_next_step_required_count: usize, - pub(super) explicit_next_step_returned_count: usize, - pub(super) explicit_next_step_correct_count: usize, - pub(super) explicit_next_step_precision: f64, - pub(super) inferred_next_step_required_count: usize, - pub(super) inferred_next_step_labeled_count: usize, - pub(super) inferred_step_instruction_count: usize, - pub(super) inferred_next_step_labeling_rate: f64, - pub(super) handoff_source_ref_required_count: usize, - pub(super) handoff_source_ref_covered_count: usize, - pub(super) handoff_source_ref_coverage: f64, - pub(super) redaction_required_count: usize, - pub(super) redaction_applied_count: usize, - pub(super) sensitive_marker_persistence_count: usize, - pub(super) redaction_rate: f64, - pub(super) janitor_candidate_count: usize, - pub(super) janitor_false_promotion_count: usize, - pub(super) janitor_false_promotion_rate: f64, - pub(super) journal_only_authority_claim_count: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct KnowledgeSummary { - pub(super) job_count: usize, - pub(super) page_count: usize, - pub(super) section_count: usize, - pub(super) backlink_count: usize, - pub(super) pages_with_backlinks: usize, - pub(super) pages_with_version_diff: usize, - pub(super) citation_coverage: f64, - pub(super) stale_claim_detection: f64, - pub(super) rebuild_determinism: f64, - pub(super) backlink_coverage: f64, - pub(super) version_diff_coverage: f64, - pub(super) page_usefulness: f64, - pub(super) unsupported_summary_count: usize, - pub(super) untraced_section_count: usize, - pub(super) allowed_variance_count: usize, -} - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct SuiteReport { - pub(super) suite_id: String, - pub(super) status: TypedStatus, - pub(super) encoded_job_count: usize, - pub(super) score_mean: Option, - pub(super) unsupported_claim_count: usize, - pub(super) wrong_result_count: usize, - #[serde(default)] - pub(super) stale_answer_count: usize, - #[serde(default)] - pub(super) conflict_detection_count: usize, - #[serde(default)] - pub(super) update_rationale_available_count: usize, - #[serde(default)] - pub(super) temporal_validity_not_encoded_count: usize, - #[serde(default)] - pub(super) history_readback_encoded_count: usize, - pub(super) expected_evidence_recall: Option, - pub(super) irrelevant_context_ratio: Option, - pub(super) trace_explainability_count: usize, - pub(super) reason: String, -} +mod summary_report_core; +mod summary_report_domain; +mod summary_report_suite; + +pub(super) use summary_report_core::{ConsolidationSummaryReport, ReportSummary}; +pub(super) use summary_report_domain::{ + KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, + ScheduledMemorySummaryReport, WorkContinuitySummaryReport, +}; +pub(super) use summary_report_suite::SuiteReport; From 4b4445f507d0203056136d81e260dc2015d26468 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 30 Jun 2026 12:25:21 +0800 Subject: [PATCH 3/4] {"schema":"decodex/commit/1","summary":"Split external adapter report contract modules","authority":"manual"} --- .../external_adapter_detail_reports.rs | 100 ++++++++ .../external_adapter_manifest_reports.rs | 33 +++ .../external_adapter_misc_reports.rs | 27 +++ .../external_adapter_reports.rs | 227 ++---------------- .../external_adapter_summary_reports.rs | 54 +++++ 5 files changed, 233 insertions(+), 208 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_detail_reports.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_misc_reports.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_summary_reports.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_detail_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_detail_reports.rs new file mode 100644 index 00000000..8ddffe51 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_detail_reports.rs @@ -0,0 +1,100 @@ +use crate::{ + AdapterCoverageStatus, Deserialize, ElfScenarioPosition, FollowUpInput, + ScenarioComparisonOutcome, Serialize, +}; + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct ExternalAdapterReport { + pub(crate) adapter_id: String, + pub(crate) project: String, + pub(crate) adapter_kind: String, + pub(crate) evidence_class: String, + pub(crate) docker_default: bool, + pub(crate) host_global_installs_required: bool, + pub(crate) overall_status: AdapterCoverageStatus, + pub(crate) setup: AdapterExecutionEvidence, + pub(crate) run: AdapterExecutionEvidence, + pub(crate) result: AdapterExecutionEvidence, + #[serde(default)] + pub(crate) capabilities: Vec, + #[serde(default)] + pub(crate) suites: Vec, + #[serde(default)] + pub(crate) scenarios: Vec, + #[serde(default)] + pub(crate) evidence: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) execution_metadata: Option, + #[serde(default)] + pub(crate) notes: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) follow_up: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterExecutionEvidence { + pub(crate) status: AdapterCoverageStatus, + pub(crate) evidence: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) artifact: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterCapabilityCoverage { + pub(crate) capability: String, + pub(crate) status: AdapterCoverageStatus, + pub(crate) evidence: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterSuiteCoverage { + pub(crate) suite_id: String, + pub(crate) status: AdapterCoverageStatus, + pub(crate) evidence: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterScenarioJudgment { + pub(crate) scenario_id: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) suite_id: Option, + pub(crate) status: AdapterCoverageStatus, + pub(crate) elf_position: ElfScenarioPosition, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) comparison_outcome: Option, + pub(crate) evidence: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) command: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) artifact: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterEvidencePointer { + pub(crate) kind: String, + #[serde(rename = "ref")] + pub(crate) reference: String, + pub(crate) status: AdapterCoverageStatus, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterExecutionMetadata { + #[serde(default)] + pub(crate) sources: Vec, + pub(crate) setup_path: String, + pub(crate) runtime_boundary: String, + pub(crate) resource_expectation: String, + #[serde(default)] + pub(crate) retry_guidance: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) research_depth: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct AdapterSource { + pub(crate) label: String, + pub(crate) url: String, + pub(crate) evidence: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs new file mode 100644 index 00000000..68050aec --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs @@ -0,0 +1,33 @@ +use crate::{Deserialize, Serialize}; + +use super::{ExternalAdapterReport, ExternalAdapterSummary}; + +#[derive(Debug, Deserialize)] +pub(crate) struct ExternalAdapterManifest { + pub(crate) schema: String, + pub(crate) manifest_id: String, + pub(crate) docker_isolation: ExternalDockerIsolation, + #[serde(default)] + pub(crate) adapters: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ExternalAdapterSection { + pub(crate) schema: String, + pub(crate) manifest_id: String, + pub(crate) docker_isolation: ExternalDockerIsolation, + pub(crate) summary: ExternalAdapterSummary, + #[serde(default)] + pub(crate) adapters: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ExternalDockerIsolation { + pub(crate) default: bool, + pub(crate) compose_file: String, + pub(crate) runner: String, + pub(crate) artifact_dir: String, + pub(crate) host_global_installs_required: bool, + #[serde(default)] + pub(crate) notes: Vec, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_misc_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_misc_reports.rs new file mode 100644 index 00000000..526ec8eb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_misc_reports.rs @@ -0,0 +1,27 @@ +use crate::{Deserialize, Serialize, TypedStatus}; + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct AdapterReport { + pub(crate) adapter_id: String, + pub(crate) name: String, + pub(crate) behavior: String, + pub(crate) storage: TypedStatus, + pub(crate) runtime: TypedStatus, + pub(crate) notes: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct CaptureIntegrationReport { + #[serde(default)] + pub(crate) real: Vec, + #[serde(default)] + pub(crate) fixture_backed: Vec, + #[serde(default)] + pub(crate) mocked: Vec, + #[serde(default)] + pub(crate) blocked: Vec, + #[serde(default)] + pub(crate) not_encoded: Vec, + #[serde(default)] + pub(crate) notes: Vec, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs index 3366383e..d9e86609 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs @@ -1,209 +1,20 @@ -use crate::{ - AdapterCoverageStatus, Deserialize, ElfScenarioPosition, FollowUpInput, - ScenarioComparisonOutcome, Serialize, TypedStatus, +mod external_adapter_detail_reports; +mod external_adapter_manifest_reports; +mod external_adapter_misc_reports; +mod external_adapter_summary_reports; + +#[allow(unused_imports)] +pub(super) use external_adapter_detail_reports::{ + AdapterCapabilityCoverage, AdapterEvidencePointer, AdapterExecutionEvidence, + AdapterExecutionMetadata, +}; +pub(super) use external_adapter_detail_reports::{ + AdapterScenarioJudgment, AdapterSource, AdapterSuiteCoverage, ExternalAdapterReport, +}; +pub(super) use external_adapter_manifest_reports::{ + ExternalAdapterManifest, ExternalAdapterSection, ExternalDockerIsolation, +}; +pub(super) use external_adapter_misc_reports::{AdapterReport, CaptureIntegrationReport}; +pub(super) use external_adapter_summary_reports::{ + AdapterStatusCounts, ExternalAdapterSummary, ScenarioOutcomeCounts, ScenarioPositionCounts, }; - -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct AdapterReport { - pub(super) adapter_id: String, - pub(super) name: String, - pub(super) behavior: String, - pub(super) storage: TypedStatus, - pub(super) runtime: TypedStatus, - pub(super) notes: String, -} - -#[derive(Debug, Deserialize)] -pub(super) struct ExternalAdapterManifest { - pub(super) schema: String, - pub(super) manifest_id: String, - pub(super) docker_isolation: ExternalDockerIsolation, - #[serde(default)] - pub(super) adapters: Vec, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ExternalAdapterSection { - pub(super) schema: String, - pub(super) manifest_id: String, - pub(super) docker_isolation: ExternalDockerIsolation, - pub(super) summary: ExternalAdapterSummary, - #[serde(default)] - pub(super) adapters: Vec, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ExternalDockerIsolation { - pub(super) default: bool, - pub(super) compose_file: String, - pub(super) runner: String, - pub(super) artifact_dir: String, - pub(super) host_global_installs_required: bool, - #[serde(default)] - pub(super) notes: Vec, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct ExternalAdapterReport { - pub(super) adapter_id: String, - pub(super) project: String, - pub(super) adapter_kind: String, - pub(super) evidence_class: String, - pub(super) docker_default: bool, - pub(super) host_global_installs_required: bool, - pub(super) overall_status: AdapterCoverageStatus, - pub(super) setup: AdapterExecutionEvidence, - pub(super) run: AdapterExecutionEvidence, - pub(super) result: AdapterExecutionEvidence, - #[serde(default)] - pub(super) capabilities: Vec, - #[serde(default)] - pub(super) suites: Vec, - #[serde(default)] - pub(super) scenarios: Vec, - #[serde(default)] - pub(super) evidence: Vec, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) execution_metadata: Option, - #[serde(default)] - pub(super) notes: Vec, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) follow_up: Option, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterExecutionEvidence { - pub(super) status: AdapterCoverageStatus, - pub(super) evidence: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) command: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) artifact: Option, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterCapabilityCoverage { - pub(super) capability: String, - pub(super) status: AdapterCoverageStatus, - pub(super) evidence: String, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterSuiteCoverage { - pub(super) suite_id: String, - pub(super) status: AdapterCoverageStatus, - pub(super) evidence: String, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterScenarioJudgment { - pub(super) scenario_id: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) suite_id: Option, - pub(super) status: AdapterCoverageStatus, - pub(super) elf_position: ElfScenarioPosition, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) comparison_outcome: Option, - pub(super) evidence: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) command: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) artifact: Option, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterEvidencePointer { - pub(super) kind: String, - #[serde(rename = "ref")] - pub(super) reference: String, - pub(super) status: AdapterCoverageStatus, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterExecutionMetadata { - #[serde(default)] - pub(super) sources: Vec, - pub(super) setup_path: String, - pub(super) runtime_boundary: String, - pub(super) resource_expectation: String, - #[serde(default)] - pub(super) retry_guidance: Vec, - #[serde(skip_serializing_if = "Option::is_none")] - pub(super) research_depth: Option, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct AdapterSource { - pub(super) label: String, - pub(super) url: String, - pub(super) evidence: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ExternalAdapterSummary { - pub(super) adapter_count: usize, - pub(super) external_project_count: usize, - pub(super) docker_default_count: usize, - pub(super) host_global_install_required_count: usize, - pub(super) fixture_backed_count: usize, - pub(super) live_baseline_only_count: usize, - pub(super) live_real_world_count: usize, - #[serde(default)] - pub(super) research_gate_count: usize, - pub(super) overall_status_counts: AdapterStatusCounts, - pub(super) capability_status_counts: AdapterStatusCounts, - pub(super) suite_status_counts: AdapterStatusCounts, - #[serde(default)] - pub(super) scenario_status_counts: AdapterStatusCounts, - #[serde(default)] - pub(super) scenario_position_counts: ScenarioPositionCounts, - #[serde(default)] - pub(super) scenario_outcome_counts: ScenarioOutcomeCounts, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct AdapterStatusCounts { - pub(super) real: usize, - pub(super) mocked: usize, - pub(super) unsupported: usize, - pub(super) blocked: usize, - pub(super) incomplete: usize, - pub(super) wrong_result: usize, - pub(super) lifecycle_fail: usize, - pub(super) pass: usize, - pub(super) not_encoded: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ScenarioPositionCounts { - pub(super) wins: usize, - pub(super) ties: usize, - pub(super) loses: usize, - pub(super) untested: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct ScenarioOutcomeCounts { - pub(super) win: usize, - pub(super) tie: usize, - pub(super) loss: usize, - pub(super) not_tested: usize, - pub(super) blocked: usize, - pub(super) non_goal: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct CaptureIntegrationReport { - #[serde(default)] - pub(super) real: Vec, - #[serde(default)] - pub(super) fixture_backed: Vec, - #[serde(default)] - pub(super) mocked: Vec, - #[serde(default)] - pub(super) blocked: Vec, - #[serde(default)] - pub(super) not_encoded: Vec, - #[serde(default)] - pub(super) notes: Vec, -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_summary_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_summary_reports.rs new file mode 100644 index 00000000..5f1b987b --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_summary_reports.rs @@ -0,0 +1,54 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ExternalAdapterSummary { + pub(crate) adapter_count: usize, + pub(crate) external_project_count: usize, + pub(crate) docker_default_count: usize, + pub(crate) host_global_install_required_count: usize, + pub(crate) fixture_backed_count: usize, + pub(crate) live_baseline_only_count: usize, + pub(crate) live_real_world_count: usize, + #[serde(default)] + pub(crate) research_gate_count: usize, + pub(crate) overall_status_counts: AdapterStatusCounts, + pub(crate) capability_status_counts: AdapterStatusCounts, + pub(crate) suite_status_counts: AdapterStatusCounts, + #[serde(default)] + pub(crate) scenario_status_counts: AdapterStatusCounts, + #[serde(default)] + pub(crate) scenario_position_counts: ScenarioPositionCounts, + #[serde(default)] + pub(crate) scenario_outcome_counts: ScenarioOutcomeCounts, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct AdapterStatusCounts { + pub(crate) real: usize, + pub(crate) mocked: usize, + pub(crate) unsupported: usize, + pub(crate) blocked: usize, + pub(crate) incomplete: usize, + pub(crate) wrong_result: usize, + pub(crate) lifecycle_fail: usize, + pub(crate) pass: usize, + pub(crate) not_encoded: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ScenarioPositionCounts { + pub(crate) wins: usize, + pub(crate) ties: usize, + pub(crate) loses: usize, + pub(crate) untested: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct ScenarioOutcomeCounts { + pub(crate) win: usize, + pub(crate) tie: usize, + pub(crate) loss: usize, + pub(crate) not_tested: usize, + pub(crate) blocked: usize, + pub(crate) non_goal: usize, +} From f659998ecd74f407c8887cfb43745381ac709ab3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Tue, 30 Jun 2026 13:07:48 +0800 Subject: [PATCH 4/4] {"schema":"decodex/commit/1","summary":"Satisfy report contract import style checks","authority":"manual"} --- .../external_adapter_manifest_reports.rs | 7 +++-- .../external_adapter_reports.rs | 22 +++++++------- .../job_report_core.rs | 9 +++--- .../job_report_scoring.rs | 13 ++++---- .../real_world_job_benchmark/job_reports.rs | 30 +++++++++---------- .../summary_report_core.rs | 11 +++---- .../summary_reports.rs | 12 ++++---- 7 files changed, 55 insertions(+), 49 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs index 68050aec..d533c81b 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs @@ -1,6 +1,7 @@ -use crate::{Deserialize, Serialize}; - -use super::{ExternalAdapterReport, ExternalAdapterSummary}; +use crate::{ + Deserialize, Serialize, + external_adapter_reports::{ExternalAdapterReport, ExternalAdapterSummary}, +}; #[derive(Debug, Deserialize)] pub(crate) struct ExternalAdapterManifest { diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs index d9e86609..af730ae2 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports.rs @@ -3,18 +3,20 @@ mod external_adapter_manifest_reports; mod external_adapter_misc_reports; mod external_adapter_summary_reports; +pub(super) use self::{ + external_adapter_detail_reports::{ + AdapterScenarioJudgment, AdapterSource, AdapterSuiteCoverage, ExternalAdapterReport, + }, + external_adapter_manifest_reports::{ + ExternalAdapterManifest, ExternalAdapterSection, ExternalDockerIsolation, + }, + external_adapter_misc_reports::{AdapterReport, CaptureIntegrationReport}, + external_adapter_summary_reports::{ + AdapterStatusCounts, ExternalAdapterSummary, ScenarioOutcomeCounts, ScenarioPositionCounts, + }, +}; #[allow(unused_imports)] pub(super) use external_adapter_detail_reports::{ AdapterCapabilityCoverage, AdapterEvidencePointer, AdapterExecutionEvidence, AdapterExecutionMetadata, }; -pub(super) use external_adapter_detail_reports::{ - AdapterScenarioJudgment, AdapterSource, AdapterSuiteCoverage, ExternalAdapterReport, -}; -pub(super) use external_adapter_manifest_reports::{ - ExternalAdapterManifest, ExternalAdapterSection, ExternalDockerIsolation, -}; -pub(super) use external_adapter_misc_reports::{AdapterReport, CaptureIntegrationReport}; -pub(super) use external_adapter_summary_reports::{ - AdapterStatusCounts, ExternalAdapterSummary, ScenarioOutcomeCounts, ScenarioPositionCounts, -}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs index d119db7f..95326fa8 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs @@ -1,11 +1,10 @@ use crate::{ AuthorityRecoveryDrillArtifact, CostReport, Deserialize, OperatorDebugEvidence, Serialize, TraceExplainability, TypedStatus, -}; - -use super::{ - ConsolidationJobReport, EvolutionJobReport, KnowledgeJobMetrics, MemorySummaryJobMetrics, - ProactiveBriefJobMetrics, ScheduledMemoryJobMetrics, WorkContinuityJobMetrics, + job_reports::{ + ConsolidationJobReport, EvolutionJobReport, KnowledgeJobMetrics, MemorySummaryJobMetrics, + ProactiveBriefJobMetrics, ScheduledMemoryJobMetrics, WorkContinuityJobMetrics, + }, }; #[derive(Debug, Deserialize, Serialize)] diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs index c56e5cf4..f4b95c9f 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs @@ -1,9 +1,10 @@ -use crate::TypedStatus; - -use super::{ - ConsolidationJobReport, DimensionScoreReport, EvolutionJobReport, KnowledgeJobMetrics, - MemorySummaryJobMetrics, ProactiveBriefJobMetrics, ScheduledMemoryJobMetrics, - UnsupportedClaimReport, WorkContinuityJobMetrics, +use crate::{ + TypedStatus, + job_reports::{ + ConsolidationJobReport, DimensionScoreReport, EvolutionJobReport, KnowledgeJobMetrics, + MemorySummaryJobMetrics, ProactiveBriefJobMetrics, ScheduledMemoryJobMetrics, + UnsupportedClaimReport, WorkContinuityJobMetrics, + }, }; #[derive(Debug)] diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs index 91b4af55..713a44ba 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports.rs @@ -5,19 +5,19 @@ mod job_report_evolution; mod job_report_misc; mod job_report_scoring; -pub(super) use consolidation_reports::{ - ConsolidationExecutableGapReport, ConsolidationJobReport, ConsolidationProposalReport, -}; -pub(super) use job_report_core::{ - DimensionScoreReport, ExpectedEvidenceReport, JobReport, RetrievalQualityReport, - UnsupportedClaimReport, -}; -pub(super) use job_report_domain_metrics::{ - KnowledgeJobMetrics, MemorySummaryJobMetrics, ProactiveBriefJobMetrics, - ScheduledMemoryJobMetrics, WorkContinuityJobMetrics, -}; -pub(super) use job_report_evolution::{EvolutionJobReport, EvolutionSummary}; -pub(super) use job_report_misc::{FollowUpReport, PrivateCorpusRedaction}; -pub(super) use job_report_scoring::{ - FailureCounts, JobMetrics, JobScoring, ScoreboardRankedMetrics, +pub(super) use self::{ + consolidation_reports::{ + ConsolidationExecutableGapReport, ConsolidationJobReport, ConsolidationProposalReport, + }, + job_report_core::{ + DimensionScoreReport, ExpectedEvidenceReport, JobReport, RetrievalQualityReport, + UnsupportedClaimReport, + }, + job_report_domain_metrics::{ + KnowledgeJobMetrics, MemorySummaryJobMetrics, ProactiveBriefJobMetrics, + ScheduledMemoryJobMetrics, WorkContinuityJobMetrics, + }, + job_report_evolution::{EvolutionJobReport, EvolutionSummary}, + job_report_misc::{FollowUpReport, PrivateCorpusRedaction}, + job_report_scoring::{FailureCounts, JobMetrics, JobScoring, ScoreboardRankedMetrics}, }; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs index b5776d11..dea563fa 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs @@ -1,8 +1,9 @@ -use crate::{CostReport, Deserialize, Serialize}; - -use super::{ - KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, - ScheduledMemorySummaryReport, WorkContinuitySummaryReport, +use crate::{ + CostReport, Deserialize, Serialize, + summary_reports::{ + KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, + ScheduledMemorySummaryReport, WorkContinuitySummaryReport, + }, }; #[derive(Debug, Default, Deserialize, Serialize)] diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs index 9f5ee6a0..2e9c194d 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports.rs @@ -2,9 +2,11 @@ mod summary_report_core; mod summary_report_domain; mod summary_report_suite; -pub(super) use summary_report_core::{ConsolidationSummaryReport, ReportSummary}; -pub(super) use summary_report_domain::{ - KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, - ScheduledMemorySummaryReport, WorkContinuitySummaryReport, +pub(super) use self::{ + summary_report_core::{ConsolidationSummaryReport, ReportSummary}, + summary_report_domain::{ + KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, + ScheduledMemorySummaryReport, WorkContinuitySummaryReport, + }, + summary_report_suite::SuiteReport, }; -pub(super) use summary_report_suite::SuiteReport;