From 304c4471f12911796056c3758509d1f39acb43be Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 22:52:23 +0200 Subject: [PATCH 01/37] feat: add flaw_tracer registry with full pipeline DAG mapping Static DAG registry mapping all 48 PlanExe pipeline stages to their output files, upstream dependencies, and source code paths. Includes lookup functions (find_stage_by_filename, get_upstream_files, get_source_code_paths) and 14 passing tests. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/__init__.py | 1 + .../flaw_tracer/registry.py | 781 ++++++++++++++++++ .../flaw_tracer/tests/__init__.py | 0 .../flaw_tracer/tests/test_registry.py | 109 +++ 4 files changed, 891 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/__init__.py create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/registry.py create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py diff --git a/worker_plan/worker_plan_internal/flaw_tracer/__init__.py b/worker_plan/worker_plan_internal/flaw_tracer/__init__.py new file mode 100644 index 000000000..e6ca3af64 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/__init__.py @@ -0,0 +1 @@ +"""Flaw Tracer — Root-cause analysis for PlanExe reports.""" diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py new file mode 100644 index 000000000..7fb3c9635 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -0,0 +1,781 @@ +# worker_plan/worker_plan_internal/flaw_tracer/registry.py +"""Static DAG mapping for the PlanExe pipeline. + +Maps every pipeline stage to its output files, upstream dependencies, +and source code files. Derived from the Luigi task classes in +worker_plan_internal/plan/stages/. 
+"""
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+# Absolute path of the worker_plan/ directory (three parents above this file); source_code_files entries below are written relative to it.
+_SOURCE_BASE = Path(__file__).resolve().parent.parent.parent  # worker_plan/
+
+
+@dataclass(frozen=True)  # NOTE: frozen only blocks attribute rebinding; the list fields stay mutable in place, and hash() on an instance raises TypeError because lists are unhashable
+class StageInfo:
+    """One pipeline stage: its name, its output files, its upstream dependencies, and its source code files."""
+    name: str  # identifier that other stages list in their upstream_stages
+    output_files: list[str]  # output file names of this stage
+    primary_output: str  # preferred file to read when checking for flaws; an element of output_files in every entry visible here
+    upstream_stages: list[str] = field(default_factory=list)  # names of the stages this stage depends on; empty for root stages
+    source_code_files: list[str] = field(default_factory=list)  # paths beginning with worker_plan_internal/, i.e. relative to worker_plan/
+
+
+# ── Complete pipeline registry ──────────────────────────────────────────
+
+STAGES: list[StageInfo] = [
+    # Phase 1: Initialization
+    StageInfo(
+        name="start_time",
+        output_files=["001-1-start_time.json"],
+        primary_output="001-1-start_time.json",
+        upstream_stages=[],
+        source_code_files=["worker_plan_internal/plan/stages/start_time.py"],
+    ),
+    StageInfo(
+        name="setup",
+        output_files=["001-2-plan.txt"],
+        primary_output="001-2-plan.txt",
+        upstream_stages=[],
+        source_code_files=["worker_plan_internal/plan/stages/setup.py"],
+    ),
+    # Phase 2: Input Validation & Strategy
+    StageInfo(
+        name="screen_planning_prompt",
+        output_files=["002-0-screen_planning_prompt.json", "002-0-screen_planning_prompt.md"],
+        primary_output="002-0-screen_planning_prompt.md",
+        upstream_stages=["setup"],
+        source_code_files=[
+            "worker_plan_internal/plan/stages/screen_planning_prompt.py",
+            "worker_plan_internal/diagnostics/screen_planning_prompt.py",
+        ],
+    ),
+    StageInfo(
+        name="extract_constraints",
+        output_files=["002-0-extract_constraints_raw.json", "002-0-extract_constraints.md"],
+        primary_output="002-0-extract_constraints.md",
+        upstream_stages=["setup"],
+        source_code_files=[
+            "worker_plan_internal/plan/stages/extract_constraints.py",
+            "worker_plan_internal/diagnostics/extract_constraints.py",
+        ],
+    ),
+    StageInfo(
+        name="redline_gate",
+        output_files=["002-1-redline_gate.json", "002-2-redline_gate.md"],
primary_output="002-2-redline_gate.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/redline_gate.py", + "worker_plan_internal/diagnostics/redline_gate.py", + ], + ), + StageInfo( + name="premise_attack", + output_files=["002-3-premise_attack.json", "002-4-premise_attack.md"], + primary_output="002-4-premise_attack.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/premise_attack.py", + "worker_plan_internal/diagnostics/premise_attack.py", + ], + ), + StageInfo( + name="identify_purpose", + output_files=["002-5-identify_purpose_raw.json", "002-6-identify_purpose.md"], + primary_output="002-6-identify_purpose.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/identify_purpose.py", + "worker_plan_internal/assume/identify_purpose.py", + ], + ), + StageInfo( + name="plan_type", + output_files=["002-7-plan_type_raw.json", "002-8-plan_type.md"], + primary_output="002-8-plan_type.md", + upstream_stages=["setup", "identify_purpose"], + source_code_files=[ + "worker_plan_internal/plan/stages/plan_type.py", + "worker_plan_internal/assume/identify_plan_type.py", + ], + ), + StageInfo( + name="potential_levers", + output_files=["002-9-potential_levers_raw.json", "002-10-potential_levers.json"], + primary_output="002-10-potential_levers.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "extract_constraints"], + source_code_files=[ + "worker_plan_internal/plan/stages/potential_levers.py", + "worker_plan_internal/lever/identify_potential_levers.py", + ], + ), + StageInfo( + name="deduplicate_levers", + output_files=["002-11-deduplicated_levers_raw.json"], + primary_output="002-11-deduplicated_levers_raw.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "potential_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/deduplicate_levers.py", + "worker_plan_internal/lever/deduplicate_levers.py", + ], + ), + 
StageInfo( + name="enrich_levers", + output_files=["002-12-enriched_levers_raw.json"], + primary_output="002-12-enriched_levers_raw.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "deduplicate_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_levers.py", + "worker_plan_internal/lever/enrich_potential_levers.py", + ], + ), + StageInfo( + name="focus_on_vital_few_levers", + output_files=["002-13-vital_few_levers_raw.json"], + primary_output="002-13-vital_few_levers_raw.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "enrich_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/focus_on_vital_few_levers.py", + "worker_plan_internal/lever/focus_on_vital_few_levers.py", + ], + ), + StageInfo( + name="strategic_decisions_markdown", + output_files=["002-14-strategic_decisions.md"], + primary_output="002-14-strategic_decisions.md", + upstream_stages=["enrich_levers", "focus_on_vital_few_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/strategic_decisions_markdown.py", + "worker_plan_internal/lever/strategic_decisions_markdown.py", + ], + ), + StageInfo( + name="candidate_scenarios", + output_files=["002-15-candidate_scenarios_raw.json", "002-16-candidate_scenarios.json"], + primary_output="002-16-candidate_scenarios.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/candidate_scenarios.py", + "worker_plan_internal/lever/candidate_scenarios.py", + ], + ), + StageInfo( + name="select_scenario", + output_files=["002-17-selected_scenario_raw.json", "002-18-selected_scenario.json"], + primary_output="002-18-selected_scenario.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers", "candidate_scenarios"], + source_code_files=[ + "worker_plan_internal/plan/stages/select_scenario.py", + "worker_plan_internal/lever/select_scenario.py", + ], + 
), + StageInfo( + name="scenarios_markdown", + output_files=["002-19-scenarios.md"], + primary_output="002-19-scenarios.md", + upstream_stages=["candidate_scenarios", "select_scenario"], + source_code_files=[ + "worker_plan_internal/plan/stages/scenarios_markdown.py", + "worker_plan_internal/lever/scenarios_markdown.py", + ], + ), + # Constraint checkers + StageInfo( + name="potential_levers_constraint", + output_files=["002-10-potential_levers_constraint.json"], + primary_output="002-10-potential_levers_constraint.json", + upstream_stages=["extract_constraints", "potential_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="deduplicated_levers_constraint", + output_files=["002-11-deduplicated_levers_constraint.json"], + primary_output="002-11-deduplicated_levers_constraint.json", + upstream_stages=["extract_constraints", "deduplicate_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="enriched_levers_constraint", + output_files=["002-12-enriched_levers_constraint.json"], + primary_output="002-12-enriched_levers_constraint.json", + upstream_stages=["extract_constraints", "enrich_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="vital_few_levers_constraint", + output_files=["002-13-vital_few_levers_constraint.json"], + primary_output="002-13-vital_few_levers_constraint.json", + upstream_stages=["extract_constraints", "focus_on_vital_few_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="candidate_scenarios_constraint", + 
output_files=["002-16-candidate_scenarios_constraint.json"], + primary_output="002-16-candidate_scenarios_constraint.json", + upstream_stages=["extract_constraints", "candidate_scenarios"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="selected_scenario_constraint", + output_files=["002-18-selected_scenario_constraint.json"], + primary_output="002-18-selected_scenario_constraint.json", + upstream_stages=["extract_constraints", "select_scenario"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + # Phase 3: Context & Assumptions + StageInfo( + name="physical_locations", + output_files=["002-20-physical_locations_raw.json", "002-21-physical_locations.md"], + primary_output="002-21-physical_locations.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown"], + source_code_files=[ + "worker_plan_internal/plan/stages/physical_locations.py", + "worker_plan_internal/assume/physical_locations.py", + ], + ), + StageInfo( + name="currency_strategy", + output_files=["002-22-currency_strategy_raw.json", "002-23-currency_strategy.md"], + primary_output="002-23-currency_strategy.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "physical_locations", "strategic_decisions_markdown", "scenarios_markdown"], + source_code_files=[ + "worker_plan_internal/plan/stages/currency_strategy.py", + "worker_plan_internal/assume/currency_strategy.py", + ], + ), + StageInfo( + name="identify_risks", + output_files=["003-1-identify_risks_raw.json", "003-2-identify_risks.md"], + primary_output="003-2-identify_risks.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy"], + 
source_code_files=[ + "worker_plan_internal/plan/stages/identify_risks.py", + "worker_plan_internal/assume/identify_risks.py", + ], + ), + StageInfo( + name="make_assumptions", + output_files=["003-3-make_assumptions_raw.json", "003-4-make_assumptions.json", "003-5-make_assumptions.md"], + primary_output="003-5-make_assumptions.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks"], + source_code_files=[ + "worker_plan_internal/plan/stages/make_assumptions.py", + "worker_plan_internal/assume/make_assumptions.py", + ], + ), + StageInfo( + name="distill_assumptions", + output_files=["003-6-distill_assumptions_raw.json", "003-7-distill_assumptions.md"], + primary_output="003-7-distill_assumptions.md", + upstream_stages=["setup", "identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "make_assumptions"], + source_code_files=[ + "worker_plan_internal/plan/stages/distill_assumptions.py", + "worker_plan_internal/assume/distill_assumptions.py", + ], + ), + StageInfo( + name="review_assumptions", + output_files=["003-8-review_assumptions_raw.json", "003-9-review_assumptions.md"], + primary_output="003-9-review_assumptions.md", + upstream_stages=["identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions"], + source_code_files=[ + "worker_plan_internal/plan/stages/review_assumptions.py", + "worker_plan_internal/assume/review_assumptions.py", + ], + ), + StageInfo( + name="consolidate_assumptions_markdown", + output_files=["003-10-consolidate_assumptions_full.md", "003-11-consolidate_assumptions_short.md"], + primary_output="003-10-consolidate_assumptions_full.md", + upstream_stages=["identify_purpose", "plan_type", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", 
"distill_assumptions", "review_assumptions"], + source_code_files=[ + "worker_plan_internal/plan/stages/consolidate_assumptions_markdown.py", + "worker_plan_internal/assume/shorten_markdown.py", + ], + ), + # Phase 4: Pre-Project Assessment & Project Plan + StageInfo( + name="pre_project_assessment", + output_files=["004-1-pre_project_assessment_raw.json", "004-2-pre_project_assessment.json"], + primary_output="004-2-pre_project_assessment.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown"], + source_code_files=[ + "worker_plan_internal/plan/stages/pre_project_assessment.py", + "worker_plan_internal/expert/pre_project_assessment.py", + ], + ), + StageInfo( + name="project_plan", + output_files=["005-1-project_plan_raw.json", "005-2-project_plan.md"], + primary_output="005-2-project_plan.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment"], + source_code_files=[ + "worker_plan_internal/plan/stages/project_plan.py", + "worker_plan_internal/plan/project_plan.py", + ], + ), + # Phase 5: Governance + StageInfo( + name="governance_phase1_audit", + output_files=["006-1-governance_phase1_audit_raw.json", "006-2-governance_phase1_audit.md"], + primary_output="006-2-governance_phase1_audit.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase1_audit.py", + "worker_plan_internal/governance/governance_phase1_audit.py", + ], + ), + StageInfo( + name="governance_phase2_bodies", + output_files=["006-3-governance_phase2_bodies_raw.json", "006-4-governance_phase2_bodies.md"], + primary_output="006-4-governance_phase2_bodies.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", 
"project_plan", "governance_phase1_audit"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase2_bodies.py", + "worker_plan_internal/governance/governance_phase2_bodies.py", + ], + ), + StageInfo( + name="governance_phase3_impl_plan", + output_files=["006-5-governance_phase3_impl_plan_raw.json", "006-6-governance_phase3_impl_plan.md"], + primary_output="006-6-governance_phase3_impl_plan.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase3_impl_plan.py", + "worker_plan_internal/governance/governance_phase3_impl_plan.py", + ], + ), + StageInfo( + name="governance_phase4_decision_escalation_matrix", + output_files=["006-7-governance_phase4_decision_escalation_matrix_raw.json", "006-8-governance_phase4_decision_escalation_matrix.md"], + primary_output="006-8-governance_phase4_decision_escalation_matrix.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase4_decision_escalation_matrix.py", + "worker_plan_internal/governance/governance_phase4_decision_escalation_matrix.py", + ], + ), + StageInfo( + name="governance_phase5_monitoring_progress", + output_files=["006-9-governance_phase5_monitoring_progress_raw.json", "006-10-governance_phase5_monitoring_progress.md"], + primary_output="006-10-governance_phase5_monitoring_progress.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix"], + source_code_files=[ + 
"worker_plan_internal/plan/stages/governance_phase5_monitoring_progress.py", + "worker_plan_internal/governance/governance_phase5_monitoring_progress.py", + ], + ), + StageInfo( + name="governance_phase6_extra", + output_files=["006-11-governance_phase6_extra_raw.json", "006-12-governance_phase6_extra.md"], + primary_output="006-12-governance_phase6_extra.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase6_extra.py", + "worker_plan_internal/governance/governance_phase6_extra.py", + ], + ), + StageInfo( + name="consolidate_governance", + output_files=["006-13-consolidate_governance.md"], + primary_output="006-13-consolidate_governance.md", + upstream_stages=["governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress", "governance_phase6_extra"], + source_code_files=["worker_plan_internal/plan/stages/consolidate_governance.py"], + ), + # Phase 6: Resources & Team + StageInfo( + name="related_resources", + output_files=["007-1-related_resources_raw.json", "007-8-related_resources.md"], + primary_output="007-8-related_resources.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/related_resources.py", + "worker_plan_internal/plan/related_resources.py", + ], + ), + StageInfo( + name="find_team_members", + output_files=["008-1-find_team_members_raw.json", "008-2-find_team_members.json"], + primary_output="008-2-find_team_members.json", + upstream_stages=["setup", 
"strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/find_team_members.py", + "worker_plan_internal/team/find_team_members.py", + ], + ), + StageInfo( + name="enrich_team_contract_type", + output_files=["009-1-enrich_team_members_contract_type_raw.json", "009-2-enrich_team_members_contract_type.json"], + primary_output="009-2-enrich_team_members_contract_type.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "find_team_members", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_team_contract_type.py", + "worker_plan_internal/team/enrich_team_members_with_contract_type.py", + ], + ), + StageInfo( + name="enrich_team_background_story", + output_files=["010-1-enrich_team_members_background_story_raw.json", "010-2-enrich_team_members_background_story.json"], + primary_output="010-2-enrich_team_members_background_story.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_contract_type", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_team_background_story.py", + "worker_plan_internal/team/enrich_team_members_with_background_story.py", + ], + ), + StageInfo( + name="enrich_team_environment_info", + output_files=["011-1-enrich_team_members_environment_info_raw.json", "011-2-enrich_team_members_environment_info.json"], + primary_output="011-2-enrich_team_members_environment_info.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_background_story", "related_resources"], + 
source_code_files=[ + "worker_plan_internal/plan/stages/enrich_team_environment_info.py", + "worker_plan_internal/team/enrich_team_members_with_environment_info.py", + ], + ), + StageInfo( + name="review_team", + output_files=["012-review_team_raw.json"], + primary_output="012-review_team_raw.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_environment_info", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/review_team.py", + "worker_plan_internal/team/review_team.py", + ], + ), + StageInfo( + name="team_markdown", + output_files=["013-team.md"], + primary_output="013-team.md", + upstream_stages=["enrich_team_environment_info", "review_team"], + source_code_files=[ + "worker_plan_internal/plan/stages/team_markdown.py", + "worker_plan_internal/team/team_markdown_document.py", + ], + ), + # Phase 7: Analysis & Experts + StageInfo( + name="swot_analysis", + output_files=["014-1-swot_analysis_raw.json", "014-2-swot_analysis.md"], + primary_output="014-2-swot_analysis.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "identify_purpose", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/swot_analysis.py", + "worker_plan_internal/swot/swot_analysis.py", + ], + ), + StageInfo( + name="expert_review", + output_files=["015-1-experts_raw.json", "015-2-experts.json", "016-2-expert_criticism.md"], + primary_output="016-2-expert_criticism.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "pre_project_assessment", "project_plan", "swot_analysis"], + source_code_files=[ + "worker_plan_internal/plan/stages/expert_review.py", + "worker_plan_internal/expert/expert_finder.py", + "worker_plan_internal/expert/expert_criticism.py", + ], + ), + # 
Phase 8: Data & Documents + StageInfo( + name="data_collection", + output_files=["017-1-data_collection_raw.json", "017-2-data_collection.md"], + primary_output="017-2-data_collection.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], + source_code_files=[ + "worker_plan_internal/plan/stages/data_collection.py", + "worker_plan_internal/plan/data_collection.py", + ], + ), + StageInfo( + name="identify_documents", + output_files=["017-3-identified_documents_raw.json", "017-4-identified_documents.md", "017-5-identified_documents_to_find.json", "017-6-identified_documents_to_create.json"], + primary_output="017-4-identified_documents.md", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], + source_code_files=[ + "worker_plan_internal/plan/stages/identify_documents.py", + "worker_plan_internal/document/identify_documents.py", + ], + ), + StageInfo( + name="filter_documents_to_find", + output_files=["017-7-filter_documents_to_find_raw.json", "017-8-filter_documents_to_find_clean.json"], + primary_output="017-8-filter_documents_to_find_clean.json", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], + source_code_files=[ + "worker_plan_internal/plan/stages/filter_documents_to_find.py", + "worker_plan_internal/document/filter_documents_to_find.py", + ], + ), + StageInfo( + name="filter_documents_to_create", + output_files=["017-9-filter_documents_to_create_raw.json", "017-10-filter_documents_to_create_clean.json"], + primary_output="017-10-filter_documents_to_create_clean.json", + upstream_stages=["identify_purpose", 
"strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], + source_code_files=[ + "worker_plan_internal/plan/stages/filter_documents_to_create.py", + "worker_plan_internal/document/filter_documents_to_create.py", + ], + ), + StageInfo( + name="draft_documents_to_find", + output_files=["017-12-draft_documents_to_find.json"], + primary_output="017-12-draft_documents_to_find.json", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_find"], + source_code_files=[ + "worker_plan_internal/plan/stages/draft_documents_to_find.py", + "worker_plan_internal/document/draft_document_to_find.py", + ], + ), + StageInfo( + name="draft_documents_to_create", + output_files=["017-14-draft_documents_to_create.json"], + primary_output="017-14-draft_documents_to_create.json", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_create"], + source_code_files=[ + "worker_plan_internal/plan/stages/draft_documents_to_create.py", + "worker_plan_internal/document/draft_document_to_create.py", + ], + ), + StageInfo( + name="markdown_documents", + output_files=["017-15-documents_to_create_and_find.md"], + primary_output="017-15-documents_to_create_and_find.md", + upstream_stages=["draft_documents_to_create", "draft_documents_to_find"], + source_code_files=[ + "worker_plan_internal/plan/stages/markdown_documents.py", + "worker_plan_internal/document/markdown_with_document.py", + ], + ), + # Phase 9: WBS + StageInfo( + name="create_wbs_level1", + output_files=["018-1-wbs_level1_raw.json", "018-2-wbs_level1.json", "018-3-wbs_level1_project_title.json"], + primary_output="018-2-wbs_level1.json", + upstream_stages=["project_plan"], + source_code_files=[ + 
"worker_plan_internal/plan/stages/create_wbs_level1.py", + "worker_plan_internal/plan/create_wbs_level1.py", + ], + ), + StageInfo( + name="create_wbs_level2", + output_files=["018-4-wbs_level2_raw.json", "018-5-wbs_level2.json"], + primary_output="018-5-wbs_level2.json", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level1", "data_collection"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_wbs_level2.py", + "worker_plan_internal/plan/create_wbs_level2.py", + ], + ), + StageInfo( + name="wbs_project_level1_and_level2", + output_files=["019-wbs_project_level1_and_level2.json"], + primary_output="019-wbs_project_level1_and_level2.json", + upstream_stages=["create_wbs_level1", "create_wbs_level2"], + source_code_files=[ + "worker_plan_internal/plan/stages/wbs_project_level1_and_level2.py", + "worker_plan_internal/wbs/wbs_populate.py", + ], + ), + # Phase 10: Pitch & Dependencies + StageInfo( + name="create_pitch", + output_files=["020-1-pitch_raw.json"], + primary_output="020-1-pitch_raw.json", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "wbs_project_level1_and_level2", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_pitch.py", + "worker_plan_internal/pitch/create_pitch.py", + ], + ), + StageInfo( + name="convert_pitch_to_markdown", + output_files=["020-2-pitch_to_markdown_raw.json", "020-3-pitch.md"], + primary_output="020-3-pitch.md", + upstream_stages=["create_pitch"], + source_code_files=[ + "worker_plan_internal/plan/stages/convert_pitch_to_markdown.py", + "worker_plan_internal/pitch/convert_pitch_to_markdown.py", + ], + ), + StageInfo( + name="identify_task_dependencies", + output_files=["021-task_dependencies_raw.json"], + primary_output="021-task_dependencies_raw.json", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level2", "data_collection"], + 
source_code_files=[ + "worker_plan_internal/plan/stages/identify_task_dependencies.py", + "worker_plan_internal/plan/identify_wbs_task_dependencies.py", + ], + ), + StageInfo( + name="estimate_task_durations", + output_files=["022-2-task_durations.json"], + primary_output="022-2-task_durations.json", + upstream_stages=["project_plan", "wbs_project_level1_and_level2"], + source_code_files=[ + "worker_plan_internal/plan/stages/estimate_task_durations.py", + "worker_plan_internal/plan/estimate_wbs_task_durations.py", + ], + ), + # Phase 11: WBS Level 3 + StageInfo( + name="create_wbs_level3", + output_files=["023-2-wbs_level3.json"], + primary_output="023-2-wbs_level3.json", + upstream_stages=["project_plan", "wbs_project_level1_and_level2", "estimate_task_durations", "data_collection"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_wbs_level3.py", + "worker_plan_internal/plan/create_wbs_level3.py", + ], + ), + StageInfo( + name="wbs_project_level1_level2_level3", + output_files=["023-3-wbs_project_level1_and_level2_and_level3.json", "023-4-wbs_project_level1_and_level2_and_level3.csv"], + primary_output="023-3-wbs_project_level1_and_level2_and_level3.json", + upstream_stages=["wbs_project_level1_and_level2", "create_wbs_level3"], + source_code_files=[ + "worker_plan_internal/plan/stages/wbs_project_level1_level2_level3.py", + "worker_plan_internal/wbs/wbs_populate.py", + ], + ), + # Phase 12: Schedule & Reviews + StageInfo( + name="create_schedule", + output_files=["026-2-schedule_gantt_dhtmlx.html", "026-3-schedule_gantt_machai.csv"], + primary_output="026-2-schedule_gantt_dhtmlx.html", + upstream_stages=["start_time", "create_wbs_level1", "identify_task_dependencies", "estimate_task_durations", "wbs_project_level1_level2_level3"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_schedule.py", + "worker_plan_internal/schedule/project_schedule_populator.py", + ], + ), + StageInfo( + name="review_plan", + 
output_files=["024-1-review_plan_raw.json", "024-2-review_plan.md"], + primary_output="024-2-review_plan.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3"], + source_code_files=[ + "worker_plan_internal/plan/stages/review_plan.py", + "worker_plan_internal/plan/review_plan.py", + ], + ), + StageInfo( + name="executive_summary", + output_files=["025-1-executive_summary_raw.json", "025-2-executive_summary.md"], + primary_output="025-2-executive_summary.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3", "review_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/executive_summary.py", + "worker_plan_internal/plan/executive_summary.py", + ], + ), + StageInfo( + name="questions_and_answers", + output_files=["027-1-questions_and_answers_raw.json", "027-2-questions_and_answers.md", "027-3-questions_and_answers.html"], + primary_output="027-2-questions_and_answers.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/questions_and_answers.py", + "worker_plan_internal/questions_answers/questions_answers.py", + ], + ), + StageInfo( + name="premortem", + output_files=["028-1-premortem_raw.json", "028-2-premortem.md"], + 
primary_output="028-2-premortem.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers"], + source_code_files=[ + "worker_plan_internal/plan/stages/premortem.py", + "worker_plan_internal/diagnostics/premortem.py", + ], + ), + StageInfo( + name="self_audit", + output_files=["029-1-self_audit_raw.json", "029-2-self_audit.md"], + primary_output="029-2-self_audit.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers", "premortem"], + source_code_files=[ + "worker_plan_internal/plan/stages/self_audit.py", + "worker_plan_internal/self_audit/self_audit.py", + ], + ), + # Phase 13: Final Report + StageInfo( + name="report", + output_files=["030-report.html"], + primary_output="030-report.html", + upstream_stages=[ + "setup", "screen_planning_prompt", "redline_gate", "premise_attack", + "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", + "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", + "convert_pitch_to_markdown", "data_collection", "markdown_documents", + "create_wbs_level1", "wbs_project_level1_level2_level3", "expert_review", + "project_plan", "review_plan", "executive_summary", "create_schedule", + "questions_and_answers", "premortem", "self_audit", + ], + source_code_files=[ + "worker_plan_internal/plan/stages/report.py", + "worker_plan_internal/report/report_generator.py", 
+ ], + ), +] + +# ── Lookup indexes (built once at import time) ────────────────────────── + +_STAGE_BY_NAME: dict[str, StageInfo] = {s.name: s for s in STAGES} +_STAGE_BY_FILENAME: dict[str, StageInfo] = {} +for _stage in STAGES: + for _fname in _stage.output_files: + _STAGE_BY_FILENAME[_fname] = _stage + + +def find_stage_by_filename(filename: str) -> Optional[StageInfo]: + """Given an output filename, return the stage that produced it.""" + return _STAGE_BY_FILENAME.get(filename) + + +def get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Path]]: + """Return (stage_name, file_path) pairs for upstream stages whose primary output exists on disk.""" + stage = _STAGE_BY_NAME.get(stage_name) + if stage is None: + return [] + + result = [] + for upstream_name in stage.upstream_stages: + upstream_stage = _STAGE_BY_NAME.get(upstream_name) + if upstream_stage is None: + continue + primary_path = output_dir / upstream_stage.primary_output + if primary_path.exists(): + result.append((upstream_name, primary_path)) + return result + + +def get_source_code_paths(stage_name: str) -> list[Path]: + """Return absolute paths to source code files for a stage.""" + stage = _STAGE_BY_NAME.get(stage_name) + if stage is None: + return [] + return [_SOURCE_BASE / f for f in stage.source_code_files] diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py new file mode 100644 index 000000000..fa550c320 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -0,0 +1,109 @@ +# worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory +from 
worker_plan_internal.flaw_tracer.registry import ( + StageInfo, + STAGES, + find_stage_by_filename, + get_upstream_files, + get_source_code_paths, +) + + +class TestStageInfo(unittest.TestCase): + def test_stages_is_nonempty(self): + self.assertGreater(len(STAGES), 40) + + def test_all_stages_have_required_fields(self): + for stage in STAGES: + self.assertIsInstance(stage.name, str, f"{stage.name} name") + self.assertIsInstance(stage.output_files, list, f"{stage.name} output_files") + self.assertTrue(len(stage.output_files) > 0, f"{stage.name} has no output_files") + self.assertIsInstance(stage.upstream_stages, list, f"{stage.name} upstream_stages") + self.assertIsInstance(stage.source_code_files, list, f"{stage.name} source_code_files") + self.assertIsInstance(stage.primary_output, str, f"{stage.name} primary_output") + self.assertIn(stage.primary_output, stage.output_files, f"{stage.name} primary_output not in output_files") + + def test_no_duplicate_stage_names(self): + names = [s.name for s in STAGES] + self.assertEqual(len(names), len(set(names))) + + def test_upstream_references_are_valid(self): + valid_names = {s.name for s in STAGES} + for stage in STAGES: + for upstream in stage.upstream_stages: + self.assertIn(upstream, valid_names, f"{stage.name} references unknown upstream '{upstream}'") + + +class TestFindStageByFilename(unittest.TestCase): + def test_find_report(self): + stage = find_stage_by_filename("030-report.html") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "report") + + def test_find_potential_levers_clean(self): + stage = find_stage_by_filename("002-10-potential_levers.json") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "potential_levers") + + def test_find_potential_levers_raw(self): + stage = find_stage_by_filename("002-9-potential_levers_raw.json") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "potential_levers") + + def test_find_executive_summary(self): + stage = 
find_stage_by_filename("025-2-executive_summary.md") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "executive_summary") + + def test_unknown_filename_returns_none(self): + stage = find_stage_by_filename("zzz-unknown.txt") + self.assertIsNone(stage) + + +class TestGetUpstreamFiles(unittest.TestCase): + def test_setup_has_no_upstream(self): + with TemporaryDirectory() as d: + result = get_upstream_files("setup", Path(d)) + self.assertEqual(result, []) + + def test_potential_levers_upstream(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + # Create the expected upstream files on disk + (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") + (output_dir / "002-6-identify_purpose.md").write_text("purpose", encoding="utf-8") + (output_dir / "002-8-plan_type.md").write_text("type", encoding="utf-8") + (output_dir / "002-0-extract_constraints.md").write_text("constraints", encoding="utf-8") + + result = get_upstream_files("potential_levers", output_dir) + stage_names = [name for name, _ in result] + self.assertIn("setup", stage_names) + self.assertIn("identify_purpose", stage_names) + self.assertIn("plan_type", stage_names) + self.assertIn("extract_constraints", stage_names) + + def test_missing_files_are_skipped(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + # Only create one of the upstream files + (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") + + result = get_upstream_files("potential_levers", output_dir) + stage_names = [name for name, _ in result] + self.assertIn("setup", stage_names) + # The others should be skipped because their files don't exist + self.assertNotIn("identify_purpose", stage_names) + + +class TestGetSourceCodePaths(unittest.TestCase): + def test_potential_levers_source(self): + paths = get_source_code_paths("potential_levers") + filenames = [p.name for p in paths] + self.assertIn("potential_levers.py", filenames) + self.assertIn("identify_potential_levers.py", 
filenames) + + def test_unknown_stage_returns_empty(self): + paths = get_source_code_paths("nonexistent_stage") + self.assertEqual(paths, []) From 6525dca42e42a9f28fead07019fa24d99f5fa670 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:04:03 +0200 Subject: [PATCH 02/37] refactor: use tuples and modern type syntax in flaw_tracer registry Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/registry.py | 571 +++++++++--------- .../flaw_tracer/tests/test_registry.py | 6 +- 2 files changed, 288 insertions(+), 289 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index 7fb3c9635..02394a354 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -5,9 +5,8 @@ and source code files. Derived from the Luigi task classes in worker_plan_internal/plan/stages/. """ -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path -from typing import Optional # Base path for source code, relative to worker_plan/ _SOURCE_BASE = Path(__file__).resolve().parent.parent.parent # worker_plan/ @@ -17,716 +16,716 @@ class StageInfo: """One pipeline stage.""" name: str - output_files: list[str] + output_files: tuple[str, ...] primary_output: str # preferred file to read when checking for flaws - upstream_stages: list[str] = field(default_factory=list) - source_code_files: list[str] = field(default_factory=list) + upstream_stages: tuple[str, ...] = () + source_code_files: tuple[str, ...] = () # ── Complete pipeline registry ────────────────────────────────────────── -STAGES: list[StageInfo] = [ +STAGES: tuple[StageInfo, ...] 
= ( # Phase 1: Initialization StageInfo( name="start_time", - output_files=["001-1-start_time.json"], + output_files=("001-1-start_time.json",), primary_output="001-1-start_time.json", - upstream_stages=[], - source_code_files=["worker_plan_internal/plan/stages/start_time.py"], + upstream_stages=(), + source_code_files=("worker_plan_internal/plan/stages/start_time.py",), ), StageInfo( name="setup", - output_files=["001-2-plan.txt"], + output_files=("001-2-plan.txt",), primary_output="001-2-plan.txt", - upstream_stages=[], - source_code_files=["worker_plan_internal/plan/stages/setup.py"], + upstream_stages=(), + source_code_files=("worker_plan_internal/plan/stages/setup.py",), ), # Phase 2: Input Validation & Strategy StageInfo( name="screen_planning_prompt", - output_files=["002-0-screen_planning_prompt.json", "002-0-screen_planning_prompt.md"], + output_files=("002-0-screen_planning_prompt.json", "002-0-screen_planning_prompt.md"), primary_output="002-0-screen_planning_prompt.md", - upstream_stages=["setup"], - source_code_files=[ + upstream_stages=("setup",), + source_code_files=( "worker_plan_internal/plan/stages/screen_planning_prompt.py", "worker_plan_internal/diagnostics/screen_planning_prompt.py", - ], + ), ), StageInfo( name="extract_constraints", - output_files=["002-0-extract_constraints_raw.json", "002-0-extract_constraints.md"], + output_files=("002-0-extract_constraints_raw.json", "002-0-extract_constraints.md"), primary_output="002-0-extract_constraints.md", - upstream_stages=["setup"], - source_code_files=[ + upstream_stages=("setup",), + source_code_files=( "worker_plan_internal/plan/stages/extract_constraints.py", "worker_plan_internal/diagnostics/extract_constraints.py", - ], + ), ), StageInfo( name="redline_gate", - output_files=["002-1-redline_gate.json", "002-2-redline_gate.md"], + output_files=("002-1-redline_gate.json", "002-2-redline_gate.md"), primary_output="002-2-redline_gate.md", - upstream_stages=["setup"], - source_code_files=[ + 
upstream_stages=("setup",), + source_code_files=( "worker_plan_internal/plan/stages/redline_gate.py", "worker_plan_internal/diagnostics/redline_gate.py", - ], + ), ), StageInfo( name="premise_attack", - output_files=["002-3-premise_attack.json", "002-4-premise_attack.md"], + output_files=("002-3-premise_attack.json", "002-4-premise_attack.md"), primary_output="002-4-premise_attack.md", - upstream_stages=["setup"], - source_code_files=[ + upstream_stages=("setup",), + source_code_files=( "worker_plan_internal/plan/stages/premise_attack.py", "worker_plan_internal/diagnostics/premise_attack.py", - ], + ), ), StageInfo( name="identify_purpose", - output_files=["002-5-identify_purpose_raw.json", "002-6-identify_purpose.md"], + output_files=("002-5-identify_purpose_raw.json", "002-6-identify_purpose.md"), primary_output="002-6-identify_purpose.md", - upstream_stages=["setup"], - source_code_files=[ + upstream_stages=("setup",), + source_code_files=( "worker_plan_internal/plan/stages/identify_purpose.py", "worker_plan_internal/assume/identify_purpose.py", - ], + ), ), StageInfo( name="plan_type", - output_files=["002-7-plan_type_raw.json", "002-8-plan_type.md"], + output_files=("002-7-plan_type_raw.json", "002-8-plan_type.md"), primary_output="002-8-plan_type.md", - upstream_stages=["setup", "identify_purpose"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose"), + source_code_files=( "worker_plan_internal/plan/stages/plan_type.py", "worker_plan_internal/assume/identify_plan_type.py", - ], + ), ), StageInfo( name="potential_levers", - output_files=["002-9-potential_levers_raw.json", "002-10-potential_levers.json"], + output_files=("002-9-potential_levers_raw.json", "002-10-potential_levers.json"), primary_output="002-10-potential_levers.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "extract_constraints"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "extract_constraints"), + source_code_files=( 
"worker_plan_internal/plan/stages/potential_levers.py", "worker_plan_internal/lever/identify_potential_levers.py", - ], + ), ), StageInfo( name="deduplicate_levers", - output_files=["002-11-deduplicated_levers_raw.json"], + output_files=("002-11-deduplicated_levers_raw.json",), primary_output="002-11-deduplicated_levers_raw.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "potential_levers"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "potential_levers"), + source_code_files=( "worker_plan_internal/plan/stages/deduplicate_levers.py", "worker_plan_internal/lever/deduplicate_levers.py", - ], + ), ), StageInfo( name="enrich_levers", - output_files=["002-12-enriched_levers_raw.json"], + output_files=("002-12-enriched_levers_raw.json",), primary_output="002-12-enriched_levers_raw.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "deduplicate_levers"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "deduplicate_levers"), + source_code_files=( "worker_plan_internal/plan/stages/enrich_levers.py", "worker_plan_internal/lever/enrich_potential_levers.py", - ], + ), ), StageInfo( name="focus_on_vital_few_levers", - output_files=["002-13-vital_few_levers_raw.json"], + output_files=("002-13-vital_few_levers_raw.json",), primary_output="002-13-vital_few_levers_raw.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "enrich_levers"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "enrich_levers"), + source_code_files=( "worker_plan_internal/plan/stages/focus_on_vital_few_levers.py", "worker_plan_internal/lever/focus_on_vital_few_levers.py", - ], + ), ), StageInfo( name="strategic_decisions_markdown", - output_files=["002-14-strategic_decisions.md"], + output_files=("002-14-strategic_decisions.md",), primary_output="002-14-strategic_decisions.md", - upstream_stages=["enrich_levers", "focus_on_vital_few_levers"], - 
source_code_files=[ + upstream_stages=("enrich_levers", "focus_on_vital_few_levers"), + source_code_files=( "worker_plan_internal/plan/stages/strategic_decisions_markdown.py", "worker_plan_internal/lever/strategic_decisions_markdown.py", - ], + ), ), StageInfo( name="candidate_scenarios", - output_files=["002-15-candidate_scenarios_raw.json", "002-16-candidate_scenarios.json"], + output_files=("002-15-candidate_scenarios_raw.json", "002-16-candidate_scenarios.json"), primary_output="002-16-candidate_scenarios.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers"), + source_code_files=( "worker_plan_internal/plan/stages/candidate_scenarios.py", "worker_plan_internal/lever/candidate_scenarios.py", - ], + ), ), StageInfo( name="select_scenario", - output_files=["002-17-selected_scenario_raw.json", "002-18-selected_scenario.json"], + output_files=("002-17-selected_scenario_raw.json", "002-18-selected_scenario.json"), primary_output="002-18-selected_scenario.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers", "candidate_scenarios"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers", "candidate_scenarios"), + source_code_files=( "worker_plan_internal/plan/stages/select_scenario.py", "worker_plan_internal/lever/select_scenario.py", - ], + ), ), StageInfo( name="scenarios_markdown", - output_files=["002-19-scenarios.md"], + output_files=("002-19-scenarios.md",), primary_output="002-19-scenarios.md", - upstream_stages=["candidate_scenarios", "select_scenario"], - source_code_files=[ + upstream_stages=("candidate_scenarios", "select_scenario"), + source_code_files=( "worker_plan_internal/plan/stages/scenarios_markdown.py", "worker_plan_internal/lever/scenarios_markdown.py", - ], + ), ), # Constraint checkers StageInfo( 
name="potential_levers_constraint", - output_files=["002-10-potential_levers_constraint.json"], + output_files=("002-10-potential_levers_constraint.json",), primary_output="002-10-potential_levers_constraint.json", - upstream_stages=["extract_constraints", "potential_levers"], - source_code_files=[ + upstream_stages=("extract_constraints", "potential_levers"), + source_code_files=( "worker_plan_internal/plan/stages/constraint_checker_stages.py", "worker_plan_internal/diagnostics/constraint_checker.py", - ], + ), ), StageInfo( name="deduplicated_levers_constraint", - output_files=["002-11-deduplicated_levers_constraint.json"], + output_files=("002-11-deduplicated_levers_constraint.json",), primary_output="002-11-deduplicated_levers_constraint.json", - upstream_stages=["extract_constraints", "deduplicate_levers"], - source_code_files=[ + upstream_stages=("extract_constraints", "deduplicate_levers"), + source_code_files=( "worker_plan_internal/plan/stages/constraint_checker_stages.py", "worker_plan_internal/diagnostics/constraint_checker.py", - ], + ), ), StageInfo( name="enriched_levers_constraint", - output_files=["002-12-enriched_levers_constraint.json"], + output_files=("002-12-enriched_levers_constraint.json",), primary_output="002-12-enriched_levers_constraint.json", - upstream_stages=["extract_constraints", "enrich_levers"], - source_code_files=[ + upstream_stages=("extract_constraints", "enrich_levers"), + source_code_files=( "worker_plan_internal/plan/stages/constraint_checker_stages.py", "worker_plan_internal/diagnostics/constraint_checker.py", - ], + ), ), StageInfo( name="vital_few_levers_constraint", - output_files=["002-13-vital_few_levers_constraint.json"], + output_files=("002-13-vital_few_levers_constraint.json",), primary_output="002-13-vital_few_levers_constraint.json", - upstream_stages=["extract_constraints", "focus_on_vital_few_levers"], - source_code_files=[ + upstream_stages=("extract_constraints", "focus_on_vital_few_levers"), + 
source_code_files=( "worker_plan_internal/plan/stages/constraint_checker_stages.py", "worker_plan_internal/diagnostics/constraint_checker.py", - ], + ), ), StageInfo( name="candidate_scenarios_constraint", - output_files=["002-16-candidate_scenarios_constraint.json"], + output_files=("002-16-candidate_scenarios_constraint.json",), primary_output="002-16-candidate_scenarios_constraint.json", - upstream_stages=["extract_constraints", "candidate_scenarios"], - source_code_files=[ + upstream_stages=("extract_constraints", "candidate_scenarios"), + source_code_files=( "worker_plan_internal/plan/stages/constraint_checker_stages.py", "worker_plan_internal/diagnostics/constraint_checker.py", - ], + ), ), StageInfo( name="selected_scenario_constraint", - output_files=["002-18-selected_scenario_constraint.json"], + output_files=("002-18-selected_scenario_constraint.json",), primary_output="002-18-selected_scenario_constraint.json", - upstream_stages=["extract_constraints", "select_scenario"], - source_code_files=[ + upstream_stages=("extract_constraints", "select_scenario"), + source_code_files=( "worker_plan_internal/plan/stages/constraint_checker_stages.py", "worker_plan_internal/diagnostics/constraint_checker.py", - ], + ), ), # Phase 3: Context & Assumptions StageInfo( name="physical_locations", - output_files=["002-20-physical_locations_raw.json", "002-21-physical_locations.md"], + output_files=("002-20-physical_locations_raw.json", "002-21-physical_locations.md"), primary_output="002-21-physical_locations.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown"), + source_code_files=( "worker_plan_internal/plan/stages/physical_locations.py", "worker_plan_internal/assume/physical_locations.py", - ], + ), ), StageInfo( name="currency_strategy", - 
output_files=["002-22-currency_strategy_raw.json", "002-23-currency_strategy.md"], + output_files=("002-22-currency_strategy_raw.json", "002-23-currency_strategy.md"), primary_output="002-23-currency_strategy.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "physical_locations", "strategic_decisions_markdown", "scenarios_markdown"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "physical_locations", "strategic_decisions_markdown", "scenarios_markdown"), + source_code_files=( "worker_plan_internal/plan/stages/currency_strategy.py", "worker_plan_internal/assume/currency_strategy.py", - ], + ), ), StageInfo( name="identify_risks", - output_files=["003-1-identify_risks_raw.json", "003-2-identify_risks.md"], + output_files=("003-1-identify_risks_raw.json", "003-2-identify_risks.md"), primary_output="003-2-identify_risks.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy"), + source_code_files=( "worker_plan_internal/plan/stages/identify_risks.py", "worker_plan_internal/assume/identify_risks.py", - ], + ), ), StageInfo( name="make_assumptions", - output_files=["003-3-make_assumptions_raw.json", "003-4-make_assumptions.json", "003-5-make_assumptions.md"], + output_files=("003-3-make_assumptions_raw.json", "003-4-make_assumptions.json", "003-5-make_assumptions.md"), primary_output="003-5-make_assumptions.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", 
"physical_locations", "currency_strategy", "identify_risks"), + source_code_files=( "worker_plan_internal/plan/stages/make_assumptions.py", "worker_plan_internal/assume/make_assumptions.py", - ], + ), ), StageInfo( name="distill_assumptions", - output_files=["003-6-distill_assumptions_raw.json", "003-7-distill_assumptions.md"], + output_files=("003-6-distill_assumptions_raw.json", "003-7-distill_assumptions.md"), primary_output="003-7-distill_assumptions.md", - upstream_stages=["setup", "identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "make_assumptions"], - source_code_files=[ + upstream_stages=("setup", "identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "make_assumptions"), + source_code_files=( "worker_plan_internal/plan/stages/distill_assumptions.py", "worker_plan_internal/assume/distill_assumptions.py", - ], + ), ), StageInfo( name="review_assumptions", - output_files=["003-8-review_assumptions_raw.json", "003-9-review_assumptions.md"], + output_files=("003-8-review_assumptions_raw.json", "003-9-review_assumptions.md"), primary_output="003-9-review_assumptions.md", - upstream_stages=["identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions"], - source_code_files=[ + upstream_stages=("identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions"), + source_code_files=( "worker_plan_internal/plan/stages/review_assumptions.py", "worker_plan_internal/assume/review_assumptions.py", - ], + ), ), StageInfo( name="consolidate_assumptions_markdown", - output_files=["003-10-consolidate_assumptions_full.md", "003-11-consolidate_assumptions_short.md"], + output_files=("003-10-consolidate_assumptions_full.md", "003-11-consolidate_assumptions_short.md"), 
primary_output="003-10-consolidate_assumptions_full.md", - upstream_stages=["identify_purpose", "plan_type", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions", "review_assumptions"], - source_code_files=[ + upstream_stages=("identify_purpose", "plan_type", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions", "review_assumptions"), + source_code_files=( "worker_plan_internal/plan/stages/consolidate_assumptions_markdown.py", "worker_plan_internal/assume/shorten_markdown.py", - ], + ), ), # Phase 4: Pre-Project Assessment & Project Plan StageInfo( name="pre_project_assessment", - output_files=["004-1-pre_project_assessment_raw.json", "004-2-pre_project_assessment.json"], + output_files=("004-1-pre_project_assessment_raw.json", "004-2-pre_project_assessment.json"), primary_output="004-2-pre_project_assessment.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown"), + source_code_files=( "worker_plan_internal/plan/stages/pre_project_assessment.py", "worker_plan_internal/expert/pre_project_assessment.py", - ], + ), ), StageInfo( name="project_plan", - output_files=["005-1-project_plan_raw.json", "005-2-project_plan.md"], + output_files=("005-1-project_plan_raw.json", "005-2-project_plan.md"), primary_output="005-2-project_plan.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment"), + source_code_files=( "worker_plan_internal/plan/stages/project_plan.py", "worker_plan_internal/plan/project_plan.py", - ], + 
), ), # Phase 5: Governance StageInfo( name="governance_phase1_audit", - output_files=["006-1-governance_phase1_audit_raw.json", "006-2-governance_phase1_audit.md"], + output_files=("006-1-governance_phase1_audit_raw.json", "006-2-governance_phase1_audit.md"), primary_output="006-2-governance_phase1_audit.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"), + source_code_files=( "worker_plan_internal/plan/stages/governance_phase1_audit.py", "worker_plan_internal/governance/governance_phase1_audit.py", - ], + ), ), StageInfo( name="governance_phase2_bodies", - output_files=["006-3-governance_phase2_bodies_raw.json", "006-4-governance_phase2_bodies.md"], + output_files=("006-3-governance_phase2_bodies_raw.json", "006-4-governance_phase2_bodies.md"), primary_output="006-4-governance_phase2_bodies.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit"), + source_code_files=( "worker_plan_internal/plan/stages/governance_phase2_bodies.py", "worker_plan_internal/governance/governance_phase2_bodies.py", - ], + ), ), StageInfo( name="governance_phase3_impl_plan", - output_files=["006-5-governance_phase3_impl_plan_raw.json", "006-6-governance_phase3_impl_plan.md"], + output_files=("006-5-governance_phase3_impl_plan_raw.json", "006-6-governance_phase3_impl_plan.md"), primary_output="006-6-governance_phase3_impl_plan.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", 
"project_plan", "governance_phase2_bodies"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies"), + source_code_files=( "worker_plan_internal/plan/stages/governance_phase3_impl_plan.py", "worker_plan_internal/governance/governance_phase3_impl_plan.py", - ], + ), ), StageInfo( name="governance_phase4_decision_escalation_matrix", - output_files=["006-7-governance_phase4_decision_escalation_matrix_raw.json", "006-8-governance_phase4_decision_escalation_matrix.md"], + output_files=("006-7-governance_phase4_decision_escalation_matrix_raw.json", "006-8-governance_phase4_decision_escalation_matrix.md"), primary_output="006-8-governance_phase4_decision_escalation_matrix.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan"), + source_code_files=( "worker_plan_internal/plan/stages/governance_phase4_decision_escalation_matrix.py", "worker_plan_internal/governance/governance_phase4_decision_escalation_matrix.py", - ], + ), ), StageInfo( name="governance_phase5_monitoring_progress", - output_files=["006-9-governance_phase5_monitoring_progress_raw.json", "006-10-governance_phase5_monitoring_progress.md"], + output_files=("006-9-governance_phase5_monitoring_progress_raw.json", "006-10-governance_phase5_monitoring_progress.md"), primary_output="006-10-governance_phase5_monitoring_progress.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan", 
"governance_phase4_decision_escalation_matrix"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix"), + source_code_files=( "worker_plan_internal/plan/stages/governance_phase5_monitoring_progress.py", "worker_plan_internal/governance/governance_phase5_monitoring_progress.py", - ], + ), ), StageInfo( name="governance_phase6_extra", - output_files=["006-11-governance_phase6_extra_raw.json", "006-12-governance_phase6_extra.md"], + output_files=("006-11-governance_phase6_extra_raw.json", "006-12-governance_phase6_extra.md"), primary_output="006-12-governance_phase6_extra.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress"), + source_code_files=( "worker_plan_internal/plan/stages/governance_phase6_extra.py", "worker_plan_internal/governance/governance_phase6_extra.py", - ], + ), ), StageInfo( name="consolidate_governance", - output_files=["006-13-consolidate_governance.md"], + output_files=("006-13-consolidate_governance.md",), primary_output="006-13-consolidate_governance.md", - upstream_stages=["governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress", 
"governance_phase6_extra"], - source_code_files=["worker_plan_internal/plan/stages/consolidate_governance.py"], + upstream_stages=("governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress", "governance_phase6_extra"), + source_code_files=("worker_plan_internal/plan/stages/consolidate_governance.py",), ), # Phase 6: Resources & Team StageInfo( name="related_resources", - output_files=["007-1-related_resources_raw.json", "007-8-related_resources.md"], + output_files=("007-1-related_resources_raw.json", "007-8-related_resources.md"), primary_output="007-8-related_resources.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"), + source_code_files=( "worker_plan_internal/plan/stages/related_resources.py", "worker_plan_internal/plan/related_resources.py", - ], + ), ), StageInfo( name="find_team_members", - output_files=["008-1-find_team_members_raw.json", "008-2-find_team_members.json"], + output_files=("008-1-find_team_members_raw.json", "008-2-find_team_members.json"), primary_output="008-2-find_team_members.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/find_team_members.py", "worker_plan_internal/team/find_team_members.py", - ], + ), ), StageInfo( name="enrich_team_contract_type", - 
output_files=["009-1-enrich_team_members_contract_type_raw.json", "009-2-enrich_team_members_contract_type.json"], + output_files=("009-1-enrich_team_members_contract_type_raw.json", "009-2-enrich_team_members_contract_type.json"), primary_output="009-2-enrich_team_members_contract_type.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "find_team_members", "related_resources"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "find_team_members", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/enrich_team_contract_type.py", "worker_plan_internal/team/enrich_team_members_with_contract_type.py", - ], + ), ), StageInfo( name="enrich_team_background_story", - output_files=["010-1-enrich_team_members_background_story_raw.json", "010-2-enrich_team_members_background_story.json"], + output_files=("010-1-enrich_team_members_background_story_raw.json", "010-2-enrich_team_members_background_story.json"), primary_output="010-2-enrich_team_members_background_story.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_contract_type", "related_resources"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_contract_type", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/enrich_team_background_story.py", "worker_plan_internal/team/enrich_team_members_with_background_story.py", - ], + ), ), StageInfo( name="enrich_team_environment_info", - output_files=["011-1-enrich_team_members_environment_info_raw.json", 
"011-2-enrich_team_members_environment_info.json"], + output_files=("011-1-enrich_team_members_environment_info_raw.json", "011-2-enrich_team_members_environment_info.json"), primary_output="011-2-enrich_team_members_environment_info.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_background_story", "related_resources"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_background_story", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/enrich_team_environment_info.py", "worker_plan_internal/team/enrich_team_members_with_environment_info.py", - ], + ), ), StageInfo( name="review_team", - output_files=["012-review_team_raw.json"], + output_files=("012-review_team_raw.json",), primary_output="012-review_team_raw.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_environment_info", "related_resources"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_environment_info", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/review_team.py", "worker_plan_internal/team/review_team.py", - ], + ), ), StageInfo( name="team_markdown", - output_files=["013-team.md"], + output_files=("013-team.md",), primary_output="013-team.md", - upstream_stages=["enrich_team_environment_info", "review_team"], - source_code_files=[ + upstream_stages=("enrich_team_environment_info", "review_team"), + source_code_files=( "worker_plan_internal/plan/stages/team_markdown.py", 
"worker_plan_internal/team/team_markdown_document.py", - ], + ), ), # Phase 7: Analysis & Experts StageInfo( name="swot_analysis", - output_files=["014-1-swot_analysis_raw.json", "014-2-swot_analysis.md"], + output_files=("014-1-swot_analysis_raw.json", "014-2-swot_analysis.md"), primary_output="014-2-swot_analysis.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "identify_purpose", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "identify_purpose", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/swot_analysis.py", "worker_plan_internal/swot/swot_analysis.py", - ], + ), ), StageInfo( name="expert_review", - output_files=["015-1-experts_raw.json", "015-2-experts.json", "016-2-expert_criticism.md"], + output_files=("015-1-experts_raw.json", "015-2-experts.json", "016-2-expert_criticism.md"), primary_output="016-2-expert_criticism.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "pre_project_assessment", "project_plan", "swot_analysis"], - source_code_files=[ + upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "pre_project_assessment", "project_plan", "swot_analysis"), + source_code_files=( "worker_plan_internal/plan/stages/expert_review.py", "worker_plan_internal/expert/expert_finder.py", "worker_plan_internal/expert/expert_criticism.py", - ], + ), ), # Phase 8: Data & Documents StageInfo( name="data_collection", - output_files=["017-1-data_collection_raw.json", "017-2-data_collection.md"], + output_files=("017-1-data_collection_raw.json", "017-2-data_collection.md"), primary_output="017-2-data_collection.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", 
"consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"), + source_code_files=( "worker_plan_internal/plan/stages/data_collection.py", "worker_plan_internal/plan/data_collection.py", - ], + ), ), StageInfo( name="identify_documents", - output_files=["017-3-identified_documents_raw.json", "017-4-identified_documents.md", "017-5-identified_documents_to_find.json", "017-6-identified_documents_to_create.json"], + output_files=("017-3-identified_documents_raw.json", "017-4-identified_documents.md", "017-5-identified_documents_to_find.json", "017-6-identified_documents_to_create.json"), primary_output="017-4-identified_documents.md", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], - source_code_files=[ + upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"), + source_code_files=( "worker_plan_internal/plan/stages/identify_documents.py", "worker_plan_internal/document/identify_documents.py", - ], + ), ), StageInfo( name="filter_documents_to_find", - output_files=["017-7-filter_documents_to_find_raw.json", "017-8-filter_documents_to_find_clean.json"], + output_files=("017-7-filter_documents_to_find_raw.json", "017-8-filter_documents_to_find_clean.json"), primary_output="017-8-filter_documents_to_find_clean.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", 
"identify_documents"], - source_code_files=[ + upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"), + source_code_files=( "worker_plan_internal/plan/stages/filter_documents_to_find.py", "worker_plan_internal/document/filter_documents_to_find.py", - ], + ), ), StageInfo( name="filter_documents_to_create", - output_files=["017-9-filter_documents_to_create_raw.json", "017-10-filter_documents_to_create_clean.json"], + output_files=("017-9-filter_documents_to_create_raw.json", "017-10-filter_documents_to_create_clean.json"), primary_output="017-10-filter_documents_to_create_clean.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], - source_code_files=[ + upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"), + source_code_files=( "worker_plan_internal/plan/stages/filter_documents_to_create.py", "worker_plan_internal/document/filter_documents_to_create.py", - ], + ), ), StageInfo( name="draft_documents_to_find", - output_files=["017-12-draft_documents_to_find.json"], + output_files=("017-12-draft_documents_to_find.json",), primary_output="017-12-draft_documents_to_find.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_find"], - source_code_files=[ + upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_find"), + source_code_files=( "worker_plan_internal/plan/stages/draft_documents_to_find.py", "worker_plan_internal/document/draft_document_to_find.py", - ], + ), ), StageInfo( 
name="draft_documents_to_create", - output_files=["017-14-draft_documents_to_create.json"], + output_files=("017-14-draft_documents_to_create.json",), primary_output="017-14-draft_documents_to_create.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_create"], - source_code_files=[ + upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_create"), + source_code_files=( "worker_plan_internal/plan/stages/draft_documents_to_create.py", "worker_plan_internal/document/draft_document_to_create.py", - ], + ), ), StageInfo( name="markdown_documents", - output_files=["017-15-documents_to_create_and_find.md"], + output_files=("017-15-documents_to_create_and_find.md",), primary_output="017-15-documents_to_create_and_find.md", - upstream_stages=["draft_documents_to_create", "draft_documents_to_find"], - source_code_files=[ + upstream_stages=("draft_documents_to_create", "draft_documents_to_find"), + source_code_files=( "worker_plan_internal/plan/stages/markdown_documents.py", "worker_plan_internal/document/markdown_with_document.py", - ], + ), ), # Phase 9: WBS StageInfo( name="create_wbs_level1", - output_files=["018-1-wbs_level1_raw.json", "018-2-wbs_level1.json", "018-3-wbs_level1_project_title.json"], + output_files=("018-1-wbs_level1_raw.json", "018-2-wbs_level1.json", "018-3-wbs_level1_project_title.json"), primary_output="018-2-wbs_level1.json", - upstream_stages=["project_plan"], - source_code_files=[ + upstream_stages=("project_plan",), + source_code_files=( "worker_plan_internal/plan/stages/create_wbs_level1.py", "worker_plan_internal/plan/create_wbs_level1.py", - ], + ), ), StageInfo( name="create_wbs_level2", - output_files=["018-4-wbs_level2_raw.json", "018-5-wbs_level2.json"], + output_files=("018-4-wbs_level2_raw.json", 
"018-5-wbs_level2.json"), primary_output="018-5-wbs_level2.json", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level1", "data_collection"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level1", "data_collection"), + source_code_files=( "worker_plan_internal/plan/stages/create_wbs_level2.py", "worker_plan_internal/plan/create_wbs_level2.py", - ], + ), ), StageInfo( name="wbs_project_level1_and_level2", - output_files=["019-wbs_project_level1_and_level2.json"], + output_files=("019-wbs_project_level1_and_level2.json",), primary_output="019-wbs_project_level1_and_level2.json", - upstream_stages=["create_wbs_level1", "create_wbs_level2"], - source_code_files=[ + upstream_stages=("create_wbs_level1", "create_wbs_level2"), + source_code_files=( "worker_plan_internal/plan/stages/wbs_project_level1_and_level2.py", "worker_plan_internal/wbs/wbs_populate.py", - ], + ), ), # Phase 10: Pitch & Dependencies StageInfo( name="create_pitch", - output_files=["020-1-pitch_raw.json"], + output_files=("020-1-pitch_raw.json",), primary_output="020-1-pitch_raw.json", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "wbs_project_level1_and_level2", "related_resources"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "project_plan", "wbs_project_level1_and_level2", "related_resources"), + source_code_files=( "worker_plan_internal/plan/stages/create_pitch.py", "worker_plan_internal/pitch/create_pitch.py", - ], + ), ), StageInfo( name="convert_pitch_to_markdown", - output_files=["020-2-pitch_to_markdown_raw.json", "020-3-pitch.md"], + output_files=("020-2-pitch_to_markdown_raw.json", "020-3-pitch.md"), primary_output="020-3-pitch.md", - upstream_stages=["create_pitch"], - source_code_files=[ + upstream_stages=("create_pitch",), + source_code_files=( 
"worker_plan_internal/plan/stages/convert_pitch_to_markdown.py", "worker_plan_internal/pitch/convert_pitch_to_markdown.py", - ], + ), ), StageInfo( name="identify_task_dependencies", - output_files=["021-task_dependencies_raw.json"], + output_files=("021-task_dependencies_raw.json",), primary_output="021-task_dependencies_raw.json", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level2", "data_collection"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level2", "data_collection"), + source_code_files=( "worker_plan_internal/plan/stages/identify_task_dependencies.py", "worker_plan_internal/plan/identify_wbs_task_dependencies.py", - ], + ), ), StageInfo( name="estimate_task_durations", - output_files=["022-2-task_durations.json"], + output_files=("022-2-task_durations.json",), primary_output="022-2-task_durations.json", - upstream_stages=["project_plan", "wbs_project_level1_and_level2"], - source_code_files=[ + upstream_stages=("project_plan", "wbs_project_level1_and_level2"), + source_code_files=( "worker_plan_internal/plan/stages/estimate_task_durations.py", "worker_plan_internal/plan/estimate_wbs_task_durations.py", - ], + ), ), # Phase 11: WBS Level 3 StageInfo( name="create_wbs_level3", - output_files=["023-2-wbs_level3.json"], + output_files=("023-2-wbs_level3.json",), primary_output="023-2-wbs_level3.json", - upstream_stages=["project_plan", "wbs_project_level1_and_level2", "estimate_task_durations", "data_collection"], - source_code_files=[ + upstream_stages=("project_plan", "wbs_project_level1_and_level2", "estimate_task_durations", "data_collection"), + source_code_files=( "worker_plan_internal/plan/stages/create_wbs_level3.py", "worker_plan_internal/plan/create_wbs_level3.py", - ], + ), ), StageInfo( name="wbs_project_level1_level2_level3", - output_files=["023-3-wbs_project_level1_and_level2_and_level3.json", 
"023-4-wbs_project_level1_and_level2_and_level3.csv"], + output_files=("023-3-wbs_project_level1_and_level2_and_level3.json", "023-4-wbs_project_level1_and_level2_and_level3.csv"), primary_output="023-3-wbs_project_level1_and_level2_and_level3.json", - upstream_stages=["wbs_project_level1_and_level2", "create_wbs_level3"], - source_code_files=[ + upstream_stages=("wbs_project_level1_and_level2", "create_wbs_level3"), + source_code_files=( "worker_plan_internal/plan/stages/wbs_project_level1_level2_level3.py", "worker_plan_internal/wbs/wbs_populate.py", - ], + ), ), # Phase 12: Schedule & Reviews StageInfo( name="create_schedule", - output_files=["026-2-schedule_gantt_dhtmlx.html", "026-3-schedule_gantt_machai.csv"], + output_files=("026-2-schedule_gantt_dhtmlx.html", "026-3-schedule_gantt_machai.csv"), primary_output="026-2-schedule_gantt_dhtmlx.html", - upstream_stages=["start_time", "create_wbs_level1", "identify_task_dependencies", "estimate_task_durations", "wbs_project_level1_level2_level3"], - source_code_files=[ + upstream_stages=("start_time", "create_wbs_level1", "identify_task_dependencies", "estimate_task_durations", "wbs_project_level1_level2_level3"), + source_code_files=( "worker_plan_internal/plan/stages/create_schedule.py", "worker_plan_internal/schedule/project_schedule_populator.py", - ], + ), ), StageInfo( name="review_plan", - output_files=["024-1-review_plan_raw.json", "024-2-review_plan.md"], + output_files=("024-1-review_plan_raw.json", "024-2-review_plan.md"), primary_output="024-2-review_plan.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", 
"related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3"), + source_code_files=( "worker_plan_internal/plan/stages/review_plan.py", "worker_plan_internal/plan/review_plan.py", - ], + ), ), StageInfo( name="executive_summary", - output_files=["025-1-executive_summary_raw.json", "025-2-executive_summary.md"], + output_files=("025-1-executive_summary_raw.json", "025-2-executive_summary.md"), primary_output="025-2-executive_summary.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3", "review_plan"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3", "review_plan"), + source_code_files=( "worker_plan_internal/plan/stages/executive_summary.py", "worker_plan_internal/plan/executive_summary.py", - ], + ), ), StageInfo( name="questions_and_answers", - output_files=["027-1-questions_and_answers_raw.json", "027-2-questions_and_answers.md", "027-3-questions_and_answers.html"], + output_files=("027-1-questions_and_answers_raw.json", "027-2-questions_and_answers.md", "027-3-questions_and_answers.html"), primary_output="027-2-questions_and_answers.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan"], - source_code_files=[ + 
upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan"), + source_code_files=( "worker_plan_internal/plan/stages/questions_and_answers.py", "worker_plan_internal/questions_answers/questions_answers.py", - ], + ), ), StageInfo( name="premortem", - output_files=["028-1-premortem_raw.json", "028-2-premortem.md"], + output_files=("028-1-premortem_raw.json", "028-2-premortem.md"), primary_output="028-2-premortem.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers"), + source_code_files=( "worker_plan_internal/plan/stages/premortem.py", "worker_plan_internal/diagnostics/premortem.py", - ], + ), ), StageInfo( name="self_audit", - output_files=["029-1-self_audit_raw.json", "029-2-self_audit.md"], + output_files=("029-1-self_audit_raw.json", "029-2-self_audit.md"), primary_output="029-2-self_audit.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", 
"convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers", "premortem"], - source_code_files=[ + upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers", "premortem"), + source_code_files=( "worker_plan_internal/plan/stages/self_audit.py", "worker_plan_internal/self_audit/self_audit.py", - ], + ), ), # Phase 13: Final Report StageInfo( name="report", - output_files=["030-report.html"], + output_files=("030-report.html",), primary_output="030-report.html", - upstream_stages=[ + upstream_stages=( "setup", "screen_planning_prompt", "redline_gate", "premise_attack", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", @@ -734,13 +733,13 @@ class StageInfo: "create_wbs_level1", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "executive_summary", "create_schedule", "questions_and_answers", "premortem", "self_audit", - ], - source_code_files=[ + ), + source_code_files=( "worker_plan_internal/plan/stages/report.py", "worker_plan_internal/report/report_generator.py", - ], + ), ), -] +) # ── Lookup indexes (built once at import time) ────────────────────────── @@ -751,7 +750,7 @@ class StageInfo: _STAGE_BY_FILENAME[_fname] = _stage -def find_stage_by_filename(filename: str) -> Optional[StageInfo]: +def find_stage_by_filename(filename: str) -> StageInfo | None: """Given an output filename, return the stage that produced it.""" return _STAGE_BY_FILENAME.get(filename) diff --git 
a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py index fa550c320..d3c0abe67 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -18,10 +18,10 @@ def test_stages_is_nonempty(self): def test_all_stages_have_required_fields(self): for stage in STAGES: self.assertIsInstance(stage.name, str, f"{stage.name} name") - self.assertIsInstance(stage.output_files, list, f"{stage.name} output_files") + self.assertIsInstance(stage.output_files, tuple, f"{stage.name} output_files") self.assertTrue(len(stage.output_files) > 0, f"{stage.name} has no output_files") - self.assertIsInstance(stage.upstream_stages, list, f"{stage.name} upstream_stages") - self.assertIsInstance(stage.source_code_files, list, f"{stage.name} source_code_files") + self.assertIsInstance(stage.upstream_stages, tuple, f"{stage.name} upstream_stages") + self.assertIsInstance(stage.source_code_files, tuple, f"{stage.name} source_code_files") self.assertIsInstance(stage.primary_output, str, f"{stage.name} primary_output") self.assertIn(stage.primary_output, stage.output_files, f"{stage.name} primary_output not in output_files") From 2fa4de3c16ce1bf60de3c318e8ae1b8b99cefebe Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:06:30 +0200 Subject: [PATCH 03/37] feat: add flaw_tracer Pydantic models and prompt builders Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/prompts.py | 125 +++++++++++++++++ .../flaw_tracer/tests/test_prompts.py | 127 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/prompts.py create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py diff --git a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py new 
file mode 100644 index 000000000..43f7cb8e9 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py @@ -0,0 +1,125 @@ +# worker_plan/worker_plan_internal/flaw_tracer/prompts.py +"""Pydantic models and prompt builders for the flaw tracer.""" +from typing import Literal +from pydantic import BaseModel, Field +from llama_index.core.llms import ChatMessage, MessageRole + + +# -- Pydantic models for structured LLM output -------------------------------- + +class IdentifiedFlaw(BaseModel): + """A discrete flaw found in a pipeline output file.""" + description: str = Field(description="One-sentence description of the flaw") + evidence: str = Field(description="Direct quote from the file demonstrating the flaw") + severity: Literal["HIGH", "MEDIUM", "LOW"] = Field( + description="HIGH: fabricated data or missing critical analysis. MEDIUM: weak reasoning or vague claims. LOW: minor gaps." + ) + + +class FlawIdentificationResult(BaseModel): + """Result of analyzing a file for flaws.""" + flaws: list[IdentifiedFlaw] = Field(description="List of discrete flaws found in the file") + + +class UpstreamCheckResult(BaseModel): + """Result of checking an upstream file for a flaw precursor.""" + found: bool = Field(description="True if this file contains the flaw or a precursor to it") + evidence: str | None = Field(description="Direct quote from the file if found, null otherwise") + explanation: str = Field(description="How this connects to the downstream flaw, or why this file is clean") + + +class SourceCodeAnalysisResult(BaseModel): + """Result of analyzing source code at a flaw's origin stage.""" + likely_cause: str = Field(description="What in the prompt or logic likely caused the flaw") + relevant_code_section: str = Field(description="The specific code or prompt text responsible") + suggestion: str = Field(description="How to fix or prevent this flaw") + + +# -- Prompt builders ----------------------------------------------------------- + +def 
build_flaw_identification_messages( + filename: str, + file_content: str, + user_flaw_description: str, +) -> list[ChatMessage]: + """Build messages for Phase 1: identifying discrete flaws in a file.""" + system = ( + "You are analyzing an intermediary file from a project planning pipeline.\n" + "The user has identified problems in this output. Identify each discrete flaw.\n" + "For each flaw, provide a short description (one sentence), a direct quote " + "from the file as evidence, and a severity level.\n" + "Only identify real flaws — do not flag stylistic preferences or minor formatting issues.\n" + "Severity levels:\n" + "- HIGH: fabricated data, invented statistics, or missing critical analysis\n" + "- MEDIUM: weak reasoning, vague unsupported claims, or shallow treatment\n" + "- LOW: minor gaps that don't significantly impact the plan" + ) + user = ( + f"User's observation:\n{user_flaw_description}\n\n" + f"Filename: {filename}\n" + f"File content:\n{file_content}" + ) + return [ + ChatMessage(role=MessageRole.SYSTEM, content=system), + ChatMessage(role=MessageRole.USER, content=user), + ] + + +def build_upstream_check_messages( + flaw_description: str, + evidence_quote: str, + upstream_filename: str, + upstream_file_content: str, +) -> list[ChatMessage]: + """Build messages for Phase 2: checking if a flaw exists in an upstream file.""" + system = ( + "You are tracing a flaw through a project planning pipeline to find where it originated.\n" + "A downstream file contains a flaw. You are examining an upstream file that was an input " + "to the stage that produced the flawed output.\n" + "Determine if this upstream file contains the same problem or a precursor to it.\n" + "If YES: quote the relevant passage and explain how it connects to the downstream flaw.\n" + "If NO: explain why this file is clean regarding this specific flaw." 
+ ) + user = ( + f"Flaw: {flaw_description}\n" + f"Evidence from downstream: {evidence_quote}\n\n" + f"Upstream filename: {upstream_filename}\n" + f"Upstream file content:\n{upstream_file_content}" + ) + return [ + ChatMessage(role=MessageRole.SYSTEM, content=system), + ChatMessage(role=MessageRole.USER, content=user), + ] + + +def build_source_code_analysis_messages( + flaw_description: str, + evidence_quote: str, + source_code_contents: list[tuple[str, str]], +) -> list[ChatMessage]: + """Build messages for Phase 3: analyzing source code at flaw origin. + + Args: + source_code_contents: list of (filename, content) tuples + """ + system = ( + "A flaw was introduced at this pipeline stage. The flaw exists in its output " + "but NOT in any of its inputs, so this stage created it.\n" + "Examine the source code to identify what in the prompt text, logic, or processing " + "likely caused this flaw. Be specific — point to lines or prompt phrases.\n" + "Focus on the system prompt text and the data transformation logic." 
+ ) + source_sections = [] + for fname, content in source_code_contents: + source_sections.append(f"--- {fname} ---\n{content}") + source_text = "\n\n".join(source_sections) + + user = ( + f"Flaw: {flaw_description}\n" + f"Evidence from output: {evidence_quote}\n\n" + f"Source code files:\n{source_text}" + ) + return [ + ChatMessage(role=MessageRole.SYSTEM, content=system), + ChatMessage(role=MessageRole.USER, content=user), + ] diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py new file mode 100644 index 000000000..cbe79c920 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py @@ -0,0 +1,127 @@ +# worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +import unittest +from llama_index.core.llms import ChatMessage, MessageRole +from worker_plan_internal.flaw_tracer.prompts import ( + IdentifiedFlaw, + FlawIdentificationResult, + UpstreamCheckResult, + SourceCodeAnalysisResult, + build_flaw_identification_messages, + build_upstream_check_messages, + build_source_code_analysis_messages, +) + + +class TestPydanticModels(unittest.TestCase): + def test_identified_flaw_valid(self): + flaw = IdentifiedFlaw( + description="Budget figure is fabricated", + evidence="The budget is CZK 500,000", + severity="HIGH", + ) + self.assertEqual(flaw.severity, "HIGH") + + def test_identified_flaw_rejects_invalid_severity(self): + with self.assertRaises(Exception): + IdentifiedFlaw( + description="test", + evidence="test", + severity="CRITICAL", + ) + + def test_flaw_identification_result(self): + result = FlawIdentificationResult(flaws=[ + IdentifiedFlaw(description="test", evidence="quote", severity="LOW"), + ]) + self.assertEqual(len(result.flaws), 1) + + def test_upstream_check_result_found(self): + result = UpstreamCheckResult(found=True, evidence="quote", explanation="precursor") + self.assertTrue(result.found) + 
self.assertEqual(result.evidence, "quote") + + def test_upstream_check_result_not_found(self): + result = UpstreamCheckResult(found=False, evidence=None, explanation="clean") + self.assertFalse(result.found) + + def test_source_code_analysis_result(self): + result = SourceCodeAnalysisResult( + likely_cause="prompt lacks validation", + relevant_code_section="system_prompt = ...", + suggestion="add grounding check", + ) + self.assertIsInstance(result.likely_cause, str) + + +class TestBuildFlawIdentificationMessages(unittest.TestCase): + def test_returns_chat_messages(self): + messages = build_flaw_identification_messages( + filename="030-report.html", + file_content="report content", + user_flaw_description="budget is wrong", + ) + self.assertIsInstance(messages, list) + self.assertEqual(len(messages), 2) + self.assertEqual(messages[0].role, MessageRole.SYSTEM) + self.assertEqual(messages[1].role, MessageRole.USER) + + def test_user_message_contains_inputs(self): + messages = build_flaw_identification_messages( + filename="025-2-executive_summary.md", + file_content="# Summary\nBudget: 500k", + user_flaw_description="fabricated budget", + ) + user_content = messages[1].content + self.assertIn("025-2-executive_summary.md", user_content) + self.assertIn("# Summary", user_content) + self.assertIn("fabricated budget", user_content) + + +class TestBuildUpstreamCheckMessages(unittest.TestCase): + def test_returns_chat_messages(self): + messages = build_upstream_check_messages( + flaw_description="Budget is fabricated", + evidence_quote="CZK 500,000", + upstream_filename="005-2-project_plan.md", + upstream_file_content="# Project Plan\nBudget: 500k", + ) + self.assertIsInstance(messages, list) + self.assertEqual(len(messages), 2) + + def test_user_message_contains_flaw_and_upstream(self): + messages = build_upstream_check_messages( + flaw_description="Missing market sizing", + evidence_quote="growing Czech market", + upstream_filename="003-5-make_assumptions.md", + 
upstream_file_content="# Assumptions\nMarket is growing", + ) + user_content = messages[1].content + self.assertIn("Missing market sizing", user_content) + self.assertIn("growing Czech market", user_content) + self.assertIn("003-5-make_assumptions.md", user_content) + + +class TestBuildSourceCodeAnalysisMessages(unittest.TestCase): + def test_returns_chat_messages(self): + messages = build_source_code_analysis_messages( + flaw_description="Budget fabricated", + evidence_quote="CZK 500,000", + source_code_contents=[ + ("stages/make_assumptions.py", "class MakeAssumptionsTask: ..."), + ("assume/make_assumptions.py", "def execute(llm, query): ..."), + ], + ) + self.assertIsInstance(messages, list) + self.assertEqual(len(messages), 2) + + def test_user_message_contains_source_code(self): + messages = build_source_code_analysis_messages( + flaw_description="Missing analysis", + evidence_quote="no data", + source_code_contents=[ + ("my_stage.py", "SYSTEM_PROMPT = 'Generate assumptions'"), + ], + ) + user_content = messages[1].content + self.assertIn("my_stage.py", user_content) + self.assertIn("SYSTEM_PROMPT", user_content) From b1cdb29529335df075b3b5fa58cbe7cf28d0edd2 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:26:04 +0200 Subject: [PATCH 04/37] feat: add flaw_tracer recursive tracing algorithm FlawTracer orchestrates three-phase flaw tracing through the pipeline DAG: - Phase 1: LLM-based flaw identification in starting file - Phase 2: Recursive upstream tracing with deduplication and max depth - Phase 3: Source code analysis at flaw origin stages Tests mock the LLM-calling methods to verify tracing logic, deduplication, depth limits, multi-flaw handling, and depth-sorted output. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/tests/test_tracer.py | 413 ++++++++++++++++++ .../flaw_tracer/tracer.py | 280 ++++++++++++ 2 files changed, 693 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/tracer.py diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py new file mode 100644 index 000000000..c5734bbbe --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py @@ -0,0 +1,413 @@ +# worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +"""Tests for the flaw tracer recursive algorithm. + +Since ResponseMockLLM does NOT support as_structured_llm(), we mock the three +private LLM-calling methods (_identify_flaws, _check_upstream, +_analyze_source_code) directly. This tests the tracing logic — recursion, +deduplication, max depth — which is the important part. 
+""" +import json +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest.mock import patch, MagicMock + +from worker_plan_internal.flaw_tracer.tracer import ( + FlawTracer, + FlawTraceResult, + TracedFlaw, + TraceEntry, + OriginInfo, +) +from worker_plan_internal.flaw_tracer.prompts import ( + FlawIdentificationResult, + IdentifiedFlaw, + UpstreamCheckResult, + SourceCodeAnalysisResult, +) +from worker_plan_internal.llm_util.response_mockllm import ResponseMockLLM +from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelWithInstance + + +def _make_executor() -> LLMExecutor: + """Create a dummy LLMExecutor (won't actually be called when methods are mocked).""" + llm = ResponseMockLLM(responses=["unused"]) + llm_models = LLMModelWithInstance.from_instances([llm]) + return LLMExecutor(llm_models=llm_models) + + +def _make_tracer(output_dir: Path, max_depth: int = 15, verbose: bool = False) -> FlawTracer: + """Create a FlawTracer with a dummy executor and a real source_code_base.""" + executor = _make_executor() + source_base = Path(__file__).resolve().parent.parent.parent.parent # worker_plan/ + return FlawTracer( + output_dir=output_dir, + llm_executor=executor, + source_code_base=source_base, + max_depth=max_depth, + verbose=verbose, + ) + + +class TestFlawTraceResult(unittest.TestCase): + def test_dataclass_creation(self): + result = FlawTraceResult( + starting_file="030-report.html", + flaw_description="test", + output_dir="/tmp/test", + flaws=[], + llm_calls_made=0, + ) + self.assertEqual(result.starting_file, "030-report.html") + self.assertEqual(len(result.flaws), 0) + self.assertEqual(result.llm_calls_made, 0) + + def test_dataclass_with_flaws(self): + flaw = TracedFlaw( + id="flaw_001", + description="Budget fabricated", + severity="HIGH", + starting_evidence="CZK 500,000", + trace=[TraceEntry(stage="test", file="test.md", evidence="ev")], + ) + result = FlawTraceResult( + 
starting_file="test.md", + flaw_description="test", + output_dir="/tmp/test", + flaws=[flaw], + llm_calls_made=1, + ) + self.assertEqual(len(result.flaws), 1) + self.assertEqual(result.flaws[0].severity, "HIGH") + + +class TestTracedFlaw(unittest.TestCase): + def test_defaults(self): + flaw = TracedFlaw( + id="flaw_001", + description="test", + severity="LOW", + starting_evidence="ev", + trace=[], + ) + self.assertIsNone(flaw.origin_stage) + self.assertIsNone(flaw.origin) + self.assertEqual(flaw.depth, 0) + self.assertTrue(flaw.trace_complete) + + +class TestFlawTracerPhase1(unittest.TestCase): + """Test flaw identification (Phase 1) with mocked LLM methods.""" + + def test_identify_flaws_returns_flaws(self): + """The tracer should produce TracedFlaw objects from Phase 1 identification.""" + with TemporaryDirectory() as d: + output_dir = Path(d) + # Create a minimal output file + report_file = output_dir / "025-2-executive_summary.md" + report_file.write_text("# Summary\nBudget: CZK 500,000", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + # Mock Phase 1: identify flaws + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw( + description="Budget is unvalidated", + evidence="CZK 500,000", + severity="HIGH", + ) + ] + ) + + # Mock Phase 2: upstream check — not found (no upstream files on disk) + # Mock Phase 3: source code analysis + mock_analysis = SourceCodeAnalysisResult( + likely_cause="Prompt asks for budget without data", + relevant_code_section="system_prompt = ...", + suggestion="Add validation step", + ) + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_analyze_source_code') as mock_analyze: + result = tracer.trace("025-2-executive_summary.md", "budget is unvalidated") + + self.assertIsInstance(result, FlawTraceResult) + self.assertGreaterEqual(len(result.flaws), 1) + flaw = result.flaws[0] + self.assertEqual(flaw.description, "Budget is unvalidated") + 
self.assertEqual(flaw.severity, "HIGH") + + def test_file_not_found_raises(self): + """The tracer should raise FileNotFoundError for missing starting files.""" + with TemporaryDirectory() as d: + tracer = _make_tracer(Path(d)) + with self.assertRaises(FileNotFoundError): + tracer.trace("nonexistent.md", "test") + + +class TestFlawTracerUpstreamTrace(unittest.TestCase): + """Test upstream tracing (Phase 2) with a simple two-level chain.""" + + def test_traces_flaw_upstream(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + # Create files for a chain: executive_summary -> project_plan -> setup + (output_dir / "025-2-executive_summary.md").write_text("Budget: CZK 500,000", encoding="utf-8") + (output_dir / "005-2-project_plan.md").write_text("Budget: CZK 500,000", encoding="utf-8") + (output_dir / "001-2-plan.txt").write_text("Open a tea shop", encoding="utf-8") + # Create other upstream files that executive_summary depends on + (output_dir / "002-14-strategic_decisions.md").write_text("decisions", encoding="utf-8") + (output_dir / "002-19-scenarios.md").write_text("scenarios", encoding="utf-8") + (output_dir / "003-10-consolidate_assumptions_full.md").write_text("assumptions", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + # Mock Phase 1: identify flaws + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw( + description="Budget fabricated", + evidence="CZK 500,000", + severity="HIGH", + ) + ] + ) + + # Track upstream check calls to return different results per file + upstream_call_count = 0 + upstream_responses = {} + + def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + nonlocal upstream_call_count + upstream_call_count += 1 + # project_plan has the flaw; others are clean + if "project_plan" in upstream_filename: + return UpstreamCheckResult( + found=True, + evidence="Budget: CZK 500,000", + explanation="Budget originates here", + ) + else: + return UpstreamCheckResult( + 
found=False, + evidence=None, + explanation="clean", + ) + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ + patch.object(tracer, '_analyze_source_code'): + result = tracer.trace("025-2-executive_summary.md", "budget is fabricated") + + self.assertEqual(len(result.flaws), 1) + flaw = result.flaws[0] + # The trace should include at least executive_summary and project_plan + trace_stages = [entry.stage for entry in flaw.trace] + self.assertIn("executive_summary", trace_stages) + self.assertIn("project_plan", trace_stages) + # Origin should be project_plan (flaw found there but not in its upstream 'setup') + self.assertEqual(flaw.origin_stage, "project_plan") + + def test_deduplication_works(self): + """Stages already checked for the same flaw should be skipped.""" + with TemporaryDirectory() as d: + output_dir = Path(d) + # executive_summary depends on strategic_decisions_markdown, scenarios_markdown, etc. + # project_plan also depends on strategic_decisions_markdown, scenarios_markdown. + # When we trace through project_plan, those shared upstreams should be skipped. 
+ (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k", encoding="utf-8") + (output_dir / "005-2-project_plan.md").write_text("Budget: 500k", encoding="utf-8") + (output_dir / "001-2-plan.txt").write_text("Open a tea shop", encoding="utf-8") + (output_dir / "002-14-strategic_decisions.md").write_text("decisions", encoding="utf-8") + (output_dir / "002-19-scenarios.md").write_text("scenarios", encoding="utf-8") + (output_dir / "003-10-consolidate_assumptions_full.md").write_text("assumptions", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw(description="Budget fabricated", evidence="500k", severity="HIGH") + ] + ) + + checked_stages = [] + + def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + checked_stages.append(upstream_filename) + if "project_plan" in upstream_filename: + return UpstreamCheckResult(found=True, evidence="500k", explanation="found here") + return UpstreamCheckResult(found=False, evidence=None, explanation="clean") + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ + patch.object(tracer, '_analyze_source_code'): + result = tracer.trace("025-2-executive_summary.md", "budget fabricated") + + # Count unique filenames checked — dedup should prevent re-checking + # strategic_decisions and scenarios at the project_plan level + unique_checked = set(checked_stages) + # Each file should appear at most once + self.assertEqual(len(checked_stages), len(unique_checked), + f"Dedup failed: checked {checked_stages}") + + +class TestFlawTracerMaxDepth(unittest.TestCase): + def test_respects_max_depth_zero(self): + """With max_depth=0, no upstream tracing happens.""" + with TemporaryDirectory() as d: + output_dir = Path(d) + (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k", encoding="utf-8") + + 
tracer = _make_tracer(output_dir, max_depth=0) + + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw(description="test flaw", evidence="500k", severity="LOW") + ] + ) + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_check_upstream') as mock_check, \ + patch.object(tracer, '_analyze_source_code'): + result = tracer.trace("025-2-executive_summary.md", "test") + + self.assertEqual(len(result.flaws), 1) + # With max_depth=0, no upstream tracing happens + self.assertEqual(len(result.flaws[0].trace), 1) # only the starting file + # _check_upstream should never have been called + mock_check.assert_not_called() + + def test_max_depth_limits_recursion(self): + """With max_depth=1, tracing should stop after one level of upstream.""" + with TemporaryDirectory() as d: + output_dir = Path(d) + (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k", encoding="utf-8") + (output_dir / "005-2-project_plan.md").write_text("Budget: 500k", encoding="utf-8") + (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") + (output_dir / "002-14-strategic_decisions.md").write_text("decisions", encoding="utf-8") + (output_dir / "002-19-scenarios.md").write_text("scenarios", encoding="utf-8") + (output_dir / "003-10-consolidate_assumptions_full.md").write_text("assumptions", encoding="utf-8") + + tracer = _make_tracer(output_dir, max_depth=1) + + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw(description="flaw", evidence="500k", severity="MEDIUM") + ] + ) + + def always_found(flaw_desc, evidence, upstream_filename, upstream_content): + return UpstreamCheckResult(found=True, evidence="500k", explanation="found") + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_check_upstream', side_effect=always_found), \ + patch.object(tracer, '_analyze_source_code'): + result = 
tracer.trace("025-2-executive_summary.md", "test") + + self.assertEqual(len(result.flaws), 1) + flaw = result.flaws[0] + # trace_complete should be False because max depth was hit + self.assertFalse(flaw.trace_complete) + + +class TestFlawTracerSourceCodeAnalysis(unittest.TestCase): + """Test that Phase 3 source code analysis is invoked at the origin stage.""" + + def test_source_code_analysis_called_at_origin(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw(description="flaw", evidence="500k", severity="HIGH") + ] + ) + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_analyze_source_code') as mock_analyze: + result = tracer.trace("025-2-executive_summary.md", "test") + + # _analyze_source_code should have been called once for the origin + mock_analyze.assert_called_once() + args = mock_analyze.call_args + # First positional arg is the TracedFlaw, second is the stage name + self.assertEqual(args[0][1], "executive_summary") + + +class TestFlawTracerMultipleFlaws(unittest.TestCase): + """Test that multiple flaws are traced independently.""" + + def test_traces_multiple_flaws(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k\nTimeline: 2 months", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw(description="Budget fabricated", evidence="500k", severity="HIGH"), + IdentifiedFlaw(description="Timeline unrealistic", evidence="2 months", severity="MEDIUM"), + ] + ) + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_analyze_source_code'): + result = 
tracer.trace("025-2-executive_summary.md", "multiple issues") + + self.assertEqual(len(result.flaws), 2) + descriptions = {f.description for f in result.flaws} + self.assertIn("Budget fabricated", descriptions) + self.assertIn("Timeline unrealistic", descriptions) + # Each flaw should have a unique ID + ids = [f.id for f in result.flaws] + self.assertEqual(len(ids), len(set(ids))) + + +class TestFlawTracerSortsByDepth(unittest.TestCase): + """Test that results are sorted by depth (deepest origin first).""" + + def test_flaws_sorted_by_depth_descending(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + (output_dir / "025-2-executive_summary.md").write_text("content", encoding="utf-8") + (output_dir / "005-2-project_plan.md").write_text("content", encoding="utf-8") + (output_dir / "002-14-strategic_decisions.md").write_text("content", encoding="utf-8") + (output_dir / "002-19-scenarios.md").write_text("content", encoding="utf-8") + (output_dir / "003-10-consolidate_assumptions_full.md").write_text("content", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + mock_identification = FlawIdentificationResult( + flaws=[ + IdentifiedFlaw(description="shallow flaw", evidence="ev1", severity="LOW"), + IdentifiedFlaw(description="deep flaw", evidence="ev2", severity="HIGH"), + ] + ) + + call_count = 0 + + def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + nonlocal call_count + call_count += 1 + # For "deep flaw", find it in project_plan + if "deep flaw" in flaw_desc and "project_plan" in upstream_filename: + return UpstreamCheckResult(found=True, evidence="ev2", explanation="found") + return UpstreamCheckResult(found=False, evidence=None, explanation="clean") + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ + patch.object(tracer, '_analyze_source_code'): + result = 
tracer.trace("025-2-executive_summary.md", "test") + + self.assertEqual(len(result.flaws), 2) + # Deepest origin should be first + self.assertGreaterEqual(result.flaws[0].depth, result.flaws[1].depth) + + +if __name__ == "__main__": + unittest.main() diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py new file mode 100644 index 000000000..f3b0fc744 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -0,0 +1,280 @@ +# worker_plan/worker_plan_internal/flaw_tracer/tracer.py +"""Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" +import json +import logging +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +from llama_index.core.llms.llm import LLM + +from worker_plan_internal.flaw_tracer.registry import ( + find_stage_by_filename, + get_upstream_files, + get_source_code_paths, +) +from worker_plan_internal.flaw_tracer.prompts import ( + FlawIdentificationResult, + UpstreamCheckResult, + SourceCodeAnalysisResult, + build_flaw_identification_messages, + build_upstream_check_messages, + build_source_code_analysis_messages, +) +from worker_plan_internal.llm_util.llm_executor import LLMExecutor + +logger = logging.getLogger(__name__) + + +@dataclass +class TraceEntry: + """One hop in a flaw's upstream trace.""" + stage: str + file: str + evidence: str + is_origin: bool = False + + +@dataclass +class OriginInfo: + """Source code analysis at a flaw's origin stage.""" + stage: str + file: str + source_code_files: list[str] + likely_cause: str + suggestion: str + + +@dataclass +class TracedFlaw: + """A fully traced flaw with its upstream chain.""" + id: str + description: str + severity: str + starting_evidence: str + trace: list[TraceEntry] + origin_stage: Optional[str] = None + origin: Optional[OriginInfo] = None + depth: int = 0 + trace_complete: bool = True + + +@dataclass +class FlawTraceResult: + 
"""Complete result of a flaw trace run.""" + starting_file: str + flaw_description: str + output_dir: str + flaws: list[TracedFlaw] + llm_calls_made: int = 0 + + +class FlawTracer: + """Traces flaws upstream through the PlanExe pipeline DAG.""" + + def __init__( + self, + output_dir: Path, + llm_executor: LLMExecutor, + source_code_base: Path, + max_depth: int = 15, + verbose: bool = False, + ): + self.output_dir = output_dir + self.llm_executor = llm_executor + self.source_code_base = source_code_base + self.max_depth = max_depth + self.verbose = verbose + self._llm_calls = 0 + self._checked: set[tuple[str, str]] = set() # (stage_name, flaw_description) dedup + + def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: + """Main entry point. Identify flaws and trace each upstream.""" + self._llm_calls = 0 + self._checked.clear() + + file_path = self.output_dir / starting_file + if not file_path.exists(): + raise FileNotFoundError(f"Starting file not found: {file_path}") + + file_content = file_path.read_text(encoding="utf-8") + stage = find_stage_by_filename(starting_file) + stage_name = stage.name if stage else "unknown" + + # Phase 1: Identify flaws + self._log(f"Phase 1: Identifying flaws in {starting_file}") + identified = self._identify_flaws(starting_file, file_content, flaw_description) + self._log(f" Found {len(identified.flaws)} flaw(s)") + + traced_flaws: list[TracedFlaw] = [] + for i, flaw in enumerate(identified.flaws): + flaw_id = f"flaw_{i + 1:03d}" + self._log(f"\nTracing {flaw_id}: {flaw.description}") + + starting_entry = TraceEntry( + stage=stage_name, + file=starting_file, + evidence=flaw.evidence, + is_origin=False, + ) + + traced = TracedFlaw( + id=flaw_id, + description=flaw.description, + severity=flaw.severity, + starting_evidence=flaw.evidence, + trace=[starting_entry], + ) + + if stage and self.max_depth > 0: + self._trace_upstream(traced, stage_name, flaw.description, flaw.evidence, depth=0) + + # Mark the last 
trace entry as origin if no deeper origin was found + if traced.origin_stage is None and traced.trace: + last = traced.trace[-1] + last.is_origin = True + traced.origin_stage = last.stage + traced.depth = len(traced.trace) - 1 + + # Phase 3: Source code analysis at origin + self._analyze_source_code(traced, last.stage, flaw.description, last.evidence) + + traced_flaws.append(traced) + + # Sort by depth (deepest origin first) + traced_flaws.sort(key=lambda f: f.depth, reverse=True) + + return FlawTraceResult( + starting_file=starting_file, + flaw_description=flaw_description, + output_dir=str(self.output_dir), + flaws=traced_flaws, + llm_calls_made=self._llm_calls, + ) + + def _identify_flaws(self, filename: str, file_content: str, user_description: str) -> FlawIdentificationResult: + """Phase 1: Ask LLM to identify discrete flaws in the starting file.""" + messages = build_flaw_identification_messages(filename, file_content, user_description) + + def execute(llm: LLM) -> FlawIdentificationResult: + sllm = llm.as_structured_llm(FlawIdentificationResult) + response = sllm.chat(messages) + return response.raw + + self._llm_calls += 1 + return self.llm_executor.run(execute) + + def _check_upstream(self, flaw_description: str, evidence: str, upstream_filename: str, upstream_content: str) -> UpstreamCheckResult: + """Phase 2: Ask LLM if a flaw exists in an upstream file.""" + messages = build_upstream_check_messages(flaw_description, evidence, upstream_filename, upstream_content) + + def execute(llm: LLM) -> UpstreamCheckResult: + sllm = llm.as_structured_llm(UpstreamCheckResult) + response = sllm.chat(messages) + return response.raw + + self._llm_calls += 1 + return self.llm_executor.run(execute) + + def _trace_upstream( + self, + traced: TracedFlaw, + current_stage: str, + flaw_description: str, + evidence: str, + depth: int, + ) -> None: + """Recursively trace a flaw through upstream stages.""" + if depth >= self.max_depth: + traced.trace_complete = False + 
self._log(f" Max depth {self.max_depth} reached at {current_stage}") + return + + upstream_files = get_upstream_files(current_stage, self.output_dir) + if not upstream_files: + return # No upstream = this is the origin + + found_upstream = False + for upstream_name, upstream_path in upstream_files: + dedup_key = (upstream_name, flaw_description) + if dedup_key in self._checked: + self._log(f" Skipping {upstream_name} (already checked for this flaw)") + continue + self._checked.add(dedup_key) + + upstream_content = upstream_path.read_text(encoding="utf-8") + self._log(f" Checking upstream: {upstream_name} ({upstream_path.name})") + + result = self._check_upstream(flaw_description, evidence, upstream_path.name, upstream_content) + + if result.found: + self._log(f" -> FOUND in {upstream_name}") + found_upstream = True + entry = TraceEntry( + stage=upstream_name, + file=upstream_path.name, + evidence=result.evidence or "", + is_origin=False, + ) + traced.trace.append(entry) + + # Recurse deeper + self._trace_upstream( + traced, upstream_name, flaw_description, + result.evidence or evidence, depth + 1, + ) + # After recursion, if origin was found deeper, stop tracing other branches + if traced.origin_stage is not None: + return + + if not found_upstream: + # Current stage is the origin — flaw exists here but not in any upstream + traced.origin_stage = current_stage + traced.depth = len(traced.trace) + # Mark the current stage entry as origin + for entry in traced.trace: + if entry.stage == current_stage: + entry.is_origin = True + + def _analyze_source_code(self, traced: TracedFlaw, stage_name: str, flaw_description: str, evidence: str) -> None: + """Phase 3: Analyze source code at the origin stage.""" + source_paths = get_source_code_paths(stage_name) + if not source_paths: + return + + source_contents: list[tuple[str, str]] = [] + for path in source_paths: + if path.exists(): + content = path.read_text(encoding="utf-8") + source_contents.append((path.name, content)) + 
+ if not source_contents: + return + + self._log(f" Phase 3: Analyzing source code for {stage_name}") + messages = build_source_code_analysis_messages(flaw_description, evidence, source_contents) + + def execute(llm: LLM) -> SourceCodeAnalysisResult: + sllm = llm.as_structured_llm(SourceCodeAnalysisResult) + response = sllm.chat(messages) + return response.raw + + self._llm_calls += 1 + try: + analysis = self.llm_executor.run(execute) + source_file_names = [name for name, _ in source_contents] + traced.origin = OriginInfo( + stage=stage_name, + file=traced.trace[-1].file if traced.trace else "", + source_code_files=source_file_names, + likely_cause=analysis.likely_cause, + suggestion=analysis.suggestion, + ) + except Exception as e: + logger.warning(f"Source code analysis failed for {stage_name}: {e}") + + def _log(self, message: str) -> None: + """Print to stderr if verbose mode is enabled.""" + if self.verbose: + print(message, file=sys.stderr) From d05f78d58637123aee019df3222917cd581a24f4 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:31:48 +0200 Subject: [PATCH 05/37] fix: ensure Phase 3 source code analysis runs for upstream-traced flaws Previously, _analyze_source_code was only called when no upstream origin was found (the fallback path). When _trace_upstream successfully identified a deeper origin, Phase 3 was skipped entirely. Now Phase 3 runs whenever an origin stage is known, regardless of how it was determined. Also removes unused imports (json in tracer.py, MagicMock and json in test_tracer.py) and adds a test verifying Phase 3 is called at a deep upstream origin. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/tests/test_tracer.py | 42 ++++++++++++++++++- .../flaw_tracer/tracer.py | 9 ++-- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py index c5734bbbe..15a9effe2 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py @@ -6,11 +6,10 @@ _analyze_source_code) directly. This tests the tracing logic — recursion, deduplication, max depth — which is the important part. """ -import json import unittest from pathlib import Path from tempfile import TemporaryDirectory -from unittest.mock import patch, MagicMock +from unittest.mock import patch from worker_plan_internal.flaw_tracer.tracer import ( FlawTracer, @@ -337,6 +336,45 @@ def test_source_code_analysis_called_at_origin(self): # First positional arg is the TracedFlaw, second is the stage name self.assertEqual(args[0][1], "executive_summary") + def test_source_code_analysis_called_at_deep_origin(self): + """Phase 3 should run when the origin is found at a deeper upstream stage.""" + with TemporaryDirectory() as d: + output_dir = Path(d) + # Create files for a chain: executive_summary -> project_plan (origin) + (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k", encoding="utf-8") + (output_dir / "005-2-project_plan.md").write_text("Budget: 500k", encoding="utf-8") + (output_dir / "001-2-plan.txt").write_text("Open a tea shop", encoding="utf-8") + (output_dir / "002-14-strategic_decisions.md").write_text("decisions", encoding="utf-8") + (output_dir / "002-19-scenarios.md").write_text("scenarios", encoding="utf-8") + (output_dir / "003-10-consolidate_assumptions_full.md").write_text("assumptions", encoding="utf-8") + + tracer = _make_tracer(output_dir) + + mock_identification = FlawIdentificationResult( + 
flaws=[ + IdentifiedFlaw(description="Budget fabricated", evidence="500k", severity="HIGH") + ] + ) + + def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + # project_plan has the flaw; others are clean + if "project_plan" in upstream_filename: + return UpstreamCheckResult( + found=True, evidence="Budget: 500k", explanation="Budget originates here" + ) + return UpstreamCheckResult(found=False, evidence=None, explanation="clean") + + with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ + patch.object(tracer, '_analyze_source_code') as mock_analyze: + result = tracer.trace("025-2-executive_summary.md", "budget fabricated") + + # Phase 3 should have been called at the deep origin (project_plan) + mock_analyze.assert_called_once() + args = mock_analyze.call_args + # Second positional arg is the origin stage name + self.assertEqual(args[0][1], "project_plan") + class TestFlawTracerMultipleFlaws(unittest.TestCase): """Test that multiple flaws are traced independently.""" diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index f3b0fc744..ae328a0b8 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -1,6 +1,5 @@ # worker_plan/worker_plan_internal/flaw_tracer/tracer.py """Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" -import json import logging import sys from dataclasses import dataclass, field @@ -137,8 +136,12 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: traced.origin_stage = last.stage traced.depth = len(traced.trace) - 1 - # Phase 3: Source code analysis at origin - self._analyze_source_code(traced, last.stage, flaw.description, last.evidence) + # Phase 3: Source code analysis at origin (always, when origin is known) + if 
traced.origin_stage is not None: + self._analyze_source_code( + traced, traced.origin_stage, flaw.description, + next((e.evidence for e in traced.trace if e.stage == traced.origin_stage), flaw.evidence) + ) traced_flaws.append(traced) From 0435abe9d02fd48b021528b624f5fa846b3f20ef Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:41:14 +0200 Subject: [PATCH 06/37] =?UTF-8?q?refactor:=20clean=20up=20tracer.py=20?= =?UTF-8?q?=E2=80=94=20remove=20unused=20imports,=20params,=20modernize=20?= =?UTF-8?q?types?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unused `field` import from dataclasses - Remove unused `source_code_base` parameter from FlawTracer.__init__() (registry handles source code path resolution via its own _SOURCE_BASE) - Replace `Optional[X]` with `X | None` using `from __future__ import annotations` - Add clarifying comments for dedup strategy and first-match-wins logic - Remove dead `mock_analysis` variable and unused `SourceCodeAnalysisResult` import from test_tracer.py Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/tests/test_tracer.py | 13 +------------ .../worker_plan_internal/flaw_tracer/tracer.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py index 15a9effe2..18c9d6ea8 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py @@ -22,7 +22,6 @@ FlawIdentificationResult, IdentifiedFlaw, UpstreamCheckResult, - SourceCodeAnalysisResult, ) from worker_plan_internal.llm_util.response_mockllm import ResponseMockLLM from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelWithInstance @@ -36,13 +35,11 @@ def _make_executor() -> LLMExecutor: def _make_tracer(output_dir: Path, 
max_depth: int = 15, verbose: bool = False) -> FlawTracer: - """Create a FlawTracer with a dummy executor and a real source_code_base.""" + """Create a FlawTracer with a dummy executor.""" executor = _make_executor() - source_base = Path(__file__).resolve().parent.parent.parent.parent # worker_plan/ return FlawTracer( output_dir=output_dir, llm_executor=executor, - source_code_base=source_base, max_depth=max_depth, verbose=verbose, ) @@ -119,14 +116,6 @@ def test_identify_flaws_returns_flaws(self): ] ) - # Mock Phase 2: upstream check — not found (no upstream files on disk) - # Mock Phase 3: source code analysis - mock_analysis = SourceCodeAnalysisResult( - likely_cause="Prompt asks for budget without data", - relevant_code_section="system_prompt = ...", - suggestion="Add validation step", - ) - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ patch.object(tracer, '_analyze_source_code') as mock_analyze: result = tracer.trace("025-2-executive_summary.md", "budget is unvalidated") diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index ae328a0b8..54a7fc796 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -1,10 +1,11 @@ # worker_plan/worker_plan_internal/flaw_tracer/tracer.py """Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" +from __future__ import annotations + import logging import sys -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path -from typing import Optional from llama_index.core.llms.llm import LLM @@ -53,8 +54,8 @@ class TracedFlaw: severity: str starting_evidence: str trace: list[TraceEntry] - origin_stage: Optional[str] = None - origin: Optional[OriginInfo] = None + origin_stage: str | None = None + origin: OriginInfo | None = None depth: int = 0 trace_complete: bool = True @@ -76,13 +77,11 @@ def 
__init__( self, output_dir: Path, llm_executor: LLMExecutor, - source_code_base: Path, max_depth: int = 15, verbose: bool = False, ): self.output_dir = output_dir self.llm_executor = llm_executor - self.source_code_base = source_code_base self.max_depth = max_depth self.verbose = verbose self._llm_calls = 0 @@ -200,6 +199,9 @@ def _trace_upstream( found_upstream = False for upstream_name, upstream_path in upstream_files: + # Dedup key uses flaw_description so different flaws get independent + # upstream checks. If the LLM returns duplicate descriptions, they + # share check results. dedup_key = (upstream_name, flaw_description) if dedup_key in self._checked: self._log(f" Skipping {upstream_name} (already checked for this flaw)") @@ -227,7 +229,8 @@ def _trace_upstream( traced, upstream_name, flaw_description, result.evidence or evidence, depth + 1, ) - # After recursion, if origin was found deeper, stop tracing other branches + # First-match-wins: once an origin is found in one upstream + # branch, stop exploring others. 
if traced.origin_stage is not None: return From 5c7dd8223a3662355b3dbe84c8efac8b6c49ba85 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:47:13 +0200 Subject: [PATCH 07/37] feat: add flaw_tracer JSON and markdown report generation Co-Authored-By: Claude Sonnet 4.6 --- .../flaw_tracer/output.py | 131 ++++++++++++++++ .../flaw_tracer/tests/test_output.py | 141 ++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/output.py create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py diff --git a/worker_plan/worker_plan_internal/flaw_tracer/output.py b/worker_plan/worker_plan_internal/flaw_tracer/output.py new file mode 100644 index 000000000..f497ae4bc --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/output.py @@ -0,0 +1,131 @@ +# worker_plan/worker_plan_internal/flaw_tracer/output.py +"""JSON and markdown report generation for flaw trace results.""" +import json +from datetime import datetime, UTC +from pathlib import Path + +from worker_plan_internal.flaw_tracer.tracer import FlawTraceResult + + +def write_json_report(result: FlawTraceResult, output_path: Path) -> None: + """Write the flaw trace result as a JSON file.""" + data = { + "input": { + "starting_file": result.starting_file, + "flaw_description": result.flaw_description, + "output_dir": result.output_dir, + "timestamp": datetime.now(UTC).isoformat(), + }, + "flaws": [], + "summary": { + "total_flaws": len(result.flaws), + "deepest_origin_stage": None, + "deepest_origin_depth": 0, + "llm_calls_made": result.llm_calls_made, + }, + } + + max_depth = 0 + deepest_stage = None + + for flaw in result.flaws: + flaw_data = { + "id": flaw.id, + "description": flaw.description, + "severity": flaw.severity, + "starting_evidence": flaw.starting_evidence, + "trace": [ + { + "stage": entry.stage, + "file": entry.file, + "evidence": entry.evidence, + "is_origin": entry.is_origin, + } + for 
entry in flaw.trace + ], + "origin": None, + "depth": flaw.depth, + "trace_complete": flaw.trace_complete, + } + + if flaw.origin: + flaw_data["origin"] = { + "stage": flaw.origin.stage, + "file": flaw.origin.file, + "source_code_files": flaw.origin.source_code_files, + "likely_cause": flaw.origin.likely_cause, + "suggestion": flaw.origin.suggestion, + } + + if flaw.depth > max_depth: + max_depth = flaw.depth + deepest_stage = flaw.origin_stage + + data["flaws"].append(flaw_data) + + data["flaws"].sort(key=lambda f: f["depth"], reverse=True) + data["summary"]["deepest_origin_stage"] = deepest_stage + data["summary"]["deepest_origin_depth"] = max_depth + + output_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") + + +def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: + """Write the flaw trace result as a markdown report.""" + lines: list[str] = [] + lines.append("# Flaw Trace Report") + lines.append("") + lines.append(f"**Input:** {result.starting_file}") + lines.append(f"**Flaws found:** {len(result.flaws)}") + + if result.flaws: + deepest = max(result.flaws, key=lambda f: f.depth) + lines.append(f"**Deepest origin:** {deepest.origin_stage} (depth {deepest.depth})") + lines.append(f"**LLM calls:** {result.llm_calls_made}") + lines.append("") + + for flaw in result.flaws: + lines.append("---") + lines.append("") + lines.append(f"## {flaw.id.replace('_', ' ').title()} ({flaw.severity}): {flaw.description}") + lines.append("") + + # Trace chain summary + stage_names = [entry.stage for entry in flaw.trace] + chain_parts = [] + for name in stage_names: + if name == flaw.origin_stage: + chain_parts.append(f"**{name}** (origin)") + else: + chain_parts.append(name) + lines.append(f"**Trace:** {' -> '.join(chain_parts)}") + lines.append("") + + if not flaw.trace_complete: + lines.append("*Note: trace incomplete — max depth reached.*") + lines.append("") + + # Trace table + lines.append("| Stage | File | Evidence 
|") + lines.append("|-------|------|----------|") + for entry in flaw.trace: + stage_cell = f"**{entry.stage}**" if entry.is_origin else entry.stage + evidence_cell = _escape_table_cell(entry.evidence) + lines.append(f"| {stage_cell} | {entry.file} | {evidence_cell} |") + lines.append("") + + # Origin analysis + if flaw.origin: + lines.append(f"**Root cause:** {flaw.origin.likely_cause}") + lines.append("") + lines.append(f"**Source files:** {', '.join(flaw.origin.source_code_files)}") + lines.append("") + lines.append(f"**Suggestion:** {flaw.origin.suggestion}") + lines.append("") + + output_path.write_text("\n".join(lines), encoding="utf-8") + + +def _escape_table_cell(text: str) -> str: + """Escape pipe characters and collapse newlines for markdown table cells.""" + return text.replace("|", "\\|").replace("\n", " ") diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py new file mode 100644 index 000000000..46ed25e54 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py @@ -0,0 +1,141 @@ +# worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +import json +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory + +from worker_plan_internal.flaw_tracer.tracer import ( + FlawTraceResult, + TracedFlaw, + TraceEntry, + OriginInfo, +) +from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report + + +def _make_sample_result() -> FlawTraceResult: + """Create a sample FlawTraceResult for testing.""" + return FlawTraceResult( + starting_file="025-2-executive_summary.md", + flaw_description="Budget is unvalidated", + output_dir="/tmp/test_output", + flaws=[ + TracedFlaw( + id="flaw_001", + description="Budget of CZK 500,000 is unvalidated", + severity="HIGH", + starting_evidence="CZK 500,000", + trace=[ + TraceEntry(stage="executive_summary", file="025-2-executive_summary.md", 
evidence="CZK 500,000", is_origin=False), + TraceEntry(stage="project_plan", file="005-2-project_plan.md", evidence="Budget: 500k", is_origin=False), + TraceEntry(stage="make_assumptions", file="003-5-make_assumptions.md", evidence="Assume budget of 500k", is_origin=True), + ], + origin_stage="make_assumptions", + origin=OriginInfo( + stage="make_assumptions", + file="003-5-make_assumptions.md", + source_code_files=["make_assumptions.py"], + likely_cause="Prompt generates budget without data", + suggestion="Add validation step", + ), + depth=3, + ), + TracedFlaw( + id="flaw_002", + description="Missing market sizing", + severity="MEDIUM", + starting_evidence="growing Czech market", + trace=[ + TraceEntry(stage="executive_summary", file="025-2-executive_summary.md", evidence="growing Czech market", is_origin=True), + ], + origin_stage="executive_summary", + depth=1, + ), + ], + llm_calls_made=8, + ) + + +class TestWriteJsonReport(unittest.TestCase): + def test_writes_valid_json(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.json" + result = _make_sample_result() + write_json_report(result, output_path) + + self.assertTrue(output_path.exists()) + data = json.loads(output_path.read_text(encoding="utf-8")) + self.assertIn("input", data) + self.assertIn("flaws", data) + self.assertIn("summary", data) + + def test_json_contains_correct_summary(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.json" + result = _make_sample_result() + write_json_report(result, output_path) + + data = json.loads(output_path.read_text(encoding="utf-8")) + summary = data["summary"] + self.assertEqual(summary["total_flaws"], 2) + self.assertEqual(summary["deepest_origin_stage"], "make_assumptions") + self.assertEqual(summary["deepest_origin_depth"], 3) + self.assertEqual(summary["llm_calls_made"], 8) + + def test_json_flaws_sorted_by_depth(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.json" + result = 
_make_sample_result() + write_json_report(result, output_path) + + data = json.loads(output_path.read_text(encoding="utf-8")) + depths = [f["depth"] for f in data["flaws"]] + self.assertEqual(depths, sorted(depths, reverse=True)) + + +class TestWriteMarkdownReport(unittest.TestCase): + def test_writes_markdown_file(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.md" + result = _make_sample_result() + write_markdown_report(result, output_path) + + self.assertTrue(output_path.exists()) + content = output_path.read_text(encoding="utf-8") + self.assertIn("# Flaw Trace Report", content) + + def test_markdown_contains_flaw_details(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.md" + result = _make_sample_result() + write_markdown_report(result, output_path) + + content = output_path.read_text(encoding="utf-8") + self.assertIn("Budget of CZK 500,000 is unvalidated", content) + self.assertIn("make_assumptions", content) + self.assertIn("executive_summary", content) + + def test_markdown_contains_trace_table(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.md" + result = _make_sample_result() + write_markdown_report(result, output_path) + + content = output_path.read_text(encoding="utf-8") + self.assertIn("| Stage |", content) + self.assertIn("| File |", content) + + def test_empty_result_produces_valid_markdown(self): + with TemporaryDirectory() as d: + output_path = Path(d) / "flaw_trace.md" + result = FlawTraceResult( + starting_file="030-report.html", + flaw_description="test", + output_dir="/tmp", + flaws=[], + llm_calls_made=1, + ) + write_markdown_report(result, output_path) + + content = output_path.read_text(encoding="utf-8") + self.assertIn("Flaws found:** 0", content) From bdacb19055f06603bdbbe0cdd3aa3ee4b90e058b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:54:38 +0200 Subject: [PATCH 08/37] fix: sort flaws by depth in markdown report output 
--- worker_plan/worker_plan_internal/flaw_tracer/output.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/output.py b/worker_plan/worker_plan_internal/flaw_tracer/output.py index f497ae4bc..583a0d071 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/output.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/output.py @@ -84,7 +84,8 @@ def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: lines.append(f"**LLM calls:** {result.llm_calls_made}") lines.append("") - for flaw in result.flaws: + sorted_flaws = sorted(result.flaws, key=lambda f: f.depth, reverse=True) + for flaw in sorted_flaws: lines.append("---") lines.append("") lines.append(f"## {flaw.id.replace('_', ' ').title()} ({flaw.severity}): {flaw.description}") From e4792834ea725a7db7c200e1b970cc19eeadb79b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sun, 5 Apr 2026 23:58:11 +0200 Subject: [PATCH 09/37] feat: add flaw_tracer CLI entry point --- .../flaw_tracer/__main__.py | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/__main__.py diff --git a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py b/worker_plan/worker_plan_internal/flaw_tracer/__main__.py new file mode 100644 index 000000000..679f8ed38 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/__main__.py @@ -0,0 +1,107 @@ +# worker_plan/worker_plan_internal/flaw_tracer/__main__.py +"""CLI entry point for the flaw tracer. + +Usage: + python -m worker_plan_internal.flaw_tracer \ + --dir /path/to/output \ + --file 030-report.html \ + --flaw "The budget appears unvalidated..." 
\ + --output-dir /path/to/output \ + --max-depth 15 \ + --verbose +""" +import argparse +import sys +from pathlib import Path + +from worker_plan_internal.flaw_tracer.tracer import FlawTracer +from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report +from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelFromName, RetryConfig +from worker_plan_internal.llm_factory import get_llm_names_by_priority + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Trace flaws in PlanExe reports upstream to their root cause.", + ) + parser.add_argument( + "--dir", required=True, type=Path, + help="Path to the output directory containing intermediary files", + ) + parser.add_argument( + "--file", required=True, + help="Starting file to analyze (relative to --dir)", + ) + parser.add_argument( + "--flaw", required=True, + help="Text description of the observed flaw(s)", + ) + parser.add_argument( + "--output-dir", type=Path, default=None, + help="Where to write flaw_trace.json and flaw_trace.md (defaults to --dir)", + ) + parser.add_argument( + "--max-depth", type=int, default=15, + help="Maximum upstream hops per flaw (default: 15)", + ) + parser.add_argument( + "--verbose", action="store_true", + help="Print each LLM call and result to stderr", + ) + args = parser.parse_args() + + output_dir: Path = args.dir.resolve() + if not output_dir.is_dir(): + print(f"Error: --dir is not a directory: {output_dir}", file=sys.stderr) + sys.exit(1) + + starting_file = args.file + if not (output_dir / starting_file).exists(): + print(f"Error: starting file not found: {output_dir / starting_file}", file=sys.stderr) + sys.exit(1) + + report_dir: Path = (args.output_dir or args.dir).resolve() + report_dir.mkdir(parents=True, exist_ok=True) + + # Set up LLM executor with priority-ordered models from the active profile + llm_names = get_llm_names_by_priority() + if not llm_names: + print("Error: no LLM models configured. 
Check PLANEXE_MODEL_PROFILE.", file=sys.stderr) + sys.exit(1) + + llm_models = LLMModelFromName.from_names(llm_names) + executor = LLMExecutor( + llm_models=llm_models, + retry_config=RetryConfig(max_retries=2), + max_validation_retries=1, + ) + + tracer = FlawTracer( + output_dir=output_dir, + llm_executor=executor, + max_depth=args.max_depth, + verbose=args.verbose, + ) + + print(f"Tracing flaws in {starting_file}...", file=sys.stderr) + result = tracer.trace(starting_file, args.flaw) + + # Write reports + json_path = report_dir / "flaw_trace.json" + md_path = report_dir / "flaw_trace.md" + write_json_report(result, json_path) + write_markdown_report(result, md_path) + + # Print summary + print(f"\nFlaws found: {len(result.flaws)}", file=sys.stderr) + if result.flaws: + deepest = max(result.flaws, key=lambda f: f.depth) + print(f"Deepest origin: {deepest.origin_stage} (depth {deepest.depth})", file=sys.stderr) + print(f"LLM calls made: {result.llm_calls_made}", file=sys.stderr) + print(f"\nReports written:", file=sys.stderr) + print(f" JSON: {json_path}", file=sys.stderr) + print(f" Markdown: {md_path}", file=sys.stderr) + + +if __name__ == "__main__": + main() From 831ea6bb667924e84885fe983604fbf3c858a2c8 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 00:13:18 +0200 Subject: [PATCH 10/37] docs: add flaw tracer design spec and implementation plan Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-04-05-flaw-tracer.md | 2207 +++++++++++++++++ .../specs/2026-04-05-flaw-tracer-design.md | 281 +++ 2 files changed, 2488 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-05-flaw-tracer.md create mode 100644 docs/superpowers/specs/2026-04-05-flaw-tracer-design.md diff --git a/docs/superpowers/plans/2026-04-05-flaw-tracer.md b/docs/superpowers/plans/2026-04-05-flaw-tracer.md new file mode 100644 index 000000000..97b49ddb8 --- /dev/null +++ b/docs/superpowers/plans/2026-04-05-flaw-tracer.md @@ -0,0 +1,2207 @@ +# Flaw 
Tracer Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a CLI tool that traces flaws in PlanExe reports upstream through the pipeline DAG to find root causes. + +**Architecture:** Recursive depth-first search through a static DAG registry. Three LLM prompts (flaw identification, upstream check, source code analysis) use Pydantic structured output via LLMExecutor. Produces JSON + markdown reports. + +**Tech Stack:** Python 3.13, llama-index LLM infrastructure, Pydantic v2, argparse, pytest + +--- + +## File Structure + +``` +worker_plan/worker_plan_internal/flaw_tracer/ + __init__.py — Package init, exports FlawTracer + registry.py — Static DAG mapping: 48 stages with deps, files, source code + prompts.py — 3 Pydantic models + 3 prompt builders + tracer.py — Recursive tracing algorithm (FlawTracer class) + output.py — JSON + markdown report generation + __main__.py — CLI entry point (argparse) + tests/ + __init__.py + test_registry.py — Lookup function tests + test_prompts.py — Prompt construction tests + test_tracer.py — Tracing algorithm tests with mock LLM + test_output.py — Report generation tests +``` + +--- + +### Task 1: Registry — DAG Mapping + +**Files:** +- Create: `worker_plan/worker_plan_internal/flaw_tracer/__init__.py` +- Create: `worker_plan/worker_plan_internal/flaw_tracer/registry.py` +- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py` +- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py` + +- [ ] **Step 1: Write the failing tests** + +```python +# worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory +from worker_plan_internal.flaw_tracer.registry import ( + StageInfo, + STAGES, + 
find_stage_by_filename, + get_upstream_files, + get_source_code_paths, +) + + +class TestStageInfo(unittest.TestCase): + def test_stages_is_nonempty(self): + self.assertGreater(len(STAGES), 40) + + def test_all_stages_have_required_fields(self): + for stage in STAGES: + self.assertIsInstance(stage.name, str, f"{stage.name} name") + self.assertIsInstance(stage.output_files, list, f"{stage.name} output_files") + self.assertTrue(len(stage.output_files) > 0, f"{stage.name} has no output_files") + self.assertIsInstance(stage.upstream_stages, list, f"{stage.name} upstream_stages") + self.assertIsInstance(stage.source_code_files, list, f"{stage.name} source_code_files") + self.assertIsInstance(stage.primary_output, str, f"{stage.name} primary_output") + self.assertIn(stage.primary_output, stage.output_files, f"{stage.name} primary_output not in output_files") + + def test_no_duplicate_stage_names(self): + names = [s.name for s in STAGES] + self.assertEqual(len(names), len(set(names))) + + def test_upstream_references_are_valid(self): + valid_names = {s.name for s in STAGES} + for stage in STAGES: + for upstream in stage.upstream_stages: + self.assertIn(upstream, valid_names, f"{stage.name} references unknown upstream '{upstream}'") + + +class TestFindStageByFilename(unittest.TestCase): + def test_find_report(self): + stage = find_stage_by_filename("030-report.html") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "report") + + def test_find_potential_levers_clean(self): + stage = find_stage_by_filename("002-10-potential_levers.json") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "potential_levers") + + def test_find_potential_levers_raw(self): + stage = find_stage_by_filename("002-9-potential_levers_raw.json") + self.assertIsNotNone(stage) + self.assertEqual(stage.name, "potential_levers") + + def test_find_executive_summary(self): + stage = find_stage_by_filename("025-2-executive_summary.md") + self.assertIsNotNone(stage) + 
self.assertEqual(stage.name, "executive_summary") + + def test_unknown_filename_returns_none(self): + stage = find_stage_by_filename("zzz-unknown.txt") + self.assertIsNone(stage) + + +class TestGetUpstreamFiles(unittest.TestCase): + def test_setup_has_no_upstream(self): + with TemporaryDirectory() as d: + result = get_upstream_files("setup", Path(d)) + self.assertEqual(result, []) + + def test_potential_levers_upstream(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + # Create the expected upstream files on disk + (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") + (output_dir / "002-6-identify_purpose.md").write_text("purpose", encoding="utf-8") + (output_dir / "002-8-plan_type.md").write_text("type", encoding="utf-8") + (output_dir / "002-0-extract_constraints.md").write_text("constraints", encoding="utf-8") + + result = get_upstream_files("potential_levers", output_dir) + stage_names = [name for name, _ in result] + self.assertIn("setup", stage_names) + self.assertIn("identify_purpose", stage_names) + self.assertIn("plan_type", stage_names) + self.assertIn("extract_constraints", stage_names) + + def test_missing_files_are_skipped(self): + with TemporaryDirectory() as d: + output_dir = Path(d) + # Only create one of the upstream files + (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") + + result = get_upstream_files("potential_levers", output_dir) + stage_names = [name for name, _ in result] + self.assertIn("setup", stage_names) + # The others should be skipped because their files don't exist + self.assertNotIn("identify_purpose", stage_names) + + +class TestGetSourceCodePaths(unittest.TestCase): + def test_potential_levers_source(self): + paths = get_source_code_paths("potential_levers") + filenames = [p.name for p in paths] + self.assertIn("potential_levers.py", filenames) + self.assertIn("identify_potential_levers.py", filenames) + + def test_unknown_stage_returns_empty(self): + paths = 
get_source_code_paths("nonexistent_stage") + self.assertEqual(paths, []) +``` + +- [ ] **Step 2: Create package init files** + +```python +# worker_plan/worker_plan_internal/flaw_tracer/__init__.py +"""Flaw Tracer — Root-cause analysis for PlanExe reports.""" +``` + +```python +# worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py +``` + +- [ ] **Step 3: Run tests to verify they fail** + +Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_registry.py -v` +Expected: FAIL with `ModuleNotFoundError` or `ImportError` + +- [ ] **Step 4: Implement registry.py** + +```python +# worker_plan/worker_plan_internal/flaw_tracer/registry.py +"""Static DAG mapping for the PlanExe pipeline. + +Maps every pipeline stage to its output files, upstream dependencies, +and source code files. Derived from the Luigi task classes in +worker_plan_internal/plan/stages/. +""" +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +# Base path for source code, relative to worker_plan/ +_SOURCE_BASE = Path(__file__).resolve().parent.parent.parent # worker_plan/ + + +@dataclass(frozen=True) +class StageInfo: + """One pipeline stage.""" + name: str + output_files: list[str] + primary_output: str # preferred file to read when checking for flaws + upstream_stages: list[str] = field(default_factory=list) + source_code_files: list[str] = field(default_factory=list) + + +# ── Complete pipeline registry ────────────────────────────────────────── + +STAGES: list[StageInfo] = [ + # Phase 1: Initialization + StageInfo( + name="start_time", + output_files=["001-1-start_time.json"], + primary_output="001-1-start_time.json", + upstream_stages=[], + source_code_files=["worker_plan_internal/plan/stages/start_time.py"], + ), + StageInfo( + name="setup", + output_files=["001-2-plan.txt"], + primary_output="001-2-plan.txt", + upstream_stages=[], + source_code_files=["worker_plan_internal/plan/stages/setup.py"], + ), + # Phase 
2: Input Validation & Strategy + StageInfo( + name="screen_planning_prompt", + output_files=["002-0-screen_planning_prompt.json", "002-0-screen_planning_prompt.md"], + primary_output="002-0-screen_planning_prompt.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/screen_planning_prompt.py", + "worker_plan_internal/diagnostics/screen_planning_prompt.py", + ], + ), + StageInfo( + name="extract_constraints", + output_files=["002-0-extract_constraints_raw.json", "002-0-extract_constraints.md"], + primary_output="002-0-extract_constraints.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/extract_constraints.py", + "worker_plan_internal/diagnostics/extract_constraints.py", + ], + ), + StageInfo( + name="redline_gate", + output_files=["002-1-redline_gate.json", "002-2-redline_gate.md"], + primary_output="002-2-redline_gate.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/redline_gate.py", + "worker_plan_internal/diagnostics/redline_gate.py", + ], + ), + StageInfo( + name="premise_attack", + output_files=["002-3-premise_attack.json", "002-4-premise_attack.md"], + primary_output="002-4-premise_attack.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/premise_attack.py", + "worker_plan_internal/diagnostics/premise_attack.py", + ], + ), + StageInfo( + name="identify_purpose", + output_files=["002-5-identify_purpose_raw.json", "002-6-identify_purpose.md"], + primary_output="002-6-identify_purpose.md", + upstream_stages=["setup"], + source_code_files=[ + "worker_plan_internal/plan/stages/identify_purpose.py", + "worker_plan_internal/assume/identify_purpose.py", + ], + ), + StageInfo( + name="plan_type", + output_files=["002-7-plan_type_raw.json", "002-8-plan_type.md"], + primary_output="002-8-plan_type.md", + upstream_stages=["setup", "identify_purpose"], + source_code_files=[ + 
"worker_plan_internal/plan/stages/plan_type.py", + "worker_plan_internal/assume/identify_plan_type.py", + ], + ), + StageInfo( + name="potential_levers", + output_files=["002-9-potential_levers_raw.json", "002-10-potential_levers.json"], + primary_output="002-10-potential_levers.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "extract_constraints"], + source_code_files=[ + "worker_plan_internal/plan/stages/potential_levers.py", + "worker_plan_internal/lever/identify_potential_levers.py", + ], + ), + StageInfo( + name="deduplicate_levers", + output_files=["002-11-deduplicated_levers_raw.json"], + primary_output="002-11-deduplicated_levers_raw.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "potential_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/deduplicate_levers.py", + "worker_plan_internal/lever/deduplicate_levers.py", + ], + ), + StageInfo( + name="enrich_levers", + output_files=["002-12-enriched_levers_raw.json"], + primary_output="002-12-enriched_levers_raw.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "deduplicate_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_levers.py", + "worker_plan_internal/lever/enrich_potential_levers.py", + ], + ), + StageInfo( + name="focus_on_vital_few_levers", + output_files=["002-13-vital_few_levers_raw.json"], + primary_output="002-13-vital_few_levers_raw.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "enrich_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/focus_on_vital_few_levers.py", + "worker_plan_internal/lever/focus_on_vital_few_levers.py", + ], + ), + StageInfo( + name="strategic_decisions_markdown", + output_files=["002-14-strategic_decisions.md"], + primary_output="002-14-strategic_decisions.md", + upstream_stages=["enrich_levers", "focus_on_vital_few_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/strategic_decisions_markdown.py", + 
"worker_plan_internal/lever/strategic_decisions_markdown.py", + ], + ), + StageInfo( + name="candidate_scenarios", + output_files=["002-15-candidate_scenarios_raw.json", "002-16-candidate_scenarios.json"], + primary_output="002-16-candidate_scenarios.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/candidate_scenarios.py", + "worker_plan_internal/lever/candidate_scenarios.py", + ], + ), + StageInfo( + name="select_scenario", + output_files=["002-17-selected_scenario_raw.json", "002-18-selected_scenario.json"], + primary_output="002-18-selected_scenario.json", + upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers", "candidate_scenarios"], + source_code_files=[ + "worker_plan_internal/plan/stages/select_scenario.py", + "worker_plan_internal/lever/select_scenario.py", + ], + ), + StageInfo( + name="scenarios_markdown", + output_files=["002-19-scenarios.md"], + primary_output="002-19-scenarios.md", + upstream_stages=["candidate_scenarios", "select_scenario"], + source_code_files=[ + "worker_plan_internal/plan/stages/scenarios_markdown.py", + "worker_plan_internal/lever/scenarios_markdown.py", + ], + ), + # Constraint checkers + StageInfo( + name="potential_levers_constraint", + output_files=["002-10-potential_levers_constraint.json"], + primary_output="002-10-potential_levers_constraint.json", + upstream_stages=["extract_constraints", "potential_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="deduplicated_levers_constraint", + output_files=["002-11-deduplicated_levers_constraint.json"], + primary_output="002-11-deduplicated_levers_constraint.json", + upstream_stages=["extract_constraints", "deduplicate_levers"], + source_code_files=[ + 
"worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="enriched_levers_constraint", + output_files=["002-12-enriched_levers_constraint.json"], + primary_output="002-12-enriched_levers_constraint.json", + upstream_stages=["extract_constraints", "enrich_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="vital_few_levers_constraint", + output_files=["002-13-vital_few_levers_constraint.json"], + primary_output="002-13-vital_few_levers_constraint.json", + upstream_stages=["extract_constraints", "focus_on_vital_few_levers"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="candidate_scenarios_constraint", + output_files=["002-16-candidate_scenarios_constraint.json"], + primary_output="002-16-candidate_scenarios_constraint.json", + upstream_stages=["extract_constraints", "candidate_scenarios"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + StageInfo( + name="selected_scenario_constraint", + output_files=["002-18-selected_scenario_constraint.json"], + primary_output="002-18-selected_scenario_constraint.json", + upstream_stages=["extract_constraints", "select_scenario"], + source_code_files=[ + "worker_plan_internal/plan/stages/constraint_checker_stages.py", + "worker_plan_internal/diagnostics/constraint_checker.py", + ], + ), + # Phase 3: Context & Assumptions + StageInfo( + name="physical_locations", + output_files=["002-20-physical_locations_raw.json", "002-21-physical_locations.md"], + primary_output="002-21-physical_locations.md", + upstream_stages=["setup", "identify_purpose", "plan_type", 
"strategic_decisions_markdown", "scenarios_markdown"], + source_code_files=[ + "worker_plan_internal/plan/stages/physical_locations.py", + "worker_plan_internal/assume/physical_locations.py", + ], + ), + StageInfo( + name="currency_strategy", + output_files=["002-22-currency_strategy_raw.json", "002-23-currency_strategy.md"], + primary_output="002-23-currency_strategy.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "physical_locations", "strategic_decisions_markdown", "scenarios_markdown"], + source_code_files=[ + "worker_plan_internal/plan/stages/currency_strategy.py", + "worker_plan_internal/assume/currency_strategy.py", + ], + ), + StageInfo( + name="identify_risks", + output_files=["003-1-identify_risks_raw.json", "003-2-identify_risks.md"], + primary_output="003-2-identify_risks.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy"], + source_code_files=[ + "worker_plan_internal/plan/stages/identify_risks.py", + "worker_plan_internal/assume/identify_risks.py", + ], + ), + StageInfo( + name="make_assumptions", + output_files=["003-3-make_assumptions_raw.json", "003-4-make_assumptions.json", "003-5-make_assumptions.md"], + primary_output="003-5-make_assumptions.md", + upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks"], + source_code_files=[ + "worker_plan_internal/plan/stages/make_assumptions.py", + "worker_plan_internal/assume/make_assumptions.py", + ], + ), + StageInfo( + name="distill_assumptions", + output_files=["003-6-distill_assumptions_raw.json", "003-7-distill_assumptions.md"], + primary_output="003-7-distill_assumptions.md", + upstream_stages=["setup", "identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "make_assumptions"], + source_code_files=[ + 
"worker_plan_internal/plan/stages/distill_assumptions.py", + "worker_plan_internal/assume/distill_assumptions.py", + ], + ), + StageInfo( + name="review_assumptions", + output_files=["003-8-review_assumptions_raw.json", "003-9-review_assumptions.md"], + primary_output="003-9-review_assumptions.md", + upstream_stages=["identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions"], + source_code_files=[ + "worker_plan_internal/plan/stages/review_assumptions.py", + "worker_plan_internal/assume/review_assumptions.py", + ], + ), + StageInfo( + name="consolidate_assumptions_markdown", + output_files=["003-10-consolidate_assumptions_full.md", "003-11-consolidate_assumptions_short.md"], + primary_output="003-10-consolidate_assumptions_full.md", + upstream_stages=["identify_purpose", "plan_type", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions", "review_assumptions"], + source_code_files=[ + "worker_plan_internal/plan/stages/consolidate_assumptions_markdown.py", + "worker_plan_internal/assume/shorten_markdown.py", + ], + ), + # Phase 4: Pre-Project Assessment & Project Plan + StageInfo( + name="pre_project_assessment", + output_files=["004-1-pre_project_assessment_raw.json", "004-2-pre_project_assessment.json"], + primary_output="004-2-pre_project_assessment.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown"], + source_code_files=[ + "worker_plan_internal/plan/stages/pre_project_assessment.py", + "worker_plan_internal/expert/pre_project_assessment.py", + ], + ), + StageInfo( + name="project_plan", + output_files=["005-1-project_plan_raw.json", "005-2-project_plan.md"], + primary_output="005-2-project_plan.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", 
"consolidate_assumptions_markdown", "pre_project_assessment"], + source_code_files=[ + "worker_plan_internal/plan/stages/project_plan.py", + "worker_plan_internal/plan/project_plan.py", + ], + ), + # Phase 5: Governance + StageInfo( + name="governance_phase1_audit", + output_files=["006-1-governance_phase1_audit_raw.json", "006-2-governance_phase1_audit.md"], + primary_output="006-2-governance_phase1_audit.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase1_audit.py", + "worker_plan_internal/governance/governance_phase1_audit.py", + ], + ), + StageInfo( + name="governance_phase2_bodies", + output_files=["006-3-governance_phase2_bodies_raw.json", "006-4-governance_phase2_bodies.md"], + primary_output="006-4-governance_phase2_bodies.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase2_bodies.py", + "worker_plan_internal/governance/governance_phase2_bodies.py", + ], + ), + StageInfo( + name="governance_phase3_impl_plan", + output_files=["006-5-governance_phase3_impl_plan_raw.json", "006-6-governance_phase3_impl_plan.md"], + primary_output="006-6-governance_phase3_impl_plan.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase3_impl_plan.py", + "worker_plan_internal/governance/governance_phase3_impl_plan.py", + ], + ), + StageInfo( + name="governance_phase4_decision_escalation_matrix", + output_files=["006-7-governance_phase4_decision_escalation_matrix_raw.json", "006-8-governance_phase4_decision_escalation_matrix.md"], + 
primary_output="006-8-governance_phase4_decision_escalation_matrix.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase4_decision_escalation_matrix.py", + "worker_plan_internal/governance/governance_phase4_decision_escalation_matrix.py", + ], + ), + StageInfo( + name="governance_phase5_monitoring_progress", + output_files=["006-9-governance_phase5_monitoring_progress_raw.json", "006-10-governance_phase5_monitoring_progress.md"], + primary_output="006-10-governance_phase5_monitoring_progress.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase5_monitoring_progress.py", + "worker_plan_internal/governance/governance_phase5_monitoring_progress.py", + ], + ), + StageInfo( + name="governance_phase6_extra", + output_files=["006-11-governance_phase6_extra_raw.json", "006-12-governance_phase6_extra.md"], + primary_output="006-12-governance_phase6_extra.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress"], + source_code_files=[ + "worker_plan_internal/plan/stages/governance_phase6_extra.py", + "worker_plan_internal/governance/governance_phase6_extra.py", + ], + ), + StageInfo( + name="consolidate_governance", + output_files=["006-13-consolidate_governance.md"], + primary_output="006-13-consolidate_governance.md", + 
upstream_stages=["governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress", "governance_phase6_extra"], + source_code_files=["worker_plan_internal/plan/stages/consolidate_governance.py"], + ), + # Phase 6: Resources & Team + StageInfo( + name="related_resources", + output_files=["007-1-related_resources_raw.json", "007-8-related_resources.md"], + primary_output="007-8-related_resources.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/related_resources.py", + "worker_plan_internal/plan/related_resources.py", + ], + ), + StageInfo( + name="find_team_members", + output_files=["008-1-find_team_members_raw.json", "008-2-find_team_members.json"], + primary_output="008-2-find_team_members.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/find_team_members.py", + "worker_plan_internal/team/find_team_members.py", + ], + ), + StageInfo( + name="enrich_team_contract_type", + output_files=["009-1-enrich_team_members_contract_type_raw.json", "009-2-enrich_team_members_contract_type.json"], + primary_output="009-2-enrich_team_members_contract_type.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "find_team_members", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_team_contract_type.py", + "worker_plan_internal/team/enrich_team_members_with_contract_type.py", + ], + ), + StageInfo( + name="enrich_team_background_story", + 
output_files=["010-1-enrich_team_members_background_story_raw.json", "010-2-enrich_team_members_background_story.json"], + primary_output="010-2-enrich_team_members_background_story.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_contract_type", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_team_background_story.py", + "worker_plan_internal/team/enrich_team_members_with_background_story.py", + ], + ), + StageInfo( + name="enrich_team_environment_info", + output_files=["011-1-enrich_team_members_environment_info_raw.json", "011-2-enrich_team_members_environment_info.json"], + primary_output="011-2-enrich_team_members_environment_info.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_background_story", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/enrich_team_environment_info.py", + "worker_plan_internal/team/enrich_team_members_with_environment_info.py", + ], + ), + StageInfo( + name="review_team", + output_files=["012-review_team_raw.json"], + primary_output="012-review_team_raw.json", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_environment_info", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/review_team.py", + "worker_plan_internal/team/review_team.py", + ], + ), + StageInfo( + name="team_markdown", + output_files=["013-team.md"], + primary_output="013-team.md", + upstream_stages=["enrich_team_environment_info", "review_team"], + source_code_files=[ + "worker_plan_internal/plan/stages/team_markdown.py", + "worker_plan_internal/team/team_markdown_document.py", + ], + ), + # 
Phase 7: Analysis & Experts + StageInfo( + name="swot_analysis", + output_files=["014-1-swot_analysis_raw.json", "014-2-swot_analysis.md"], + primary_output="014-2-swot_analysis.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "identify_purpose", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/swot_analysis.py", + "worker_plan_internal/swot/swot_analysis.py", + ], + ), + StageInfo( + name="expert_review", + output_files=["015-1-experts_raw.json", "015-2-experts.json", "016-2-expert_criticism.md"], + primary_output="016-2-expert_criticism.md", + upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "pre_project_assessment", "project_plan", "swot_analysis"], + source_code_files=[ + "worker_plan_internal/plan/stages/expert_review.py", + "worker_plan_internal/expert/expert_finder.py", + "worker_plan_internal/expert/expert_criticism.py", + ], + ), + # Phase 8: Data & Documents + StageInfo( + name="data_collection", + output_files=["017-1-data_collection_raw.json", "017-2-data_collection.md"], + primary_output="017-2-data_collection.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], + source_code_files=[ + "worker_plan_internal/plan/stages/data_collection.py", + "worker_plan_internal/plan/data_collection.py", + ], + ), + StageInfo( + name="identify_documents", + output_files=["017-3-identified_documents_raw.json", "017-4-identified_documents.md", "017-5-identified_documents_to_find.json", "017-6-identified_documents_to_create.json"], + primary_output="017-4-identified_documents.md", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", 
"swot_analysis", "team_markdown", "expert_review"], + source_code_files=[ + "worker_plan_internal/plan/stages/identify_documents.py", + "worker_plan_internal/document/identify_documents.py", + ], + ), + StageInfo( + name="filter_documents_to_find", + output_files=["017-7-filter_documents_to_find_raw.json", "017-8-filter_documents_to_find_clean.json"], + primary_output="017-8-filter_documents_to_find_clean.json", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], + source_code_files=[ + "worker_plan_internal/plan/stages/filter_documents_to_find.py", + "worker_plan_internal/document/filter_documents_to_find.py", + ], + ), + StageInfo( + name="filter_documents_to_create", + output_files=["017-9-filter_documents_to_create_raw.json", "017-10-filter_documents_to_create_clean.json"], + primary_output="017-10-filter_documents_to_create_clean.json", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], + source_code_files=[ + "worker_plan_internal/plan/stages/filter_documents_to_create.py", + "worker_plan_internal/document/filter_documents_to_create.py", + ], + ), + StageInfo( + name="draft_documents_to_find", + output_files=["017-12-draft_documents_to_find.json"], + primary_output="017-12-draft_documents_to_find.json", + upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_find"], + source_code_files=[ + "worker_plan_internal/plan/stages/draft_documents_to_find.py", + "worker_plan_internal/document/draft_document_to_find.py", + ], + ), + StageInfo( + name="draft_documents_to_create", + output_files=["017-14-draft_documents_to_create.json"], + primary_output="017-14-draft_documents_to_create.json", + upstream_stages=["identify_purpose", 
"strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_create"], + source_code_files=[ + "worker_plan_internal/plan/stages/draft_documents_to_create.py", + "worker_plan_internal/document/draft_document_to_create.py", + ], + ), + StageInfo( + name="markdown_documents", + output_files=["017-15-documents_to_create_and_find.md"], + primary_output="017-15-documents_to_create_and_find.md", + upstream_stages=["draft_documents_to_create", "draft_documents_to_find"], + source_code_files=[ + "worker_plan_internal/plan/stages/markdown_documents.py", + "worker_plan_internal/document/markdown_with_document.py", + ], + ), + # Phase 9: WBS + StageInfo( + name="create_wbs_level1", + output_files=["018-1-wbs_level1_raw.json", "018-2-wbs_level1.json", "018-3-wbs_level1_project_title.json"], + primary_output="018-2-wbs_level1.json", + upstream_stages=["project_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_wbs_level1.py", + "worker_plan_internal/plan/create_wbs_level1.py", + ], + ), + StageInfo( + name="create_wbs_level2", + output_files=["018-4-wbs_level2_raw.json", "018-5-wbs_level2.json"], + primary_output="018-5-wbs_level2.json", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level1", "data_collection"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_wbs_level2.py", + "worker_plan_internal/plan/create_wbs_level2.py", + ], + ), + StageInfo( + name="wbs_project_level1_and_level2", + output_files=["019-wbs_project_level1_and_level2.json"], + primary_output="019-wbs_project_level1_and_level2.json", + upstream_stages=["create_wbs_level1", "create_wbs_level2"], + source_code_files=[ + "worker_plan_internal/plan/stages/wbs_project_level1_and_level2.py", + "worker_plan_internal/wbs/wbs_populate.py", + ], + ), + # Phase 10: Pitch & Dependencies + StageInfo( + name="create_pitch", + output_files=["020-1-pitch_raw.json"], + 
primary_output="020-1-pitch_raw.json", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "wbs_project_level1_and_level2", "related_resources"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_pitch.py", + "worker_plan_internal/pitch/create_pitch.py", + ], + ), + StageInfo( + name="convert_pitch_to_markdown", + output_files=["020-2-pitch_to_markdown_raw.json", "020-3-pitch.md"], + primary_output="020-3-pitch.md", + upstream_stages=["create_pitch"], + source_code_files=[ + "worker_plan_internal/plan/stages/convert_pitch_to_markdown.py", + "worker_plan_internal/pitch/convert_pitch_to_markdown.py", + ], + ), + StageInfo( + name="identify_task_dependencies", + output_files=["021-task_dependencies_raw.json"], + primary_output="021-task_dependencies_raw.json", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level2", "data_collection"], + source_code_files=[ + "worker_plan_internal/plan/stages/identify_task_dependencies.py", + "worker_plan_internal/plan/identify_wbs_task_dependencies.py", + ], + ), + StageInfo( + name="estimate_task_durations", + output_files=["022-2-task_durations.json"], + primary_output="022-2-task_durations.json", + upstream_stages=["project_plan", "wbs_project_level1_and_level2"], + source_code_files=[ + "worker_plan_internal/plan/stages/estimate_task_durations.py", + "worker_plan_internal/plan/estimate_wbs_task_durations.py", + ], + ), + # Phase 11: WBS Level 3 + StageInfo( + name="create_wbs_level3", + output_files=["023-2-wbs_level3.json"], + primary_output="023-2-wbs_level3.json", + upstream_stages=["project_plan", "wbs_project_level1_and_level2", "estimate_task_durations", "data_collection"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_wbs_level3.py", + "worker_plan_internal/plan/create_wbs_level3.py", + ], + ), + StageInfo( + name="wbs_project_level1_level2_level3", + 
output_files=["023-3-wbs_project_level1_and_level2_and_level3.json", "023-4-wbs_project_level1_and_level2_and_level3.csv"], + primary_output="023-3-wbs_project_level1_and_level2_and_level3.json", + upstream_stages=["wbs_project_level1_and_level2", "create_wbs_level3"], + source_code_files=[ + "worker_plan_internal/plan/stages/wbs_project_level1_level2_level3.py", + "worker_plan_internal/wbs/wbs_populate.py", + ], + ), + # Phase 12: Schedule & Reviews + StageInfo( + name="create_schedule", + output_files=["026-2-schedule_gantt_dhtmlx.html", "026-3-schedule_gantt_machai.csv"], + primary_output="026-2-schedule_gantt_dhtmlx.html", + upstream_stages=["start_time", "create_wbs_level1", "identify_task_dependencies", "estimate_task_durations", "wbs_project_level1_level2_level3"], + source_code_files=[ + "worker_plan_internal/plan/stages/create_schedule.py", + "worker_plan_internal/schedule/project_schedule_populator.py", + ], + ), + StageInfo( + name="review_plan", + output_files=["024-1-review_plan_raw.json", "024-2-review_plan.md"], + primary_output="024-2-review_plan.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3"], + source_code_files=[ + "worker_plan_internal/plan/stages/review_plan.py", + "worker_plan_internal/plan/review_plan.py", + ], + ), + StageInfo( + name="executive_summary", + output_files=["025-1-executive_summary_raw.json", "025-2-executive_summary.md"], + primary_output="025-2-executive_summary.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3", "review_plan"], + source_code_files=[ + 
"worker_plan_internal/plan/stages/executive_summary.py", + "worker_plan_internal/plan/executive_summary.py", + ], + ), + StageInfo( + name="questions_and_answers", + output_files=["027-1-questions_and_answers_raw.json", "027-2-questions_and_answers.md", "027-3-questions_and_answers.html"], + primary_output="027-2-questions_and_answers.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan"], + source_code_files=[ + "worker_plan_internal/plan/stages/questions_and_answers.py", + "worker_plan_internal/questions_answers/questions_answers.py", + ], + ), + StageInfo( + name="premortem", + output_files=["028-1-premortem_raw.json", "028-2-premortem.md"], + primary_output="028-2-premortem.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers"], + source_code_files=[ + "worker_plan_internal/plan/stages/premortem.py", + "worker_plan_internal/diagnostics/premortem.py", + ], + ), + StageInfo( + name="self_audit", + output_files=["029-1-self_audit_raw.json", "029-2-self_audit.md"], + primary_output="029-2-self_audit.md", + upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", 
"questions_and_answers", "premortem"], + source_code_files=[ + "worker_plan_internal/plan/stages/self_audit.py", + "worker_plan_internal/self_audit/self_audit.py", + ], + ), + # Phase 13: Final Report + StageInfo( + name="report", + output_files=["030-report.html"], + primary_output="030-report.html", + upstream_stages=[ + "setup", "screen_planning_prompt", "redline_gate", "premise_attack", + "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", + "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", + "convert_pitch_to_markdown", "data_collection", "markdown_documents", + "create_wbs_level1", "wbs_project_level1_level2_level3", "expert_review", + "project_plan", "review_plan", "executive_summary", "create_schedule", + "questions_and_answers", "premortem", "self_audit", + ], + source_code_files=[ + "worker_plan_internal/plan/stages/report.py", + "worker_plan_internal/report/report_generator.py", + ], + ), +] + +# ── Lookup indexes (built once at import time) ────────────────────────── + +_STAGE_BY_NAME: dict[str, StageInfo] = {s.name: s for s in STAGES} +_STAGE_BY_FILENAME: dict[str, StageInfo] = {} +for _stage in STAGES: + for _fname in _stage.output_files: + _STAGE_BY_FILENAME[_fname] = _stage + + +def find_stage_by_filename(filename: str) -> Optional[StageInfo]: + """Given an output filename, return the stage that produced it.""" + return _STAGE_BY_FILENAME.get(filename) + + +def get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Path]]: + """Return (stage_name, file_path) pairs for upstream stages whose primary output exists on disk.""" + stage = _STAGE_BY_NAME.get(stage_name) + if stage is None: + return [] + + result = [] + for upstream_name in stage.upstream_stages: + upstream_stage = _STAGE_BY_NAME.get(upstream_name) + if upstream_stage is None: + continue + primary_path = output_dir / upstream_stage.primary_output + if primary_path.exists(): + result.append((upstream_name, 
primary_path)) + return result + + +def get_source_code_paths(stage_name: str) -> list[Path]: + """Return absolute paths to source code files for a stage.""" + stage = _STAGE_BY_NAME.get(stage_name) + if stage is None: + return [] + return [_SOURCE_BASE / f for f in stage.source_code_files] +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_registry.py -v` +Expected: All tests PASS + +- [ ] **Step 6: Commit** + +```bash +git add worker_plan/worker_plan_internal/flaw_tracer/__init__.py worker_plan/worker_plan_internal/flaw_tracer/registry.py worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +git commit -m "feat: add flaw_tracer registry with full pipeline DAG mapping" +``` + +--- + +### Task 2: Prompts — Pydantic Models and Prompt Builders + +**Files:** +- Create: `worker_plan/worker_plan_internal/flaw_tracer/prompts.py` +- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py` + +- [ ] **Step 1: Write the failing tests** + +```python +# worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +import unittest +from llama_index.core.llms import ChatMessage, MessageRole +from worker_plan_internal.flaw_tracer.prompts import ( + IdentifiedFlaw, + FlawIdentificationResult, + UpstreamCheckResult, + SourceCodeAnalysisResult, + build_flaw_identification_messages, + build_upstream_check_messages, + build_source_code_analysis_messages, +) + + +class TestPydanticModels(unittest.TestCase): + def test_identified_flaw_valid(self): + flaw = IdentifiedFlaw( + description="Budget figure is fabricated", + evidence="The budget is CZK 500,000", + severity="HIGH", + ) + self.assertEqual(flaw.severity, "HIGH") + + def test_identified_flaw_rejects_invalid_severity(self): + with self.assertRaises(Exception): + IdentifiedFlaw( + description="test", + evidence="test", + 
class TestBuildFlawIdentificationMessages(unittest.TestCase):
    """Checks on the message list built for flaw identification."""

    def test_returns_chat_messages(self):
        """The builder returns a two-element list: system message, then user message."""
        built = build_flaw_identification_messages(
            filename="030-report.html",
            file_content="report content",
            user_flaw_description="budget is wrong",
        )
        self.assertIsInstance(built, list)
        self.assertEqual(len(built), 2)
        expected_roles = (MessageRole.SYSTEM, MessageRole.USER)
        for message, expected_role in zip(built, expected_roles):
            self.assertEqual(message.role, expected_role)

    def test_user_message_contains_inputs(self):
        """Filename, file content and the user's description all reach the user message."""
        built = build_flaw_identification_messages(
            filename="025-2-executive_summary.md",
            file_content="# Summary\nBudget: 500k",
            user_flaw_description="fabricated budget",
        )
        user_text = built[1].content
        for fragment in ("025-2-executive_summary.md", "# Summary", "fabricated budget"):
            self.assertIn(fragment, user_text)
class TestBuildSourceCodeAnalysisMessages(unittest.TestCase):
    """Checks on the message list built for source code analysis."""

    def test_returns_chat_messages(self):
        """Two source files still produce exactly two chat messages."""
        sources = [
            ("stages/make_assumptions.py", "class MakeAssumptionsTask: ..."),
            ("assume/make_assumptions.py", "def execute(llm, query): ..."),
        ]
        built = build_source_code_analysis_messages(
            flaw_description="Budget fabricated",
            evidence_quote="CZK 500,000",
            source_code_contents=sources,
        )
        self.assertIsInstance(built, list)
        self.assertEqual(len(built), 2)

    def test_user_message_contains_source_code(self):
        """Both the source filename and its content appear in the user message."""
        sources = [("my_stage.py", "SYSTEM_PROMPT = 'Generate assumptions'")]
        built = build_source_code_analysis_messages(
            flaw_description="Missing analysis",
            evidence_quote="no data",
            source_code_contents=sources,
        )
        user_text = built[1].content
        for fragment in ("my_stage.py", "SYSTEM_PROMPT"):
            self.assertIn(fragment, user_text)
class UpstreamCheckResult(BaseModel):
    """Structured LLM verdict on whether an upstream file contains a flaw or a precursor to it."""
    found: bool = Field(description="True if this file contains the flaw or a precursor to it")
    # A nullable field without a default is still a *required* key. Defaulting
    # to None lets a "not found" reply that omits the key validate, instead of
    # failing validation and aborting the whole trace.
    evidence: str | None = Field(
        default=None,
        description="Direct quote from the file if found, null otherwise",
    )
    explanation: str = Field(description="How this connects to the downstream flaw, or why this file is clean")
def build_upstream_check_messages(
    flaw_description: str,
    evidence_quote: str,
    upstream_filename: str,
    upstream_file_content: str,
) -> list[ChatMessage]:
    """Build the Phase 2 prompt: does an upstream file already contain the flaw?

    Returns a two-element list: a system message with the tracing
    instructions, followed by a user message carrying the flaw, the
    downstream evidence, and the upstream file's name and content.
    """
    instruction_lines = [
        "You are tracing a flaw through a project planning pipeline to find where it originated.",
        "A downstream file contains a flaw. You are examining an upstream file that was an input "
        "to the stage that produced the flawed output.",
        "Determine if this upstream file contains the same problem or a precursor to it.",
        "If YES: quote the relevant passage and explain how it connects to the downstream flaw.",
        "If NO: explain why this file is clean regarding this specific flaw.",
    ]
    system_text = "\n".join(instruction_lines)

    # The empty element produces the blank line between the flaw summary and
    # the upstream file section.
    user_text = "\n".join([
        f"Flaw: {flaw_description}",
        f"Evidence from downstream: {evidence_quote}",
        "",
        f"Upstream filename: {upstream_filename}",
        f"Upstream file content:\n{upstream_file_content}",
    ])

    system_message = ChatMessage(role=MessageRole.SYSTEM, content=system_text)
    user_message = ChatMessage(role=MessageRole.USER, content=user_text)
    return [system_message, user_message]
def _make_executor(responses: list[str]) -> LLMExecutor:
    """Wrap a ``ResponseMockLLM`` holding ``responses`` in an ``LLMExecutor``."""
    mock_llm = ResponseMockLLM(responses=responses)
    return LLMExecutor(llm_models=LLMModelWithInstance.from_instances([mock_llm]))
class TestFlawTracerPhase1(unittest.TestCase):
    """Flaw identification (Phase 1) driven by a mock LLM."""

    def test_identify_flaws_returns_flaws(self):
        """A flaw reported by the mock LLM shows up on the trace result."""
        start_name = "025-2-executive_summary.md"

        # Canned replies, queued in the order the test expects them to be used.
        payloads = [
            # Phase 1: flaw identification
            {
                "flaws": [
                    {
                        "description": "Budget is unvalidated",
                        "evidence": "CZK 500,000",
                        "severity": "HIGH",
                    }
                ]
            },
            # Phase 2: upstream check — not found, so the origin is the starting file
            {
                "found": False,
                "evidence": None,
                "explanation": "No budget mentioned upstream",
            },
            # Phase 3: source code analysis
            {
                "likely_cause": "Prompt asks for budget without data",
                "relevant_code_section": "system_prompt = ...",
                "suggestion": "Add validation step",
            },
        ]

        with TemporaryDirectory() as tmp:
            work_dir = Path(tmp)
            # Minimal starting file plus one upstream file so tracing can proceed.
            (work_dir / start_name).write_text("# Summary\nBudget: CZK 500,000", encoding="utf-8")
            (work_dir / "005-2-project_plan.md").write_text("# Plan", encoding="utf-8")

            tracer = FlawTracer(
                output_dir=work_dir,
                llm_executor=_make_executor([json.dumps(payload) for payload in payloads]),
                source_code_base=Path(__file__).resolve().parents[3],  # worker_plan/
                max_depth=15,
                verbose=False,
            )
            outcome = tracer.trace(start_name, "budget is unvalidated")

            self.assertIsInstance(outcome, FlawTraceResult)
            self.assertGreaterEqual(len(outcome.flaws), 1)
            first = outcome.flaws[0]
            self.assertEqual(first.description, "Budget is unvalidated")
            self.assertEqual(first.severity, "HIGH")
class TestFlawTracerMaxDepth(unittest.TestCase):
    """Behaviour of the tracer when upstream tracing is disabled."""

    def test_respects_max_depth(self):
        """With ``max_depth=0`` the trace holds only the starting file's entry."""
        start_name = "025-2-executive_summary.md"
        # Only the Phase 1 reply is queued.
        phase1_reply = json.dumps(
            {"flaws": [{"description": "test flaw", "evidence": "500k", "severity": "LOW"}]}
        )

        with TemporaryDirectory() as tmp:
            work_dir = Path(tmp)
            (work_dir / start_name).write_text("Budget: 500k", encoding="utf-8")

            tracer = FlawTracer(
                output_dir=work_dir,
                llm_executor=_make_executor([phase1_reply]),
                source_code_base=Path(__file__).resolve().parents[3],
                max_depth=0,  # zero depth = no upstream tracing
                verbose=False,
            )
            outcome = tracer.trace(start_name, "test")

            self.assertEqual(len(outcome.flaws), 1)
            # No upstream hop was recorded: only the starting file is in the trace.
            self.assertEqual(len(outcome.flaws[0].trace), 1)
class FlawTracer:
    """Traces flaws upstream through the PlanExe pipeline DAG.

    A run (see ``trace``) has three phases:

    1. Ask the LLM to identify discrete flaws in the starting file.
    2. For each flaw, walk depth-first through the upstream stage outputs,
       asking the LLM whether each one already contains the flaw.
    3. Ask the LLM to inspect the source code of the stage where the flaw
       originated.
    """

    def __init__(
        self,
        output_dir: Path,
        llm_executor: LLMExecutor,
        source_code_base: Path,
        max_depth: int = 15,
        verbose: bool = False,
    ):
        """Store the configuration and reset the per-run counters.

        Args:
            output_dir: Directory holding the pipeline's intermediary files.
            llm_executor: Executor through which every LLM call is made.
            source_code_base: Stored on the instance.
                NOTE(review): nothing in this class reads it; source paths
                come from ``get_source_code_paths`` — confirm whether it
                should be wired in.
            max_depth: Maximum recursion depth for upstream tracing; ``0``
                disables upstream tracing entirely.
            verbose: When true, progress messages are printed to stderr.
        """
        self.output_dir = output_dir
        self.llm_executor = llm_executor
        self.source_code_base = source_code_base
        self.max_depth = max_depth
        self.verbose = verbose
        self._llm_calls = 0
        self._checked: set[tuple[str, str]] = set()  # (stage_name, flaw_description) dedup

    def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult:
        """Main entry point. Identify flaws and trace each one upstream.

        Args:
            starting_file: Name of the file to analyze, relative to ``output_dir``.
            flaw_description: The user's description of the observed problem.

        Returns:
            A ``FlawTraceResult`` whose flaws are sorted deepest origin first.

        Raises:
            FileNotFoundError: If ``starting_file`` does not exist in ``output_dir``.
        """
        self._llm_calls = 0
        self._checked.clear()

        file_path = self.output_dir / starting_file
        if not file_path.exists():
            raise FileNotFoundError(f"Starting file not found: {file_path}")

        file_content = file_path.read_text(encoding="utf-8")
        stage = find_stage_by_filename(starting_file)
        stage_name = stage.name if stage else "unknown"

        # Phase 1: Identify flaws
        self._log(f"Phase 1: Identifying flaws in {starting_file}")
        identified = self._identify_flaws(starting_file, file_content, flaw_description)
        self._log(f" Found {len(identified.flaws)} flaw(s)")

        traced_flaws: list[TracedFlaw] = []
        for index, flaw in enumerate(identified.flaws, start=1):
            flaw_id = f"flaw_{index:03d}"
            self._log(f"\nTracing {flaw_id}: {flaw.description}")

            traced = TracedFlaw(
                id=flaw_id,
                description=flaw.description,
                severity=flaw.severity,
                starting_evidence=flaw.evidence,
                trace=[
                    TraceEntry(
                        stage=stage_name,
                        file=starting_file,
                        evidence=flaw.evidence,
                        is_origin=False,
                    )
                ],
            )

            # Phase 2: only possible when the starting file maps to a known stage.
            if stage and self.max_depth > 0:
                self._trace_upstream(traced, stage_name, flaw.description, flaw.evidence, depth=0)

            # Phase 3 runs for every flaw, whether the origin was pinned down
            # by the upstream walk or had to be defaulted to the last hop.
            origin_entry = self._resolve_origin_entry(traced)
            self._analyze_source_code(traced, origin_entry.stage, flaw.description, origin_entry.evidence)

            traced_flaws.append(traced)

        # Sort by depth (deepest origin first)
        traced_flaws.sort(key=lambda f: f.depth, reverse=True)

        return FlawTraceResult(
            starting_file=starting_file,
            flaw_description=flaw_description,
            output_dir=str(self.output_dir),
            flaws=traced_flaws,
            llm_calls_made=self._llm_calls,
        )

    def _resolve_origin_entry(self, traced: TracedFlaw) -> TraceEntry:
        """Return the trace entry of the flaw's origin, choosing one if tracing set none.

        When ``_trace_upstream`` did not settle on an origin (tracing was
        skipped, hit the depth limit, or ran out of upstream files), the last
        recorded hop is marked as the origin. ``depth`` is then
        ``len(traced.trace)``, the same scale ``_trace_upstream`` uses, so
        the depth sort in ``trace`` compares like with like.
        """
        if traced.origin_stage is not None:
            for entry in reversed(traced.trace):
                if entry.stage == traced.origin_stage:
                    return entry

        last = traced.trace[-1]
        last.is_origin = True
        traced.origin_stage = last.stage
        traced.depth = len(traced.trace)
        return last

    def _run_structured(self, messages, output_cls):
        """Run one structured-output chat through the executor and count the call.

        Args:
            messages: Chat messages to send.
            output_cls: Model class the LLM response is parsed into.

        Returns:
            The ``raw`` attribute of the structured chat response.
        """
        def execute(llm: LLM):
            sllm = llm.as_structured_llm(output_cls)
            response = sllm.chat(messages)
            return response.raw

        self._llm_calls += 1
        return self.llm_executor.run(execute)

    def _identify_flaws(self, filename: str, file_content: str, user_description: str) -> FlawIdentificationResult:
        """Phase 1: Ask LLM to identify discrete flaws in the starting file."""
        messages = build_flaw_identification_messages(filename, file_content, user_description)
        return self._run_structured(messages, FlawIdentificationResult)

    def _check_upstream(self, flaw_description: str, evidence: str, upstream_filename: str, upstream_content: str) -> UpstreamCheckResult:
        """Phase 2: Ask LLM if a flaw exists in an upstream file."""
        messages = build_upstream_check_messages(flaw_description, evidence, upstream_filename, upstream_content)
        return self._run_structured(messages, UpstreamCheckResult)

    def _trace_upstream(
        self,
        traced: TracedFlaw,
        current_stage: str,
        flaw_description: str,
        evidence: str,
        depth: int,
    ) -> None:
        """Recursively trace a flaw through upstream stages (depth-first).

        Appends a ``TraceEntry`` to ``traced.trace`` for every upstream file
        in which the LLM finds the flaw. Sets ``traced.origin_stage`` when a
        stage is reached whose checked upstream files are all clean.

        Args:
            traced: The flaw being traced; mutated in place.
            current_stage: Stage whose upstream files are examined.
            flaw_description: Description passed to the LLM; also part of the dedup key.
            evidence: Quote from the current stage's file shown to the LLM.
            depth: Current recursion depth, compared against ``max_depth``.
        """
        if depth >= self.max_depth:
            traced.trace_complete = False
            self._log(f" Max depth {self.max_depth} reached at {current_stage}")
            return

        upstream_files = get_upstream_files(current_stage, self.output_dir)
        if not upstream_files:
            # Nothing upstream on disk; the caller defaults the origin to the
            # last recorded hop.
            return

        found_upstream = False
        for upstream_name, upstream_path in upstream_files:
            dedup_key = (upstream_name, flaw_description)
            if dedup_key in self._checked:
                self._log(f" Skipping {upstream_name} (already checked for this flaw)")
                continue
            self._checked.add(dedup_key)

            upstream_content = upstream_path.read_text(encoding="utf-8")
            self._log(f" Checking upstream: {upstream_name} ({upstream_path.name})")

            result = self._check_upstream(flaw_description, evidence, upstream_path.name, upstream_content)
            if not result.found:
                continue

            self._log(f" -> FOUND in {upstream_name}")
            found_upstream = True
            traced.trace.append(
                TraceEntry(
                    stage=upstream_name,
                    file=upstream_path.name,
                    evidence=result.evidence or "",
                    is_origin=False,
                )
            )

            # Recurse deeper, preferring the upstream quote as the new evidence.
            self._trace_upstream(
                traced, upstream_name, flaw_description,
                result.evidence or evidence, depth + 1,
            )
            # After recursion, if origin was found deeper, stop tracing other branches
            if traced.origin_stage is not None:
                return

        if not found_upstream:
            # Current stage is the origin — flaw exists here but not in any upstream
            traced.origin_stage = current_stage
            traced.depth = len(traced.trace)
            # Mark the current stage entry as origin
            for entry in traced.trace:
                if entry.stage == current_stage:
                    entry.is_origin = True

    def _analyze_source_code(self, traced: TracedFlaw, stage_name: str, flaw_description: str, evidence: str) -> None:
        """Phase 3: Analyze source code at the origin stage.

        Best effort: returns silently when the stage has no registered source
        files or none of them exist on disk. An LLM failure is logged as a
        warning and leaves ``traced.origin`` unset.
        """
        source_paths = get_source_code_paths(stage_name)
        if not source_paths:
            return

        source_contents: list[tuple[str, str]] = [
            (path.name, path.read_text(encoding="utf-8"))
            for path in source_paths
            if path.exists()
        ]
        if not source_contents:
            return

        self._log(f" Phase 3: Analyzing source code for {stage_name}")
        messages = build_source_code_analysis_messages(flaw_description, evidence, source_contents)

        # Report the file of the origin stage's own trace entry; fall back to
        # an empty string if the stage has no entry.
        origin_file = next(
            (entry.file for entry in reversed(traced.trace) if entry.stage == stage_name),
            "",
        )

        try:
            analysis = self._run_structured(messages, SourceCodeAnalysisResult)
        except Exception as e:
            # Deliberately broad: a failed analysis must not discard the trace.
            logger.warning("Source code analysis failed for %s: %s", stage_name, e)
            return

        traced.origin = OriginInfo(
            stage=stage_name,
            file=origin_file,
            source_code_files=[name for name, _ in source_contents],
            likely_cause=analysis.likely_cause,
            suggestion=analysis.suggestion,
        )

    def _log(self, message: str) -> None:
        """Print to stderr if verbose mode is enabled."""
        if self.verbose:
            print(message, file=sys.stderr)
class TestWriteJsonReport(unittest.TestCase):
    """Checks on the JSON document produced by ``write_json_report``."""

    def _write_sample(self, directory: str) -> Path:
        """Write the sample result into ``directory`` and return the report path."""
        target = Path(directory) / "flaw_trace.json"
        write_json_report(_make_sample_result(), target)
        return target

    def test_writes_valid_json(self):
        """The report file exists, parses as JSON and has the three top-level keys."""
        with TemporaryDirectory() as tmp:
            target = self._write_sample(tmp)

            self.assertTrue(target.exists())
            document = json.loads(target.read_text(encoding="utf-8"))
            for key in ("input", "flaws", "summary"):
                self.assertIn(key, document)

    def test_json_contains_correct_summary(self):
        """The summary reflects the sample: two flaws, deepest origin at depth 3."""
        with TemporaryDirectory() as tmp:
            target = self._write_sample(tmp)

            summary = json.loads(target.read_text(encoding="utf-8"))["summary"]
            self.assertEqual(summary["total_flaws"], 2)
            self.assertEqual(summary["deepest_origin_stage"], "make_assumptions")
            self.assertEqual(summary["deepest_origin_depth"], 3)
            self.assertEqual(summary["llm_calls_made"], 8)

    def test_json_flaws_sorted_by_depth(self):
        """Flaw depths in the report are in descending order."""
        with TemporaryDirectory() as tmp:
            target = self._write_sample(tmp)

            document = json.loads(target.read_text(encoding="utf-8"))
            depths = [item["depth"] for item in document["flaws"]]
            self.assertEqual(depths, sorted(depths, reverse=True))
Run tests to verify they fail** + +Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_output.py -v` +Expected: FAIL with `ImportError` + +- [ ] **Step 3: Implement output.py** + +```python +# worker_plan/worker_plan_internal/flaw_tracer/output.py +"""JSON and markdown report generation for flaw trace results.""" +import json +from datetime import datetime, UTC +from pathlib import Path + +from worker_plan_internal.flaw_tracer.tracer import FlawTraceResult + + +def write_json_report(result: FlawTraceResult, output_path: Path) -> None: + """Write the flaw trace result as a JSON file.""" + data = { + "input": { + "starting_file": result.starting_file, + "flaw_description": result.flaw_description, + "output_dir": result.output_dir, + "timestamp": datetime.now(UTC).isoformat(), + }, + "flaws": [], + "summary": { + "total_flaws": len(result.flaws), + "deepest_origin_stage": None, + "deepest_origin_depth": 0, + "llm_calls_made": result.llm_calls_made, + }, + } + + max_depth = 0 + deepest_stage = None + + for flaw in result.flaws: + flaw_data = { + "id": flaw.id, + "description": flaw.description, + "severity": flaw.severity, + "starting_evidence": flaw.starting_evidence, + "trace": [ + { + "stage": entry.stage, + "file": entry.file, + "evidence": entry.evidence, + "is_origin": entry.is_origin, + } + for entry in flaw.trace + ], + "origin": None, + "depth": flaw.depth, + "trace_complete": flaw.trace_complete, + } + + if flaw.origin: + flaw_data["origin"] = { + "stage": flaw.origin.stage, + "file": flaw.origin.file, + "source_code_files": flaw.origin.source_code_files, + "likely_cause": flaw.origin.likely_cause, + "suggestion": flaw.origin.suggestion, + } + + if flaw.depth > max_depth: + max_depth = flaw.depth + deepest_stage = flaw.origin_stage + + data["flaws"].append(flaw_data) + + data["summary"]["deepest_origin_stage"] = deepest_stage + data["summary"]["deepest_origin_depth"] = max_depth + + output_path.write_text(json.dumps(data, 
indent=2, ensure_ascii=False), encoding="utf-8") + + +def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: + """Write the flaw trace result as a markdown report.""" + lines: list[str] = [] + lines.append("# Flaw Trace Report") + lines.append("") + lines.append(f"**Input:** {result.starting_file}") + lines.append(f"**Flaws found:** {len(result.flaws)}") + + if result.flaws: + deepest = max(result.flaws, key=lambda f: f.depth) + lines.append(f"**Deepest origin:** {deepest.origin_stage} (depth {deepest.depth})") + lines.append(f"**LLM calls:** {result.llm_calls_made}") + lines.append("") + + for flaw in result.flaws: + lines.append("---") + lines.append("") + lines.append(f"## {flaw.id.replace('_', ' ').title()} ({flaw.severity}): {flaw.description}") + lines.append("") + + # Trace chain summary + stage_names = [entry.stage for entry in flaw.trace] + chain_parts = [] + for name in stage_names: + if name == flaw.origin_stage: + chain_parts.append(f"**{name}** (origin)") + else: + chain_parts.append(name) + lines.append(f"**Trace:** {' -> '.join(chain_parts)}") + lines.append("") + + if not flaw.trace_complete: + lines.append("*Note: trace incomplete — max depth reached.*") + lines.append("") + + # Trace table + lines.append("| Stage | File | Evidence |") + lines.append("|-------|------|----------|") + for entry in flaw.trace: + stage_cell = f"**{entry.stage}**" if entry.is_origin else entry.stage + evidence_cell = _escape_table_cell(entry.evidence) + lines.append(f"| {stage_cell} | {entry.file} | {evidence_cell} |") + lines.append("") + + # Origin analysis + if flaw.origin: + lines.append(f"**Root cause:** {flaw.origin.likely_cause}") + lines.append("") + lines.append(f"**Source files:** {', '.join(flaw.origin.source_code_files)}") + lines.append("") + lines.append(f"**Suggestion:** {flaw.origin.suggestion}") + lines.append("") + + output_path.write_text("\n".join(lines), encoding="utf-8") + + +def _escape_table_cell(text: str) -> str: + 
"""Escape pipe characters and collapse newlines for markdown table cells.""" + return text.replace("|", "\\|").replace("\n", " ") +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_output.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add worker_plan/worker_plan_internal/flaw_tracer/output.py worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +git commit -m "feat: add flaw_tracer JSON and markdown report generation" +``` + +--- + +### Task 5: CLI Entry Point + +**Files:** +- Create: `worker_plan/worker_plan_internal/flaw_tracer/__main__.py` + +- [ ] **Step 1: Implement __main__.py** + +```python +# worker_plan/worker_plan_internal/flaw_tracer/__main__.py +"""CLI entry point for the flaw tracer. + +Usage: + python -m worker_plan_internal.flaw_tracer \ + --dir /path/to/output \ + --file 030-report.html \ + --flaw "The budget appears unvalidated..." \ + --output-dir /path/to/output \ + --max-depth 15 \ + --verbose +""" +import argparse +import sys +from pathlib import Path + +from worker_plan_internal.flaw_tracer.tracer import FlawTracer +from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report +from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelFromName, RetryConfig +from worker_plan_internal.llm_factory import get_llm_names_by_priority + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Trace flaws in PlanExe reports upstream to their root cause.", + ) + parser.add_argument( + "--dir", required=True, type=Path, + help="Path to the output directory containing intermediary files", + ) + parser.add_argument( + "--file", required=True, + help="Starting file to analyze (relative to --dir)", + ) + parser.add_argument( + "--flaw", required=True, + help="Text description of the observed flaw(s)", + ) + parser.add_argument( + "--output-dir", type=Path, 
default=None, + help="Where to write flaw_trace.json and flaw_trace.md (defaults to --dir)", + ) + parser.add_argument( + "--max-depth", type=int, default=15, + help="Maximum upstream hops per flaw (default: 15)", + ) + parser.add_argument( + "--verbose", action="store_true", + help="Print each LLM call and result to stderr", + ) + args = parser.parse_args() + + output_dir: Path = args.dir.resolve() + if not output_dir.is_dir(): + print(f"Error: --dir is not a directory: {output_dir}", file=sys.stderr) + sys.exit(1) + + starting_file = args.file + if not (output_dir / starting_file).exists(): + print(f"Error: starting file not found: {output_dir / starting_file}", file=sys.stderr) + sys.exit(1) + + report_dir: Path = (args.output_dir or args.dir).resolve() + report_dir.mkdir(parents=True, exist_ok=True) + + # Set up LLM executor with priority-ordered models from the active profile + llm_names = get_llm_names_by_priority() + if not llm_names: + print("Error: no LLM models configured. Check PLANEXE_MODEL_PROFILE.", file=sys.stderr) + sys.exit(1) + + llm_models = LLMModelFromName.from_names(llm_names) + executor = LLMExecutor( + llm_models=llm_models, + retry_config=RetryConfig(max_retries=2), + max_validation_retries=1, + ) + + # Source code base is the worker_plan/ directory + source_code_base = Path(__file__).resolve().parent.parent.parent + + tracer = FlawTracer( + output_dir=output_dir, + llm_executor=executor, + source_code_base=source_code_base, + max_depth=args.max_depth, + verbose=args.verbose, + ) + + print(f"Tracing flaws in {starting_file}...", file=sys.stderr) + result = tracer.trace(starting_file, args.flaw) + + # Write reports + json_path = report_dir / "flaw_trace.json" + md_path = report_dir / "flaw_trace.md" + write_json_report(result, json_path) + write_markdown_report(result, md_path) + + # Print summary + print(f"\nFlaws found: {len(result.flaws)}", file=sys.stderr) + if result.flaws: + deepest = max(result.flaws, key=lambda f: f.depth) + 
print(f"Deepest origin: {deepest.origin_stage} (depth {deepest.depth})", file=sys.stderr) + print(f"LLM calls made: {result.llm_calls_made}", file=sys.stderr) + print(f"\nReports written:", file=sys.stderr) + print(f" JSON: {json_path}", file=sys.stderr) + print(f" Markdown: {md_path}", file=sys.stderr) + + +if __name__ == "__main__": + main() +``` + +- [ ] **Step 2: Verify the module is importable** + +Run: `cd worker_plan && python -c "from worker_plan_internal.flaw_tracer.__main__ import main; print('OK')"` +Expected: `OK` + +- [ ] **Step 3: Verify --help works** + +Run: `cd worker_plan && python -m worker_plan_internal.flaw_tracer --help` +Expected: Help text showing `--dir`, `--file`, `--flaw`, `--output-dir`, `--max-depth`, `--verbose` + +- [ ] **Step 4: Commit** + +```bash +git add worker_plan/worker_plan_internal/flaw_tracer/__main__.py +git commit -m "feat: add flaw_tracer CLI entry point" +``` + +--- + +### Task 6: Run All Tests and Final Verification + +**Files:** +- No new files + +- [ ] **Step 1: Run the full test suite for the flaw_tracer package** + +Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/ -v` +Expected: All tests PASS + +- [ ] **Step 2: Run the broader worker_plan test suite to check for regressions** + +Run: `cd worker_plan && python -m pytest -v --timeout=30` +Expected: No new failures + +- [ ] **Step 3: Commit any fixes if needed** + +If tests required fixes, commit them: +```bash +git add -u +git commit -m "fix: address test issues in flaw_tracer" +``` diff --git a/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md b/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md new file mode 100644 index 000000000..cd77ac537 --- /dev/null +++ b/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md @@ -0,0 +1,281 @@ +# Flaw Tracer — Root-Cause Analysis for PlanExe Reports + +## Goal + +A CLI tool that takes a PlanExe output directory, a starting file, and a flaw description, then recursively traces the 
flaw upstream through the DAG of intermediary files to find where it originated. Produces both JSON and markdown output. Built on PlanExe's existing LLM infrastructure so it can eventually become a pipeline stage. + +## Architecture + +The tool performs a recursive depth-first search through the pipeline DAG. Starting from a downstream file where a flaw is observed, it walks upstream one hop at a time — reading input files, asking an LLM whether the flaw or a precursor exists there, and continuing until it reaches a stage where the flaw exists in the output but not in any inputs. At that origin point, it reads the stage's source code to identify the likely cause. + +Three LLM prompts drive the analysis: flaw identification (once at the start), upstream checking (at each hop), and source code analysis (at each origin). All use Pydantic models for structured output and LLMExecutor for fallback resilience. + +## Components + +``` +worker_plan/worker_plan_internal/flaw_tracer/ + __init__.py + __main__.py — CLI entry point (argparse, LLM setup, orchestration) + registry.py — Static DAG mapping: stages, output files, dependencies, source code paths + tracer.py — Recursive tracing algorithm + prompts.py — Pydantic models and LLM prompt templates + output.py — JSON + markdown report generation +``` + +### `registry.py` — DAG Mapping + +A static Python data structure mapping the full pipeline topology. Each entry describes one pipeline stage: + +```python +@dataclass +class StageInfo: + name: str # e.g., "potential_levers" + output_files: list[str] # e.g., ["002-9-potential_levers_raw.json", "002-10-potential_levers.json"] + upstream_stages: list[str] # e.g., ["setup", "identify_purpose", "plan_type", "extract_constraints"] + source_code_files: list[str] # Relative to worker_plan/, e.g., ["worker_plan_internal/plan/stages/potential_levers.py", "worker_plan_internal/lever/identify_potential_levers.py"] +``` + +The registry covers all ~48 pipeline stages. 
Key functions: + +- `find_stage_by_filename(filename: str) -> StageInfo | None` — Given an output filename, return the stage that produced it. +- `get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Path]]` — Return `(stage_name, file_path)` pairs for all upstream stages, resolved against the output directory. Skip files that don't exist on disk. When a stage has multiple output files (e.g., both `_raw.json` and `.json`), prefer the clean/processed file since that's what downstream stages consume. If only the raw file exists, use that. +- `get_source_code_paths(stage_name: str) -> list[Path]` — Return absolute paths to source code files for a stage. + +The mapping is derived from the Luigi task classes (`requires()` and `output()` methods) but hard-coded for reliability. When the pipeline changes, this file needs updating. + +### `prompts.py` — Pydantic Models and Prompt Templates + +Three Pydantic models for structured LLM output: + +```python +class IdentifiedFlaw(BaseModel): + description: str = Field(description="One-sentence description of the flaw") + evidence: str = Field(description="Direct quote from the file demonstrating the flaw") + severity: Literal["HIGH", "MEDIUM", "LOW"] = Field( + description="HIGH: fabricated data or missing critical analysis. MEDIUM: weak reasoning or vague claims. LOW: minor gaps." 
+ ) + +class FlawIdentificationResult(BaseModel): + flaws: list[IdentifiedFlaw] = Field(description="List of discrete flaws found in the file") + +class UpstreamCheckResult(BaseModel): + found: bool = Field(description="True if this file contains the flaw or a precursor to it") + evidence: str | None = Field(description="Direct quote from the file if found, null otherwise") + explanation: str = Field(description="How this connects to the downstream flaw, or why this file is clean") + +class SourceCodeAnalysisResult(BaseModel): + likely_cause: str = Field(description="What in the prompt or logic likely caused the flaw") + relevant_code_section: str = Field(description="The specific code or prompt text responsible") + suggestion: str = Field(description="How to fix or prevent this flaw") +``` + +Three prompt-building functions, each returning a `list[ChatMessage]`: + +**`build_flaw_identification_prompt(filename, file_content, user_flaw_description)`** + +System message: +``` +You are analyzing an intermediary file from a project planning pipeline. +The user has identified problems in this output. Identify each discrete flaw. +For each flaw, provide a short description, a direct quote as evidence, and a severity level. +Only identify real flaws — do not flag stylistic preferences or minor formatting issues. +``` + +User message contains the filename, file content, and the user's flaw description. + +**`build_upstream_check_prompt(flaw_description, evidence_quote, upstream_filename, upstream_file_content)`** + +System message: +``` +You are tracing a flaw through a project planning pipeline to find where it originated. +A downstream file contains a flaw. You are examining an upstream file that was an input +to the stage that produced the flawed output. Determine if this upstream file contains +the same problem or a precursor to it. +``` + +User message contains the flaw details and the upstream file content. 
+ +**`build_source_code_analysis_prompt(flaw_description, evidence_quote, source_code_contents)`** + +System message: +``` +A flaw was introduced at this pipeline stage. The flaw exists in its output but NOT +in any of its inputs. Examine the source code to identify what in the prompt text, +logic, or processing likely caused this flaw. Be specific — point to lines or prompt phrases. +``` + +User message contains the flaw details and the concatenated source code. + +### `tracer.py` — Recursive Tracing Algorithm + +```python +class FlawTracer: + def __init__(self, output_dir: Path, llm_executor: LLMExecutor, source_code_base: Path, max_depth: int = 15, verbose: bool = False): + ... + + def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: + """Main entry point. Returns the complete trace result.""" + ... +``` + +The `trace` method implements three phases: + +**Phase 1 — Identify flaws.** +Read the starting file. Build the flaw identification prompt with the file content and user's description. Call the LLM via `LLMExecutor.run()` using `llm.as_structured_llm(FlawIdentificationResult)`. Returns a list of `IdentifiedFlaw` objects. + +**Phase 2 — Recursive upstream trace.** +For each identified flaw, call `_trace_flaw_upstream(flaw, stage_name, current_file, depth)`: + +1. Look up the current stage's upstream stages via the registry. +2. For each upstream stage, resolve its output files on disk. +3. Read each upstream file. Build the upstream check prompt. Call the LLM. +4. If `found=True`: append to the trace chain and recurse into that stage's upstream dependencies. +5. If `found=False`: this branch is clean, stop. +6. If depth reaches `max_depth`: stop and mark trace as incomplete. + +**Deduplication:** Track which `(stage_name, flaw_description)` pairs have already been analyzed. If two flaws converge on the same upstream file, reuse the earlier result. 
+ +**Multiple upstream branches:** When a stage has multiple upstream inputs and the flaw is found in more than one, follow all branches. The trace can fork — the JSON output represents this as a list of trace entries per flaw (each entry has a stage and file), ordered from downstream to upstream. + +**Phase 3 — Source code analysis at origin.** +When a flaw is found in a stage's output but not in any of its inputs, that stage is the origin. Read the source code files for that stage (via registry). Build the source code analysis prompt. Call the LLM. Attach the result to the flaw's origin data. + +### `output.py` — Report Generation + +Two functions: + +**`write_json_report(result: FlawTraceResult, output_path: Path)`** + +Writes the full trace as JSON: + +```json +{ + "input": { + "starting_file": "030-report.html", + "flaw_description": "...", + "output_dir": "/path/to/output", + "timestamp": "2026-04-05T14:30:00Z" + }, + "flaws": [ + { + "id": "flaw_001", + "description": "Budget of CZK 500,000 is unvalidated", + "severity": "HIGH", + "starting_evidence": "quote from starting file...", + "trace": [ + { + "stage": "executive_summary", + "file": "025-2-executive_summary.md", + "evidence": "...", + "is_origin": false + }, + { + "stage": "make_assumptions", + "file": "003-5-make_assumptions.md", + "evidence": "...", + "is_origin": true + } + ], + "origin": { + "stage": "make_assumptions", + "file": "003-5-make_assumptions.md", + "source_code_files": ["stages/make_assumptions.py", "assumption/make_assumptions.py"], + "likely_cause": "The prompt asks the LLM to...", + "suggestion": "Add a validation step that..." 
+ }, + "depth": 2 + } + ], + "summary": { + "total_flaws": 3, + "deepest_origin_stage": "make_assumptions", + "deepest_origin_depth": 3, + "llm_calls_made": 12 + } +} +``` + +**`write_markdown_report(result: FlawTraceResult, output_path: Path)`** + +Writes a human-readable report: + +```markdown +# Flaw Trace Report + +**Input:** 030-report.html +**Flaws found:** 3 +**Deepest origin:** make_assumptions (depth 3) + +--- + +## Flaw 1 (HIGH): Budget of CZK 500,000 is unvalidated + +**Trace:** executive_summary -> project_plan -> **make_assumptions** (origin) + +| Stage | File | Evidence | +|-------|------|----------| +| executive_summary | 025-2-executive_summary.md | "The budget is CZK 500,000..." | +| project_plan | 005-2-project_plan.md | "Estimated budget: CZK 500,000..." | +| **make_assumptions** | 003-5-make_assumptions.md | "Assume total budget..." | + +**Root cause:** The prompt asks the LLM to generate budget assumptions +without requiring external data sources... + +**Suggestion:** Add a validation step that... +``` + +Flaws are sorted by depth (deepest origin first) so the most upstream root cause appears at the top. + +### `__main__.py` — CLI Entry Point + +``` +python -m worker_plan_internal.flaw_tracer \ + --dir /path/to/output \ + --file 030-report.html \ + --flaw "The budget is CZK 500,000 but this number appears unvalidated..." \ + --output-dir /path/to/output \ + --max-depth 15 \ + --verbose +``` + +Arguments: +- `--dir` (required): Path to the output directory containing intermediary files. +- `--file` (required): Starting file to analyze, relative to `--dir`. +- `--flaw` (required): Text description of the observed flaw(s). +- `--output-dir` (optional): Where to write `flaw_trace.json` and `flaw_trace.md`. Defaults to `--dir`. +- `--max-depth` (optional): Maximum upstream hops per flaw. Default 15. +- `--verbose` (optional): Print each LLM call and result to stderr as the trace runs. + +Orchestration: +1. Parse arguments. +2. 
Load model profile via `PlanExeLLMConfig.load()` and create `LLMExecutor` with priority-ordered models from the profile. +3. Create `FlawTracer` instance. +4. Call `tracer.trace(starting_file, flaw_description)`. +5. Write JSON and markdown reports via `output.py`. +6. Print summary to stderr. + +## LLM Infrastructure Integration + +- **LLMExecutor** with `LLMModelFromName.from_names()` for multi-model fallback. +- **Pydantic models** with `llm.as_structured_llm()` for all three prompt types. +- **Model profile** loaded from `PLANEXE_MODEL_PROFILE` environment variable (defaults to baseline). +- **RetryConfig** with defaults (2 retries, exponential backoff) for transient errors. +- **`max_validation_retries=1`** to allow one structured output retry with feedback on parse failure. + +## Scope Boundaries + +**In scope:** +- CLI tool with `--dir`, `--file`, `--flaw`, `--output-dir`, `--max-depth`, `--verbose`. +- Static registry of all ~48 pipeline stages with dependencies and source code paths. +- Recursive depth-first upstream tracing with three LLM prompt types. +- JSON + markdown output sorted by trace depth. +- Source code analysis only at origin stages (lazy evaluation). +- Full file contents sent to LLM (no chunking or summarization). + +**Out of scope (future work):** +- Library/module API (CLI first, refactor later). +- Integration as a Luigi pipeline stage. +- Approach B (full reverse-topological sweep). +- Approach C (scout-then-trace optimization). +- Automatic registry generation from Luigi task introspection. +- UI/web integration. 
From 6cb35c8fbb33887bba7f287872603bc8dccbb925 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:03:56 +0200 Subject: [PATCH 11/37] docs: add flaw_tracer README with usage instructions Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/README.md | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/README.md diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/flaw_tracer/README.md new file mode 100644 index 000000000..6716c17ca --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/README.md @@ -0,0 +1,60 @@ +# Flaw Tracer + +Root-cause analysis tool for PlanExe reports. Given a flaw observed in a pipeline output, it traces upstream through the DAG of intermediary files to find where the flaw originated. + +## How it works + +PlanExe runs a DAG of ~70 tasks. Each task reads upstream files, calls an LLM, and writes output files (prefixed `001-` through `030-`). Flaws introduced early propagate downstream into later stages and the final report. + +The flaw tracer performs a recursive depth-first search: + +1. **Phase 1 — Identify flaws.** Reads the starting file and asks the LLM to identify discrete flaws based on your description. +2. **Phase 2 — Trace upstream.** For each flaw, walks upstream through the DAG one hop at a time, asking the LLM whether the flaw or a precursor exists in each input file. Continues until it finds a stage where the flaw exists in the output but not in any inputs. +3. **Phase 3 — Analyze source code.** At the origin stage, reads the Python source code that generated the output and asks the LLM what in the prompt or logic likely caused the flaw. + +Output is a JSON file (`flaw_trace.json`) and a markdown report (`flaw_trace.md`), sorted by trace depth so the deepest root cause appears first. 
+ +## Usage + +From the `worker_plan/` directory: + +```bash +python -m worker_plan_internal.flaw_tracer \ + --dir /path/to/output \ + --file 030-report.html \ + --flaw "The budget is CZK 500,000 but this number appears unvalidated. No market sizing or unit economics are provided." \ + --verbose +``` + +### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `--dir` | Yes | Path to the output directory containing intermediary files | +| `--file` | Yes | Starting file to analyze (relative to `--dir`) | +| `--flaw` | Yes | Text description of the observed flaw(s) | +| `--output-dir` | No | Where to write reports (defaults to `--dir`) | +| `--max-depth` | No | Maximum upstream hops per flaw (default: 15) | +| `--verbose` | No | Print each LLM call to stderr as the trace runs | + +### Example + +```bash +python -m worker_plan_internal.flaw_tracer \ + --dir /Users/you/planexe-output/20250101_india_census \ + --file 025-2-executive_summary.md \ + --flaw "The budget claims CZK 500,000 but also states costs may exceed that by 20%. The budget is an unvalidated placeholder, not a reliable plan." 
\ + --output-dir /tmp/flaw-analysis \ + --verbose +``` + +This produces: +- `/tmp/flaw-analysis/flaw_trace.json` — machine-readable trace +- `/tmp/flaw-analysis/flaw_trace.md` — human-readable report + +## Running tests + +```bash +cd worker_plan +python -m pytest worker_plan_internal/flaw_tracer/tests/ -v +``` From 8b2e6ff75678e1462ba40ac48ba72a5572400470 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:21:40 +0200 Subject: [PATCH 12/37] docs: add flaw_tracer README with usage instructions Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/README.md | 64 +++++++++++++++---- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/flaw_tracer/README.md index 6716c17ca..5deb8c6db 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/README.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/README.md @@ -14,15 +14,27 @@ The flaw tracer performs a recursive depth-first search: Output is a JSON file (`flaw_trace.json`) and a markdown report (`flaw_trace.md`), sorted by trace depth so the deepest root cause appears first. +## Prerequisites + +- Python 3.11 (`/opt/homebrew/bin/python3.11` on macOS with Homebrew) +- An LLM configured via `PLANEXE_MODEL_PROFILE` environment variable (defaults to `baseline`) +- API key for your LLM provider (e.g., `OPENROUTER_API_KEY`) + ## Usage -From the `worker_plan/` directory: +All commands are run from the `worker_plan/` directory: ```bash -python -m worker_plan_internal.flaw_tracer \ +cd worker_plan +``` + +Basic usage: + +```bash +/opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ --dir /path/to/output \ --file 030-report.html \ - --flaw "The budget is CZK 500,000 but this number appears unvalidated. No market sizing or unit economics are provided." 
\ + --flaw "Description of the flaw you observed" \ --verbose ``` @@ -37,24 +49,54 @@ python -m worker_plan_internal.flaw_tracer \ | `--max-depth` | No | Maximum upstream hops per flaw (default: 15) | | `--verbose` | No | Print each LLM call to stderr as the trace runs | -### Example +### Starting files + +You can start from any intermediary file. Common starting points: + +| File | What it is | +|------|------------| +| `030-report.html` | The final HTML report (largest, most flaws to find) | +| `029-2-self_audit.md` | Self-audit (already identifies issues — good for tracing them back) | +| `025-2-executive_summary.md` | Executive summary | +| `024-2-review_plan.md` | Plan review | +| `028-2-premortem.md` | Premortem analysis | + +### Examples + +Trace a flaw from the self-audit: ```bash -python -m worker_plan_internal.flaw_tracer \ - --dir /Users/you/planexe-output/20250101_india_census \ +/opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ + --dir /path/to/output/20250101_india_census \ + --file 029-2-self_audit.md \ + --flaw "No Real-World Proof. The plan combines a digital census with caste enumeration at an unprecedented scale, lacking independent evidence of success." \ + --output-dir /tmp/flaw-analysis \ + --verbose +``` + +Trace a budget flaw from the executive summary: + +```bash +/opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ + --dir /path/to/output/20250101_india_census \ --file 025-2-executive_summary.md \ --flaw "The budget claims CZK 500,000 but also states costs may exceed that by 20%. The budget is an unvalidated placeholder, not a reliable plan." 
\ - --output-dir /tmp/flaw-analysis \ + --output-dir /tmp/flaw-analysis2 \ --verbose ``` -This produces: -- `/tmp/flaw-analysis/flaw_trace.json` — machine-readable trace -- `/tmp/flaw-analysis/flaw_trace.md` — human-readable report +### Output + +Each run produces two files in `--output-dir` (or `--dir` if not specified): + +- `flaw_trace.json` — machine-readable trace with full details +- `flaw_trace.md` — human-readable report with trace tables + +Flaws are sorted by trace depth (deepest root cause first). A typical run on a downstream file like `029-2-self_audit.md` finds 10-20 flaws and makes 100-200 LLM calls. ## Running tests ```bash cd worker_plan -python -m pytest worker_plan_internal/flaw_tracer/tests/ -v +/opt/homebrew/bin/python3.11 -m pytest worker_plan_internal/flaw_tracer/tests/ -v ``` From 5fce9f0d158ae061b30c87cf875f8e9310cb8dc0 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:25:16 +0200 Subject: [PATCH 13/37] feat: add events.jsonl live event log to flaw tracer Appends a JSONL line for each significant event during tracing: phase1_start/done, trace_flaw_start/done, upstream_check, upstream_found, origin_found, phase3_start, trace_complete. 
Monitor progress with: tail -f events.jsonl Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/__main__.py | 4 ++ .../flaw_tracer/tracer.py | 46 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py b/worker_plan/worker_plan_internal/flaw_tracer/__main__.py index 679f8ed38..3d47ccfb6 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/__main__.py @@ -76,11 +76,14 @@ def main() -> None: max_validation_retries=1, ) + events_path = report_dir / "events.jsonl" + tracer = FlawTracer( output_dir=output_dir, llm_executor=executor, max_depth=args.max_depth, verbose=args.verbose, + events_path=events_path, ) print(f"Tracing flaws in {starting_file}...", file=sys.stderr) @@ -101,6 +104,7 @@ def main() -> None: print(f"\nReports written:", file=sys.stderr) print(f" JSON: {json_path}", file=sys.stderr) print(f" Markdown: {md_path}", file=sys.stderr) + print(f" Events: {events_path}", file=sys.stderr) if __name__ == "__main__": diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index 54a7fc796..76ccc0182 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -2,9 +2,11 @@ """Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" from __future__ import annotations +import json import logging import sys from dataclasses import dataclass +from datetime import datetime, UTC from pathlib import Path from llama_index.core.llms.llm import LLM @@ -70,6 +72,31 @@ class FlawTraceResult: llm_calls_made: int = 0 +class EventLogger: + """Appends JSON events to a JSONL file for live monitoring. 
+ + Usage: tail -f events.jsonl + """ + + def __init__(self, path: Path | None): + self._path = path + if self._path: + self._path.parent.mkdir(parents=True, exist_ok=True) + # Truncate on start so each run is a fresh log + self._path.write_text("", encoding="utf-8") + + def log(self, event_type: str, **data: object) -> None: + if self._path is None: + return + entry = { + "timestamp": datetime.now(UTC).isoformat(), + "event": event_type, + **data, + } + with open(self._path, "a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + class FlawTracer: """Traces flaws upstream through the PlanExe pipeline DAG.""" @@ -79,6 +106,7 @@ def __init__( llm_executor: LLMExecutor, max_depth: int = 15, verbose: bool = False, + events_path: Path | None = None, ): self.output_dir = output_dir self.llm_executor = llm_executor @@ -86,6 +114,7 @@ def __init__( self.verbose = verbose self._llm_calls = 0 self._checked: set[tuple[str, str]] = set() # (stage_name, flaw_description) dedup + self._events = EventLogger(events_path) def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: """Main entry point. 
Identify flaws and trace each upstream.""" @@ -102,13 +131,19 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: # Phase 1: Identify flaws self._log(f"Phase 1: Identifying flaws in {starting_file}") + self._events.log("phase1_start", file=starting_file, stage=stage_name) identified = self._identify_flaws(starting_file, file_content, flaw_description) self._log(f" Found {len(identified.flaws)} flaw(s)") + self._events.log("phase1_done", flaws_found=len(identified.flaws), + summaries=[f.description for f in identified.flaws]) traced_flaws: list[TracedFlaw] = [] for i, flaw in enumerate(identified.flaws): flaw_id = f"flaw_{i + 1:03d}" self._log(f"\nTracing {flaw_id}: {flaw.description}") + self._events.log("trace_flaw_start", flaw_id=flaw_id, + flaw_index=i + 1, flaw_total=len(identified.flaws), + description=flaw.description, severity=flaw.severity) starting_entry = TraceEntry( stage=stage_name, @@ -137,16 +172,22 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: # Phase 3: Source code analysis at origin (always, when origin is known) if traced.origin_stage is not None: + self._events.log("phase3_start", flaw_id=flaw_id, origin_stage=traced.origin_stage) self._analyze_source_code( traced, traced.origin_stage, flaw.description, next((e.evidence for e in traced.trace if e.stage == traced.origin_stage), flaw.evidence) ) + self._events.log("trace_flaw_done", flaw_id=flaw_id, + origin_stage=traced.origin_stage, depth=traced.depth) traced_flaws.append(traced) # Sort by depth (deepest origin first) traced_flaws.sort(key=lambda f: f.depth, reverse=True) + self._events.log("trace_complete", total_flaws=len(traced_flaws), + llm_calls=self._llm_calls) + return FlawTraceResult( starting_file=starting_file, flaw_description=flaw_description, @@ -210,11 +251,15 @@ def _trace_upstream( upstream_content = upstream_path.read_text(encoding="utf-8") self._log(f" Checking upstream: {upstream_name} ({upstream_path.name})") + 
self._events.log("upstream_check", stage=upstream_name, + file=upstream_path.name, depth=depth) result = self._check_upstream(flaw_description, evidence, upstream_path.name, upstream_content) if result.found: self._log(f" -> FOUND in {upstream_name}") + self._events.log("upstream_found", stage=upstream_name, + file=upstream_path.name, depth=depth) found_upstream = True entry = TraceEntry( stage=upstream_name, @@ -238,6 +283,7 @@ def _trace_upstream( # Current stage is the origin — flaw exists here but not in any upstream traced.origin_stage = current_stage traced.depth = len(traced.trace) + self._events.log("origin_found", stage=current_stage, depth=traced.depth) # Mark the current stage entry as origin for entry in traced.trace: if entry.stage == current_stage: From 92936a42fef3ab89d262fc3798e8c0664cc2bc7a Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:41:29 +0200 Subject: [PATCH 14/37] fix: shorten event timestamp to HH:MM:SS Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/flaw_tracer/tracer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index 76ccc0182..03cddd9a6 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -89,7 +89,7 @@ def log(self, event_type: str, **data: object) -> None: if self._path is None: return entry = { - "timestamp": datetime.now(UTC).isoformat(), + "timestamp": datetime.now(UTC).strftime("%H:%M:%S"), "event": event_type, **data, } From ffff6738d9e58ab22f9e089c02f3fc393bd22408 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:43:17 +0200 Subject: [PATCH 15/37] fix: use compact UTC timestamp without subseconds Format: 2026-04-05T23:40:03Z Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/flaw_tracer/tracer.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index 03cddd9a6..9e86efeed 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -89,7 +89,7 @@ def log(self, event_type: str, **data: object) -> None: if self._path is None: return entry = { - "timestamp": datetime.now(UTC).strftime("%H:%M:%S"), + "timestamp": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), "event": event_type, **data, } From c5c7c15478c36b17a1632c81ae52c8999e8b4108 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:51:41 +0200 Subject: [PATCH 16/37] docs: add AGENTS.md with flaw tracer status and known issues Documents what works, what needs fixing (Phase 1 anchoring, loose upstream checks, long evidence quotes), and test run results. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/AGENTS.md | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md new file mode 100644 index 000000000..e8d4537a3 --- /dev/null +++ b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -0,0 +1,56 @@ +# Flaw Tracer — Status and Known Issues + +## What works well + +- **DAG traversal is correct.** The registry maps all 70 stages, upstream resolution works, dedup prevents redundant checks, depth limiting works. +- **Source code analysis gives actionable suggestions.** When the origin is correctly identified, the Phase 3 output points to specific prompt text and proposes concrete fixes. +- **Depth sorting is useful.** Deepest root causes appear first, which matches the user's intent of finding the earliest upstream origin. 
+- **Events.jsonl enables live monitoring.** Users can `tail -f events.jsonl` to watch progress instead of waiting blindly. +- **Evidence quoting works.** The LLM generally finds relevant passages in upstream files. + +## Known issues to fix + +### HIGH: Phase 1 doesn't anchor to the user's flaw description + +The user provides a specific flaw (e.g., "zoning and permits in Shanghai lack specifics") but Phase 1 identifies *different* flaws from the file instead. The LLM uses the description as inspiration rather than finding that exact flaw and closely related ones. + +**Root cause:** The Phase 1 prompt says "identify each discrete flaw" broadly. It should prioritize flaws matching the user's description, then optionally find additional ones. + +**Fix direction:** Restructure the Phase 1 prompt to first locate the user's specific flaw in the file, then look for related flaws. Consider splitting into "anchor the user's flaw" + "find additional flaws" as two steps. + +### MEDIUM: Upstream checks are too loose + +The LLM says "found" when an upstream file discusses a related *topic* rather than containing the actual *precursor* to the flaw. Example: a flaw about "Lead Room Designer talent availability" traces through "Room Design Complexity" evidence because both involve room design. + +**Root cause:** The upstream check prompt asks "does this file contain the same problem or a precursor?" — the word "precursor" is too vague. The LLM interprets topical similarity as causal connection. + +**Fix direction:** Make the upstream check prompt more specific. Require the LLM to explain the causal mechanism (how upstream content *caused* the downstream flaw), not just topical overlap. Consider asking the LLM to rate confidence (HIGH/MEDIUM/LOW) and only follow HIGH-confidence matches. + +### MEDIUM: Evidence quotes are too long + +Some evidence fields contain entire JSON objects (100+ lines) instead of the relevant snippet. 
This makes the output hard to read and wastes context window in downstream LLM calls. + +**Fix direction:** Add guidance to the upstream check prompt: "Quote only the specific sentence or phrase that demonstrates the flaw, not the entire surrounding object or section. Keep quotes under 200 characters." + +### LOW: Duplicate source code filenames are confusing + +When the stage file and implementation file have the same name (e.g., `identify_purpose.py` in both `stages/` and `assume/`), the output shows `["identify_purpose.py", "identify_purpose.py"]`. + +**Fix direction:** Include the parent directory in source code file names, e.g., `stages/identify_purpose.py` and `assume/identify_purpose.py`. + +### LOW: Most flaws converge on the same origin + +In test runs, 3 of 5 flaws trace back to `potential_levers`. This may be accurate (many downstream issues really do originate from lever identification) but could also indicate the upstream check is too eager. Worth monitoring across more runs to determine if this is a real pattern or an artifact of loose matching. + +## Test runs completed + +1. **India census** (`20250101_india_census`): Started from `029-2-self_audit.md`, 17 flaws found, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Many flaws traced to early pipeline stages. + +2. **Minecraft escape** (`20251016_minecraft_escape`): Started from `029-2-self_audit.md` with specific flaw about zoning/permits, 5 flaws found, 43 LLM calls, deepest origin: `identify_purpose` (depth 5). The user's specific flaw was not among the 5 identified — exposed the Phase 1 anchoring problem. + +## Architecture notes + +- The tool runs from `worker_plan/` directory using Python 3.11. +- LLM calls go through `LLMExecutor` with the active model profile (`PLANEXE_MODEL_PROFILE`). +- The `record_usage_metric called but no usage metrics path is set` warnings are harmless — the flaw tracer doesn't set up the metrics path since it's a standalone CLI tool, not a pipeline task. 
+- The first-match-wins strategy in `_trace_upstream` means only one upstream branch is followed per flaw. If the flaw exists in multiple upstream branches, only the first one encountered is traced. From d6c6a0d65305fc811c4e08ae3e721d23f27aa736 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:56:09 +0200 Subject: [PATCH 17/37] fix: anchor Phase 1 to user's flaw and tighten upstream checks Phase 1 prompt now requires the user's specific flaw as the first result, with additional flaws limited to the same problem family. Phase 2 prompt now requires causal mechanism (not just topical overlap) and limits evidence quotes to 200 characters. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/prompts.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py index 43f7cb8e9..18f623c54 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py @@ -45,9 +45,15 @@ def build_flaw_identification_messages( """Build messages for Phase 1: identifying discrete flaws in a file.""" system = ( "You are analyzing an intermediary file from a project planning pipeline.\n" - "The user has identified problems in this output. Identify each discrete flaw.\n" + "The user has described a specific flaw they observed. Your job:\n\n" + "1. FIRST, locate the user's specific flaw in the file. Find the passage that " + "corresponds to what the user described. This flaw MUST be the first item in your list.\n" + "2. THEN, identify any additional discrete flaws that are closely related to the " + "user's concern (e.g., other instances of the same problem pattern, or flaws that " + "share the same root cause). 
Do NOT list every possible flaw in the file — only " + "those connected to what the user raised.\n\n" "For each flaw, provide a short description (one sentence), a direct quote " - "from the file as evidence, and a severity level.\n" + "from the file as evidence (keep quotes under 200 characters), and a severity level.\n" "Only identify real flaws — do not flag stylistic preferences or minor formatting issues.\n" "Severity levels:\n" "- HIGH: fabricated data, invented statistics, or missing critical analysis\n" @@ -55,7 +61,7 @@ def build_flaw_identification_messages( "- LOW: minor gaps that don't significantly impact the plan" ) user = ( - f"User's observation:\n{user_flaw_description}\n\n" + f"User's flaw description:\n{user_flaw_description}\n\n" f"Filename: {filename}\n" f"File content:\n{file_content}" ) @@ -75,10 +81,14 @@ def build_upstream_check_messages( system = ( "You are tracing a flaw through a project planning pipeline to find where it originated.\n" "A downstream file contains a flaw. You are examining an upstream file that was an input " - "to the stage that produced the flawed output.\n" - "Determine if this upstream file contains the same problem or a precursor to it.\n" - "If YES: quote the relevant passage and explain how it connects to the downstream flaw.\n" - "If NO: explain why this file is clean regarding this specific flaw." + "to the stage that produced the flawed output.\n\n" + "Determine if this upstream file CAUSED or CONTRIBUTED to the downstream flaw.\n" + "This means the upstream file contains content that was carried forward, transformed, " + "or amplified into the downstream flaw. Merely discussing a related topic is NOT enough.\n\n" + "If YES: quote the specific sentence or phrase (under 200 characters) and explain " + "the causal mechanism — how this upstream content led to the downstream flaw.\n" + "If NO: explain why this file is clean regarding this specific flaw.\n\n" + "Be strict. 
Only say YES if you can identify a clear causal link, not just topical overlap." ) user = ( f"Flaw: {flaw_description}\n" From 2c9b401ffafe83b910aa90a8cb18f9ad1472b4c3 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:57:13 +0200 Subject: [PATCH 18/37] fix: disambiguate source code filenames with parent directory Shows "stages/identify_purpose.py" and "assume/identify_purpose.py" instead of "identify_purpose.py" twice. Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/flaw_tracer/tracer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index 9e86efeed..b19c8b47c 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -299,7 +299,8 @@ def _analyze_source_code(self, traced: TracedFlaw, stage_name: str, flaw_descrip for path in source_paths: if path.exists(): content = path.read_text(encoding="utf-8") - source_contents.append((path.name, content)) + short_name = f"{path.parent.name}/{path.name}" + source_contents.append((short_name, content)) if not source_contents: return From 8e20e9e5a833122345eae65105e668256811eb7d Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 01:58:19 +0200 Subject: [PATCH 19/37] =?UTF-8?q?docs:=20update=20AGENTS.md=20=E2=80=94=20?= =?UTF-8?q?mark=20fixed=20issues,=20add=20test=20run=20v2=20results?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/AGENTS.md | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md index e8d4537a3..ae8e1647f 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ 
b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -6,47 +6,46 @@ - **Source code analysis gives actionable suggestions.** When the origin is correctly identified, the Phase 3 output points to specific prompt text and proposes concrete fixes. - **Depth sorting is useful.** Deepest root causes appear first, which matches the user's intent of finding the earliest upstream origin. - **Events.jsonl enables live monitoring.** Users can `tail -f events.jsonl` to watch progress instead of waiting blindly. -- **Evidence quoting works.** The LLM generally finds relevant passages in upstream files. +- **Phase 1 anchors to the user's flaw.** The user's specific flaw is always the first result, with additional flaws limited to the same problem family. Verified on the Minecraft escape run — the zoning/permits flaw is now correctly identified and traced. +- **Upstream checks require causal links.** The prompt requires the LLM to explain *how* upstream content caused the downstream flaw, not just topical overlap. This produces tighter, more accurate traces. +- **Evidence quotes are concise.** Both Phase 1 and Phase 2 prompts instruct the LLM to keep quotes under 200 characters. +- **Source code filenames are disambiguated.** Shows `stages/identify_purpose.py` and `assume/identify_purpose.py` instead of duplicate bare filenames. -## Known issues to fix +## Fixed issues -### HIGH: Phase 1 doesn't anchor to the user's flaw description +### Phase 1 didn't anchor to user's flaw (was HIGH, fixed) -The user provides a specific flaw (e.g., "zoning and permits in Shanghai lack specifics") but Phase 1 identifies *different* flaws from the file instead. The LLM uses the description as inspiration rather than finding that exact flaw and closely related ones. +The Phase 1 prompt now requires the user's specific flaw as the first item, with additional flaws limited to the same problem family. Before the fix, the LLM would ignore the user's flaw and identify unrelated issues. 
-**Root cause:** The Phase 1 prompt says "identify each discrete flaw" broadly. It should prioritize flaws matching the user's description, then optionally find additional ones. +### Upstream checks were too loose (was MEDIUM, fixed) -**Fix direction:** Restructure the Phase 1 prompt to first locate the user's specific flaw in the file, then look for related flaws. Consider splitting into "anchor the user's flaw" + "find additional flaws" as two steps. +The Phase 2 prompt now requires a causal mechanism ("how did this upstream content lead to the downstream flaw?") and explicitly rejects topical overlap. Before the fix, the LLM would say "found" whenever an upstream file discussed a related topic. -### MEDIUM: Upstream checks are too loose +### Evidence quotes were too long (was MEDIUM, fixed) -The LLM says "found" when an upstream file discusses a related *topic* rather than containing the actual *precursor* to the flaw. Example: a flaw about "Lead Room Designer talent availability" traces through "Room Design Complexity" evidence because both involve room design. +Both Phase 1 and Phase 2 prompts now instruct "keep quotes under 200 characters." Before the fix, evidence fields contained entire JSON objects (100+ lines). -**Root cause:** The upstream check prompt asks "does this file contain the same problem or a precursor?" — the word "precursor" is too vague. The LLM interprets topical similarity as causal connection. +### Duplicate source code filenames (was LOW, fixed) -**Fix direction:** Make the upstream check prompt more specific. Require the LLM to explain the causal mechanism (how upstream content *caused* the downstream flaw), not just topical overlap. Consider asking the LLM to rate confidence (HIGH/MEDIUM/LOW) and only follow HIGH-confidence matches. +Source code paths now include the parent directory (`stages/identify_purpose.py`) to disambiguate files with the same name in different packages. 
-### MEDIUM: Evidence quotes are too long +## Open issues to monitor -Some evidence fields contain entire JSON objects (100+ lines) instead of the relevant snippet. This makes the output hard to read and wastes context window in downstream LLM calls. +### LOW: Flaw convergence on same origin -**Fix direction:** Add guidance to the upstream check prompt: "Quote only the specific sentence or phrase that demonstrates the flaw, not the entire surrounding object or section. Keep quotes under 200 characters." +In the first test run (India census, before prompt fixes), 3 of 5 flaws traced back to `potential_levers`. After the prompt tightening, the Minecraft escape run showed all 3 flaws converging on `identify_risks` — but this makes sense since all 3 flaws were about the same problem family (missing regulatory specifics). Monitor across more diverse runs to determine if convergence is a real pattern or an artifact. -### LOW: Duplicate source code filenames are confusing +### LOW: First-match-wins may miss parallel origins -When the stage file and implementation file have the same name (e.g., `identify_purpose.py` in both `stages/` and `assume/`), the output shows `["identify_purpose.py", "identify_purpose.py"]`. - -**Fix direction:** Include the parent directory in source code file names, e.g., `stages/identify_purpose.py` and `assume/identify_purpose.py`. - -### LOW: Most flaws converge on the same origin - -In test runs, 3 of 5 flaws trace back to `potential_levers`. This may be accurate (many downstream issues really do originate from lever identification) but could also indicate the upstream check is too eager. Worth monitoring across more runs to determine if this is a real pattern or an artifact of loose matching. +The `_trace_upstream` method follows only the first upstream branch where the flaw is found. If a flaw has precursors in multiple parallel branches, only one is traced. This is a deliberate efficiency trade-off. 
If users report missing origins, consider adding a mode that follows all branches. ## Test runs completed -1. **India census** (`20250101_india_census`): Started from `029-2-self_audit.md`, 17 flaws found, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Many flaws traced to early pipeline stages. +1. **India census** (`20250101_india_census`): Started from `029-2-self_audit.md`, 17 flaws found, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Run with old prompts — flaws were not anchored to user input, traces were loose. + +2. **Minecraft escape v1** (`20251016_minecraft_escape`): Started from `029-2-self_audit.md` with flaw about zoning/permits. Old prompts: 5 flaws found, 43 LLM calls, user's flaw not identified. Exposed the Phase 1 anchoring problem. -2. **Minecraft escape** (`20251016_minecraft_escape`): Started from `029-2-self_audit.md` with specific flaw about zoning/permits, 5 flaws found, 43 LLM calls, deepest origin: `identify_purpose` (depth 5). The user's specific flaw was not among the 5 identified — exposed the Phase 1 anchoring problem. +3. **Minecraft escape v2** (`20251016_minecraft_escape`): Same input, new prompts. 3 flaws found, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's zoning/permits flaw correctly identified as flaw_001. All 3 flaws in the same problem family (regulatory gaps). Evidence quotes concise. Traces causally sound. ## Architecture notes From ea93202374a0bcd712dd76384f434f8fd24f814e Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 02:50:22 +0200 Subject: [PATCH 20/37] docs: add Phase 3 classification limitation and India census v2 results Phase 3 always blames the prompt, but some flaws are inherent domain complexity. Future improvement: classify root causes into prompt-fixable, domain complexity, and missing input data. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/flaw_tracer/AGENTS.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md index ae8e1647f..e0b5ef2e9 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -39,6 +39,19 @@ In the first test run (India census, before prompt fixes), 3 of 5 flaws traced b The `_trace_upstream` method follows only the first upstream branch where the flaw is found. If a flaw has precursors in multiple parallel branches, only one is traced. This is a deliberate efficiency trade-off. If users report missing origins, consider adding a mode that follows all branches. +### MEDIUM: Phase 3 always blames the prompt + +Phase 3 (source code analysis) always frames the root cause as a prompt engineering problem, because the prompt is all it can see. But some "flaws" aren't prompt-fixable — they're inherent domain complexity. + +Example: The India census flaw tracer identified "caste enumeration lacks evidence of success" and suggested adding prompt guidelines for sensitive topics. But caste enumeration is a genuinely contentious political issue in India (BJP resisted it for years, opposition demanded it, Bihar ran its own count in 2023). No prompt change can resolve that — the plan correctly identified it as a high-stakes lever. The trace *chain* was right (self_audit → strategic_decisions → levers → plan prompt), but the *suggestion* oversimplified by treating domain complexity as a prompt gap. + +**Fix direction:** Teach Phase 3 to classify the root cause into categories: +1. **Prompt-fixable** — the prompt forgot to ask for something (e.g., "list specific permits with lead times"). Suggestion: modify the prompt. +2. 
**Domain complexity** — the topic is inherently uncertain, contentious, or requires domain expertise the LLM doesn't have. Suggestion: flag for human review, add external data sources, or accept as a known limitation. +3. **Missing input data** — the plan prompt didn't provide enough context for the pipeline to work with. Suggestion: improve the user's input. + +This would make the suggestions more honest and actionable instead of always defaulting to "modify the system prompt." + ## Test runs completed 1. **India census** (`20250101_india_census`): Started from `029-2-self_audit.md`, 17 flaws found, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Run with old prompts — flaws were not anchored to user input, traces were loose. @@ -47,6 +60,8 @@ The `_trace_upstream` method follows only the first upstream branch where the fl 3. **Minecraft escape v2** (`20251016_minecraft_escape`): Same input, new prompts. 3 flaws found, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's zoning/permits flaw correctly identified as flaw_001. All 3 flaws in the same problem family (regulatory gaps). Evidence quotes concise. Traces causally sound. +4. **India census v2** (`20250101_india_census`): Same input as run 1, new prompts. 2 flaws found (down from 17), 17 LLM calls (down from 153), deepest origin: `potential_levers` (depth 6). User's "no real-world proof" flaw correctly identified as flaw_001. Flaw_002 closely related (workforce feasibility). Evidence concise, source files disambiguated. Exposed the Phase 3 limitation: suggestion said "add prompt guidelines for sensitive topics" when the real issue is inherent domain complexity (caste enumeration is politically contentious in India regardless of prompt engineering). + ## Architecture notes - The tool runs from `worker_plan/` directory using Python 3.11. 
From 14045633b6edad95d761bd0bdba15e364491f8ca Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 02:53:18 +0200 Subject: [PATCH 21/37] feat: classify Phase 3 root causes into prompt_fixable, domain_complexity, missing_input Phase 3 now categorizes each root cause so suggestions are honest: - prompt_fixable: the prompt has a gap that can be edited - domain_complexity: inherently uncertain/contentious, no prompt change resolves it - missing_input: the user's plan didn't provide enough detail Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/output.py | 9 +++++++ .../flaw_tracer/prompts.py | 27 +++++++++++++++---- .../flaw_tracer/tests/test_output.py | 1 + .../flaw_tracer/tests/test_prompts.py | 11 ++++++++ .../flaw_tracer/tracer.py | 2 ++ 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/output.py b/worker_plan/worker_plan_internal/flaw_tracer/output.py index 583a0d071..205a28964 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/output.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/output.py @@ -53,6 +53,7 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: "stage": flaw.origin.stage, "file": flaw.origin.file, "source_code_files": flaw.origin.source_code_files, + "category": flaw.origin.category, "likely_cause": flaw.origin.likely_cause, "suggestion": flaw.origin.suggestion, } @@ -117,6 +118,14 @@ def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: # Origin analysis if flaw.origin: + category_labels = { + "prompt_fixable": "Prompt fixable", + "domain_complexity": "Domain complexity", + "missing_input": "Missing input", + } + category_label = category_labels.get(flaw.origin.category, flaw.origin.category) + lines.append(f"**Category:** {category_label}") + lines.append("") lines.append(f"**Root cause:** {flaw.origin.likely_cause}") lines.append("") lines.append(f"**Source files:** {', 
'.join(flaw.origin.source_code_files)}") diff --git a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py index 18f623c54..600d59d77 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py @@ -30,7 +30,14 @@ class UpstreamCheckResult(BaseModel): class SourceCodeAnalysisResult(BaseModel): """Result of analyzing source code at a flaw's origin stage.""" - likely_cause: str = Field(description="What in the prompt or logic likely caused the flaw") + category: Literal["prompt_fixable", "domain_complexity", "missing_input"] = Field( + description=( + "prompt_fixable: the prompt forgot to ask for something or has a gap that can be fixed by editing the prompt. " + "domain_complexity: the topic is inherently uncertain, contentious, or requires domain expertise that no prompt change can resolve. " + "missing_input: the user's plan prompt didn't provide enough context for the pipeline to work with." + ) + ) + likely_cause: str = Field(description="What in the prompt, logic, or domain caused the flaw") relevant_code_section: str = Field(description="The specific code or prompt text responsible") suggestion: str = Field(description="How to fix or prevent this flaw") @@ -114,10 +121,20 @@ def build_source_code_analysis_messages( """ system = ( "A flaw was introduced at this pipeline stage. The flaw exists in its output " - "but NOT in any of its inputs, so this stage created it.\n" - "Examine the source code to identify what in the prompt text, logic, or processing " - "likely caused this flaw. Be specific — point to lines or prompt phrases.\n" - "Focus on the system prompt text and the data transformation logic." 
+ "but NOT in any of its inputs, so this stage created it.\n\n" + "First, classify the root cause into one of three categories:\n" + "- prompt_fixable: The prompt has a gap or oversight that can be fixed by editing " + "the prompt text. Example: the prompt asks for budget estimates but doesn't require " + "sourcing or validation.\n" + "- domain_complexity: The topic is inherently uncertain, politically sensitive, or " + "requires specialized domain expertise that no prompt change can fully resolve. " + "Example: caste enumeration in India is politically contentious regardless of how " + "the prompt is worded.\n" + "- missing_input: The user's original plan description didn't provide enough detail " + "for this stage to produce quality output. Example: the plan says 'open a shop' " + "without specifying location, budget, or target market.\n\n" + "Then examine the source code to identify the specific cause. Be specific — point " + "to lines or prompt phrases. Focus on the system prompt text and data transformation logic." 
) source_sections = [] for fname, content in source_code_contents: diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py index 46ed25e54..5bfe535ae 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py @@ -35,6 +35,7 @@ def _make_sample_result() -> FlawTraceResult: stage="make_assumptions", file="003-5-make_assumptions.md", source_code_files=["make_assumptions.py"], + category="prompt_fixable", likely_cause="Prompt generates budget without data", suggestion="Add validation step", ), diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py index cbe79c920..3262e8372 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py @@ -46,12 +46,23 @@ def test_upstream_check_result_not_found(self): def test_source_code_analysis_result(self): result = SourceCodeAnalysisResult( + category="prompt_fixable", likely_cause="prompt lacks validation", relevant_code_section="system_prompt = ...", suggestion="add grounding check", ) + self.assertEqual(result.category, "prompt_fixable") self.assertIsInstance(result.likely_cause, str) + def test_source_code_analysis_rejects_invalid_category(self): + with self.assertRaises(Exception): + SourceCodeAnalysisResult( + category="unknown_category", + likely_cause="test", + relevant_code_section="test", + suggestion="test", + ) + class TestBuildFlawIdentificationMessages(unittest.TestCase): def test_returns_chat_messages(self): diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index b19c8b47c..e0585fc3d 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ 
b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -44,6 +44,7 @@ class OriginInfo: stage: str file: str source_code_files: list[str] + category: str # "prompt_fixable", "domain_complexity", or "missing_input" likely_cause: str suggestion: str @@ -321,6 +322,7 @@ def execute(llm: LLM) -> SourceCodeAnalysisResult: stage=stage_name, file=traced.trace[-1].file if traced.trace else "", source_code_files=source_file_names, + category=analysis.category, likely_cause=analysis.likely_cause, suggestion=analysis.suggestion, ) From 264e76cfbcc199349c45172273701b0fa75e307b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 03:27:59 +0200 Subject: [PATCH 22/37] docs: update README and AGENTS with Phase 3 classification and test results README: document category field, events.jsonl, updated examples and typical run stats. AGENTS: move Phase 3 to fixed, add India census v3 results, update what-works-well. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/AGENTS.md | 43 +++++++++---------- .../flaw_tracer/README.md | 34 ++++++++++----- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md index e0b5ef2e9..fb624c797 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -3,13 +3,14 @@ ## What works well - **DAG traversal is correct.** The registry maps all 70 stages, upstream resolution works, dedup prevents redundant checks, depth limiting works. -- **Source code analysis gives actionable suggestions.** When the origin is correctly identified, the Phase 3 output points to specific prompt text and proposes concrete fixes. -- **Depth sorting is useful.** Deepest root causes appear first, which matches the user's intent of finding the earliest upstream origin. 
-- **Events.jsonl enables live monitoring.** Users can `tail -f events.jsonl` to watch progress instead of waiting blindly. -- **Phase 1 anchors to the user's flaw.** The user's specific flaw is always the first result, with additional flaws limited to the same problem family. Verified on the Minecraft escape run — the zoning/permits flaw is now correctly identified and traced. +- **Phase 1 anchors to the user's flaw.** The user's specific flaw is always the first result, with additional flaws limited to the same problem family. - **Upstream checks require causal links.** The prompt requires the LLM to explain *how* upstream content caused the downstream flaw, not just topical overlap. This produces tighter, more accurate traces. +- **Phase 3 classifies root causes.** Each origin is categorized as `prompt_fixable`, `domain_complexity`, or `missing_input`. Verified: the India census caste enumeration flaw is correctly classified as `domain_complexity`, while the workforce feasibility flaw is `prompt_fixable`. - **Evidence quotes are concise.** Both Phase 1 and Phase 2 prompts instruct the LLM to keep quotes under 200 characters. - **Source code filenames are disambiguated.** Shows `stages/identify_purpose.py` and `assume/identify_purpose.py` instead of duplicate bare filenames. +- **Depth sorting is useful.** Deepest root causes appear first, matching the user's intent of finding the earliest upstream origin. +- **Events.jsonl enables live monitoring.** Users can `tail -f events.jsonl` to watch progress instead of waiting blindly. +- **Focused output.** A typical run finds 2-3 flaws in the same problem family and makes 15-30 LLM calls (down from 17 flaws / 153 calls before prompt improvements). ## Fixed issues @@ -25,6 +26,15 @@ The Phase 2 prompt now requires a causal mechanism ("how did this upstream conte Both Phase 1 and Phase 2 prompts now instruct "keep quotes under 200 characters." Before the fix, evidence fields contained entire JSON objects (100+ lines). 
+### Phase 3 always blamed the prompt (was MEDIUM, fixed) + +Phase 3 now classifies each root cause into one of three categories: +- **prompt_fixable** — the prompt has a gap that can be edited (e.g., "list specific permits with lead times") +- **domain_complexity** — inherently uncertain or contentious, no prompt change resolves it (e.g., caste enumeration politics in India) +- **missing_input** — the user's plan didn't provide enough detail + +Before the fix, every suggestion was "modify the system prompt" even when the real issue was domain complexity. Verified on India census run: caste enumeration correctly classified as `domain_complexity`, workforce feasibility as `prompt_fixable`. + ### Duplicate source code filenames (was LOW, fixed) Source code paths now include the parent directory (`stages/identify_purpose.py`) to disambiguate files with the same name in different packages. @@ -33,34 +43,23 @@ Source code paths now include the parent directory (`stages/identify_purpose.py` ### LOW: Flaw convergence on same origin -In the first test run (India census, before prompt fixes), 3 of 5 flaws traced back to `potential_levers`. After the prompt tightening, the Minecraft escape run showed all 3 flaws converging on `identify_risks` — but this makes sense since all 3 flaws were about the same problem family (missing regulatory specifics). Monitor across more diverse runs to determine if convergence is a real pattern or an artifact. +After prompt tightening, convergence now makes sense — flaws in the same problem family naturally trace to the same origin. The Minecraft escape run had all 3 regulatory flaws converge on `identify_risks`, which is correct. Monitor across more diverse runs. ### LOW: First-match-wins may miss parallel origins The `_trace_upstream` method follows only the first upstream branch where the flaw is found. If a flaw has precursors in multiple parallel branches, only one is traced. This is a deliberate efficiency trade-off. 
If users report missing origins, consider adding a mode that follows all branches. -### MEDIUM: Phase 3 always blames the prompt - -Phase 3 (source code analysis) always frames the root cause as a prompt engineering problem, because the prompt is all it can see. But some "flaws" aren't prompt-fixable — they're inherent domain complexity. - -Example: The India census flaw tracer identified "caste enumeration lacks evidence of success" and suggested adding prompt guidelines for sensitive topics. But caste enumeration is a genuinely contentious political issue in India (BJP resisted it for years, opposition demanded it, Bihar ran its own count in 2023). No prompt change can resolve that — the plan correctly identified it as a high-stakes lever. The trace *chain* was right (self_audit → strategic_decisions → levers → plan prompt), but the *suggestion* oversimplified by treating domain complexity as a prompt gap. - -**Fix direction:** Teach Phase 3 to classify the root cause into categories: -1. **Prompt-fixable** — the prompt forgot to ask for something (e.g., "list specific permits with lead times"). Suggestion: modify the prompt. -2. **Domain complexity** — the topic is inherently uncertain, contentious, or requires domain expertise the LLM doesn't have. Suggestion: flag for human review, add external data sources, or accept as a known limitation. -3. **Missing input data** — the plan prompt didn't provide enough context for the pipeline to work with. Suggestion: improve the user's input. - -This would make the suggestions more honest and actionable instead of always defaulting to "modify the system prompt." - ## Test runs completed -1. **India census** (`20250101_india_census`): Started from `029-2-self_audit.md`, 17 flaws found, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Run with old prompts — flaws were not anchored to user input, traces were loose. +1. **India census v1** (`20250101_india_census`): Old prompts. 
17 flaws, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Flaws not anchored to user input, traces loose, evidence bloated. + +2. **Minecraft escape v1** (`20251016_minecraft_escape`): Old prompts. Flaw about zoning/permits. 5 flaws, 43 LLM calls. User's flaw not identified. Exposed Phase 1 anchoring problem. -2. **Minecraft escape v1** (`20251016_minecraft_escape`): Started from `029-2-self_audit.md` with flaw about zoning/permits. Old prompts: 5 flaws found, 43 LLM calls, user's flaw not identified. Exposed the Phase 1 anchoring problem. +3. **Minecraft escape v2** (`20251016_minecraft_escape`): New prompts. 3 flaws, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's flaw correctly identified as flaw_001. All flaws in same problem family (regulatory gaps). -3. **Minecraft escape v2** (`20251016_minecraft_escape`): Same input, new prompts. 3 flaws found, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's zoning/permits flaw correctly identified as flaw_001. All 3 flaws in the same problem family (regulatory gaps). Evidence quotes concise. Traces causally sound. +4. **India census v2** (`20250101_india_census`): New prompts. 2 flaws (down from 17), 17 LLM calls (down from 153), deepest origin: `potential_levers` (depth 6). User's flaw correctly identified. Exposed Phase 3 "always blames prompt" limitation. -4. **India census v2** (`20250101_india_census`): Same input as run 1, new prompts. 2 flaws found (down from 17), 17 LLM calls (down from 153), deepest origin: `potential_levers` (depth 6). User's "no real-world proof" flaw correctly identified as flaw_001. Flaw_002 closely related (workforce feasibility). Evidence concise, source files disambiguated. Exposed the Phase 3 limitation: suggestion said "add prompt guidelines for sensitive topics" when the real issue is inherent domain complexity (caste enumeration is politically contentious in India regardless of prompt engineering). +5. 
**India census v3** (`20250101_india_census`): New prompts + Phase 3 classification. 2 flaws, 17 LLM calls. Caste enumeration correctly classified as `domain_complexity`. Workforce feasibility correctly classified as `prompt_fixable`. All fixes verified working. ## Architecture notes diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/flaw_tracer/README.md index 5deb8c6db..9d1c8e8ad 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/README.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/README.md @@ -8,11 +8,14 @@ PlanExe runs a DAG of ~70 tasks. Each task reads upstream files, calls an LLM, a The flaw tracer performs a recursive depth-first search: -1. **Phase 1 — Identify flaws.** Reads the starting file and asks the LLM to identify discrete flaws based on your description. -2. **Phase 2 — Trace upstream.** For each flaw, walks upstream through the DAG one hop at a time, asking the LLM whether the flaw or a precursor exists in each input file. Continues until it finds a stage where the flaw exists in the output but not in any inputs. -3. **Phase 3 — Analyze source code.** At the origin stage, reads the Python source code that generated the output and asks the LLM what in the prompt or logic likely caused the flaw. +1. **Phase 1 — Identify flaws.** Reads the starting file and locates the specific flaw you described, plus any closely related flaws in the same problem family. +2. **Phase 2 — Trace upstream.** For each flaw, walks upstream through the DAG one hop at a time, asking the LLM whether the flaw was *caused by* content in each input file (requires causal link, not just topical overlap). Continues until it finds a stage where the flaw exists in the output but not in any inputs. +3. 
**Phase 3 — Analyze source code and classify.** At the origin stage, reads the Python source code and classifies the root cause: + - **Prompt fixable** — the prompt has a gap that can be fixed by editing it + - **Domain complexity** — the topic is inherently uncertain or contentious, no prompt change resolves it + - **Missing input** — the user's plan prompt didn't provide enough detail -Output is a JSON file (`flaw_trace.json`) and a markdown report (`flaw_trace.md`), sorted by trace depth so the deepest root cause appears first. +Output is a JSON file (`flaw_trace.json`), a markdown report (`flaw_trace.md`), and a live event log (`events.jsonl`), sorted by trace depth so the deepest root cause appears first. ## Prerequisites @@ -74,25 +77,36 @@ Trace a flaw from the self-audit: --verbose ``` -Trace a budget flaw from the executive summary: +Trace a zoning/permits flaw: ```bash /opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ - --dir /path/to/output/20250101_india_census \ - --file 025-2-executive_summary.md \ - --flaw "The budget claims CZK 500,000 but also states costs may exceed that by 20%. The budget is an unvalidated placeholder, not a reliable plan." \ + --dir /path/to/output/20251016_minecraft_escape \ + --file 029-2-self_audit.md \ + --flaw "Infeasible Constraints Rated MEDIUM because the plan mentions zoning and permits but lacks specifics for the Shanghai location." 
\ --output-dir /tmp/flaw-analysis2 \ --verbose ``` +### Monitoring progress + +While the tracer runs, watch the live event log in another terminal: + +```bash +tail -f /tmp/flaw-analysis/events.jsonl +``` + ### Output -Each run produces two files in `--output-dir` (or `--dir` if not specified): +Each run produces three files in `--output-dir` (or `--dir` if not specified): - `flaw_trace.json` — machine-readable trace with full details - `flaw_trace.md` — human-readable report with trace tables +- `events.jsonl` — live event log for monitoring progress + +Flaws are sorted by trace depth (deepest root cause first). Each flaw's origin includes a **category** (`prompt_fixable`, `domain_complexity`, or `missing_input`) so you know whether the fix is a prompt edit, a domain limitation to accept, or a need for more detail in the plan input. -Flaws are sorted by trace depth (deepest root cause first). A typical run on a downstream file like `029-2-self_audit.md` finds 10-20 flaws and makes 100-200 LLM calls. +A typical run finds 2-3 focused flaws and makes 15-30 LLM calls. ## Running tests From db3b7884f7d5576e72d565af55424a0b0bf11174 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Mon, 6 Apr 2026 03:32:48 +0200 Subject: [PATCH 23/37] docs: add tips, limitations, honest assessment, and open issues README: add Tips section (start from self_audit, trust chains over suggestions, check category, results are non-deterministic) and Limitations section (LLM subjectivity, first-match-wins, static registry, text-only, diagnostic not prescriptive). AGENTS: add non-determinism and registry drift as MEDIUM open issues, add honest assessment section with guidance on what to trust. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/AGENTS.md | 32 ++++++++++++++++--- .../flaw_tracer/README.md | 15 +++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md index fb624c797..e132d67f9 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -39,15 +39,27 @@ Before the fix, every suggestion was "modify the system prompt" even when the re Source code paths now include the parent directory (`stages/identify_purpose.py`) to disambiguate files with the same name in different packages. -## Open issues to monitor +## Open issues -### LOW: Flaw convergence on same origin +### MEDIUM: Non-determinism untested + +This is LLM judging LLM output. Every upstream check is a subjective call. Two runs on the same input may produce different traces. We haven't tested reproducibility — run the same input 3 times and compare. If traces diverge significantly, consider requiring higher-confidence matches or running multiple passes and intersecting results. + +### MEDIUM: Static registry will drift + +The DAG mapping in `registry.py` is a hand-maintained copy of the pipeline topology. Adding, removing, or renaming stages requires a manual update — the registry won't auto-detect changes. If the registry falls out of sync, traces will silently miss stages or follow wrong paths. -After prompt tightening, convergence now makes sense — flaws in the same problem family naturally trace to the same origin. The Minecraft escape run had all 3 regulatory flaws converge on `identify_risks`, which is correct. Monitor across more diverse runs. +**Fix direction:** Generate the registry from Luigi task introspection at startup, or add a CI check that compares the registry against the actual task classes. 
### LOW: First-match-wins may miss parallel origins -The `_trace_upstream` method follows only the first upstream branch where the flaw is found. If a flaw has precursors in multiple parallel branches, only one is traced. This is a deliberate efficiency trade-off. If users report missing origins, consider adding a mode that follows all branches. +The `_trace_upstream` method follows only the first upstream branch where the flaw is found. Real flaws often have multiple contributing causes from parallel branches, but only one is traced. The trace looks clean and linear, but reality is messier. + +**Fix direction:** Add a `--thorough` mode that follows all branches where the flaw is found, producing a tree instead of a chain. + +### LOW: Flaw convergence on same origin + +After prompt tightening, convergence makes sense — flaws in the same problem family naturally trace to the same origin. Monitor across more diverse runs. ## Test runs completed @@ -61,6 +73,18 @@ The `_trace_upstream` method follows only the first upstream branch where the fl 5. **India census v3** (`20250101_india_census`): New prompts + Phase 3 classification. 2 flaws, 17 LLM calls. Caste enumeration correctly classified as `domain_complexity`. Workforce feasibility correctly classified as `prompt_fixable`. All fixes verified working. +## Honest assessment + +The tool is a useful diagnostic prototype. The trace chains are the most trustworthy part — they're mechanically grounded in the DAG structure. The suggestions are LLM opinions — useful starting points, not patches. + +The category classification (`prompt_fixable` / `domain_complexity` / `missing_input`) turned out to be the most valuable feature. It prevents wasted effort on flaws that can't be fixed by prompt editing. + +The tool is diagnostic, not prescriptive. It tells you *where* a flaw originated and *why*, but someone still has to decide what to do. 
It can't catch flaws that don't leave textual evidence — timing issues, model-specific quirks, or structural DAG problems are invisible. + +Starting from `029-2-self_audit.md` is the sweet spot. That file already contains identified issues, so the tracer is tracing known problems upstream rather than discovering flaws from scratch. + +Before relying on this for automated decisions (e.g., in the self-improve loop), it needs more diverse test runs (10+ plans) and reproducibility testing. + ## Architecture notes - The tool runs from `worker_plan/` directory using Python 3.11. diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/flaw_tracer/README.md index 9d1c8e8ad..a97c24040 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/README.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/README.md @@ -108,6 +108,21 @@ Flaws are sorted by trace depth (deepest root cause first). Each flaw's origin i A typical run finds 2-3 focused flaws and makes 15-30 LLM calls. +## Tips + +- **Start from `029-2-self_audit.md`.** This file already contains identified issues, so you're tracing *known* problems upstream rather than asking the LLM to find flaws from scratch. +- **Trust the trace chains more than the suggestions.** The upstream path (which stages the flaw passed through) is mechanically grounded in the DAG. The suggestions are LLM opinions — useful starting points, not patches. +- **Check the category before acting.** If the origin is `domain_complexity`, don't spend time tweaking the prompt. If it's `prompt_fixable`, the suggestion is likely actionable. +- **Results are non-deterministic.** This is LLM judging LLM output. Two runs on the same input may produce slightly different traces. If a finding matters, run it twice. + +## Limitations + +- **LLM subjectivity.** Every hop in the trace is a judgment call by the LLM ("did this upstream file cause the downstream flaw?"). 
The causal-link requirement helps, but it's still one LLM's opinion. +- **First-match-wins.** When a flaw has precursors in multiple parallel upstream branches, only the first branch found is followed. Real flaws often have multiple contributing causes. +- **Static registry.** The DAG mapping is hand-maintained in `registry.py`. Adding, removing, or renaming pipeline stages requires a manual registry update — it won't auto-detect changes. +- **Text-only.** The tracer can only catch flaws that leave textual evidence in intermediary files. Timing issues, model-specific quirks, or structural DAG problems are invisible to it. +- **Diagnostic, not prescriptive.** It tells you *where* and *why*, but someone still has to decide what to do about it. + ## Running tests ```bash From 89eba47e453eba6cba8c687c2618fe3ad0082108 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 7 Apr 2026 19:31:51 +0200 Subject: [PATCH 24/37] refactor: replace hand-maintained registry with extract_dag introspection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flaw tracer's registry.py now builds from extract_dag() at import time instead of a 780-line static listing. The public lookup functions (find_stage_by_filename, get_upstream_files, get_source_code_paths) are unchanged. The StageInfo field upstream_stages is renamed to depends_on (tests updated accordingly) to match the extract_dag JSON schema. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/registry.py | 768 +----------------- .../flaw_tracer/tests/test_registry.py | 4 +- 2 files changed, 45 insertions(+), 727 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index 02394a354..a664a8142 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -1,747 +1,65 @@ # worker_plan/worker_plan_internal/flaw_tracer/registry.py -"""Static DAG mapping for the PlanExe pipeline. 
+"""DAG registry for the flaw tracer, built from Luigi task introspection. -Maps every pipeline stage to its output files, upstream dependencies, -and source code files. Derived from the Luigi task classes in -worker_plan_internal/plan/stages/. +Replaces the former hand-maintained static registry with data extracted +from the actual pipeline via extract_dag. The public API is unchanged: + - find_stage_by_filename(filename) -> StageInfo | None + - get_upstream_files(stage_name, output_dir) -> list[tuple[str, Path]] + - get_source_code_paths(stage_name) -> list[Path] """ from dataclasses import dataclass from pathlib import Path +from worker_plan_internal.extract_dag import extract_dag + # Base path for source code, relative to worker_plan/ _SOURCE_BASE = Path(__file__).resolve().parent.parent.parent # worker_plan/ @dataclass(frozen=True) class StageInfo: - """One pipeline stage.""" + """One pipeline node.""" name: str output_files: tuple[str, ...] primary_output: str # preferred file to read when checking for flaws - upstream_stages: tuple[str, ...] = () + depends_on: tuple[str, ...] = () source_code_files: tuple[str, ...] = () -# ── Complete pipeline registry ────────────────────────────────────────── +def _pick_primary_output(filenames: list[str]) -> str: + """Pick the best file to read when checking a stage for flaws. + + Preference: .md > .html > non-raw file > first file. 
+ """ + for ext in (".md", ".html"): + for f in filenames: + if f.endswith(ext): + return f + non_raw = [f for f in filenames if "_raw" not in f] + if non_raw: + return non_raw[0] + return filenames[0] if filenames else "" + + +def _build_registry() -> tuple[StageInfo, ...]: + """Build the registry from Luigi task introspection.""" + dag = extract_dag() + stages = [] + for node in dag["nodes"]: + output_files = tuple(node["output_files"]) + stages.append(StageInfo( + name=node["id"], + output_files=output_files, + primary_output=_pick_primary_output(node["output_files"]), + depends_on=tuple(node["depends_on"]), + source_code_files=tuple(node["source_files"]), + )) + return tuple(stages) + -STAGES: tuple[StageInfo, ...] = ( - # Phase 1: Initialization - StageInfo( - name="start_time", - output_files=("001-1-start_time.json",), - primary_output="001-1-start_time.json", - upstream_stages=(), - source_code_files=("worker_plan_internal/plan/stages/start_time.py",), - ), - StageInfo( - name="setup", - output_files=("001-2-plan.txt",), - primary_output="001-2-plan.txt", - upstream_stages=(), - source_code_files=("worker_plan_internal/plan/stages/setup.py",), - ), - # Phase 2: Input Validation & Strategy - StageInfo( - name="screen_planning_prompt", - output_files=("002-0-screen_planning_prompt.json", "002-0-screen_planning_prompt.md"), - primary_output="002-0-screen_planning_prompt.md", - upstream_stages=("setup",), - source_code_files=( - "worker_plan_internal/plan/stages/screen_planning_prompt.py", - "worker_plan_internal/diagnostics/screen_planning_prompt.py", - ), - ), - StageInfo( - name="extract_constraints", - output_files=("002-0-extract_constraints_raw.json", "002-0-extract_constraints.md"), - primary_output="002-0-extract_constraints.md", - upstream_stages=("setup",), - source_code_files=( - "worker_plan_internal/plan/stages/extract_constraints.py", - "worker_plan_internal/diagnostics/extract_constraints.py", - ), - ), - StageInfo( - name="redline_gate", - 
output_files=("002-1-redline_gate.json", "002-2-redline_gate.md"), - primary_output="002-2-redline_gate.md", - upstream_stages=("setup",), - source_code_files=( - "worker_plan_internal/plan/stages/redline_gate.py", - "worker_plan_internal/diagnostics/redline_gate.py", - ), - ), - StageInfo( - name="premise_attack", - output_files=("002-3-premise_attack.json", "002-4-premise_attack.md"), - primary_output="002-4-premise_attack.md", - upstream_stages=("setup",), - source_code_files=( - "worker_plan_internal/plan/stages/premise_attack.py", - "worker_plan_internal/diagnostics/premise_attack.py", - ), - ), - StageInfo( - name="identify_purpose", - output_files=("002-5-identify_purpose_raw.json", "002-6-identify_purpose.md"), - primary_output="002-6-identify_purpose.md", - upstream_stages=("setup",), - source_code_files=( - "worker_plan_internal/plan/stages/identify_purpose.py", - "worker_plan_internal/assume/identify_purpose.py", - ), - ), - StageInfo( - name="plan_type", - output_files=("002-7-plan_type_raw.json", "002-8-plan_type.md"), - primary_output="002-8-plan_type.md", - upstream_stages=("setup", "identify_purpose"), - source_code_files=( - "worker_plan_internal/plan/stages/plan_type.py", - "worker_plan_internal/assume/identify_plan_type.py", - ), - ), - StageInfo( - name="potential_levers", - output_files=("002-9-potential_levers_raw.json", "002-10-potential_levers.json"), - primary_output="002-10-potential_levers.json", - upstream_stages=("setup", "identify_purpose", "plan_type", "extract_constraints"), - source_code_files=( - "worker_plan_internal/plan/stages/potential_levers.py", - "worker_plan_internal/lever/identify_potential_levers.py", - ), - ), - StageInfo( - name="deduplicate_levers", - output_files=("002-11-deduplicated_levers_raw.json",), - primary_output="002-11-deduplicated_levers_raw.json", - upstream_stages=("setup", "identify_purpose", "plan_type", "potential_levers"), - source_code_files=( - 
"worker_plan_internal/plan/stages/deduplicate_levers.py", - "worker_plan_internal/lever/deduplicate_levers.py", - ), - ), - StageInfo( - name="enrich_levers", - output_files=("002-12-enriched_levers_raw.json",), - primary_output="002-12-enriched_levers_raw.json", - upstream_stages=("setup", "identify_purpose", "plan_type", "deduplicate_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/enrich_levers.py", - "worker_plan_internal/lever/enrich_potential_levers.py", - ), - ), - StageInfo( - name="focus_on_vital_few_levers", - output_files=("002-13-vital_few_levers_raw.json",), - primary_output="002-13-vital_few_levers_raw.json", - upstream_stages=("setup", "identify_purpose", "plan_type", "enrich_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/focus_on_vital_few_levers.py", - "worker_plan_internal/lever/focus_on_vital_few_levers.py", - ), - ), - StageInfo( - name="strategic_decisions_markdown", - output_files=("002-14-strategic_decisions.md",), - primary_output="002-14-strategic_decisions.md", - upstream_stages=("enrich_levers", "focus_on_vital_few_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/strategic_decisions_markdown.py", - "worker_plan_internal/lever/strategic_decisions_markdown.py", - ), - ), - StageInfo( - name="candidate_scenarios", - output_files=("002-15-candidate_scenarios_raw.json", "002-16-candidate_scenarios.json"), - primary_output="002-16-candidate_scenarios.json", - upstream_stages=("setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/candidate_scenarios.py", - "worker_plan_internal/lever/candidate_scenarios.py", - ), - ), - StageInfo( - name="select_scenario", - output_files=("002-17-selected_scenario_raw.json", "002-18-selected_scenario.json"), - primary_output="002-18-selected_scenario.json", - upstream_stages=("setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers", "candidate_scenarios"), - 
source_code_files=( - "worker_plan_internal/plan/stages/select_scenario.py", - "worker_plan_internal/lever/select_scenario.py", - ), - ), - StageInfo( - name="scenarios_markdown", - output_files=("002-19-scenarios.md",), - primary_output="002-19-scenarios.md", - upstream_stages=("candidate_scenarios", "select_scenario"), - source_code_files=( - "worker_plan_internal/plan/stages/scenarios_markdown.py", - "worker_plan_internal/lever/scenarios_markdown.py", - ), - ), - # Constraint checkers - StageInfo( - name="potential_levers_constraint", - output_files=("002-10-potential_levers_constraint.json",), - primary_output="002-10-potential_levers_constraint.json", - upstream_stages=("extract_constraints", "potential_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ), - ), - StageInfo( - name="deduplicated_levers_constraint", - output_files=("002-11-deduplicated_levers_constraint.json",), - primary_output="002-11-deduplicated_levers_constraint.json", - upstream_stages=("extract_constraints", "deduplicate_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ), - ), - StageInfo( - name="enriched_levers_constraint", - output_files=("002-12-enriched_levers_constraint.json",), - primary_output="002-12-enriched_levers_constraint.json", - upstream_stages=("extract_constraints", "enrich_levers"), - source_code_files=( - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ), - ), - StageInfo( - name="vital_few_levers_constraint", - output_files=("002-13-vital_few_levers_constraint.json",), - primary_output="002-13-vital_few_levers_constraint.json", - upstream_stages=("extract_constraints", "focus_on_vital_few_levers"), - source_code_files=( - 
"worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ), - ), - StageInfo( - name="candidate_scenarios_constraint", - output_files=("002-16-candidate_scenarios_constraint.json",), - primary_output="002-16-candidate_scenarios_constraint.json", - upstream_stages=("extract_constraints", "candidate_scenarios"), - source_code_files=( - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ), - ), - StageInfo( - name="selected_scenario_constraint", - output_files=("002-18-selected_scenario_constraint.json",), - primary_output="002-18-selected_scenario_constraint.json", - upstream_stages=("extract_constraints", "select_scenario"), - source_code_files=( - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ), - ), - # Phase 3: Context & Assumptions - StageInfo( - name="physical_locations", - output_files=("002-20-physical_locations_raw.json", "002-21-physical_locations.md"), - primary_output="002-21-physical_locations.md", - upstream_stages=("setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown"), - source_code_files=( - "worker_plan_internal/plan/stages/physical_locations.py", - "worker_plan_internal/assume/physical_locations.py", - ), - ), - StageInfo( - name="currency_strategy", - output_files=("002-22-currency_strategy_raw.json", "002-23-currency_strategy.md"), - primary_output="002-23-currency_strategy.md", - upstream_stages=("setup", "identify_purpose", "plan_type", "physical_locations", "strategic_decisions_markdown", "scenarios_markdown"), - source_code_files=( - "worker_plan_internal/plan/stages/currency_strategy.py", - "worker_plan_internal/assume/currency_strategy.py", - ), - ), - StageInfo( - name="identify_risks", - output_files=("003-1-identify_risks_raw.json", "003-2-identify_risks.md"), - 
primary_output="003-2-identify_risks.md", - upstream_stages=("setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy"), - source_code_files=( - "worker_plan_internal/plan/stages/identify_risks.py", - "worker_plan_internal/assume/identify_risks.py", - ), - ), - StageInfo( - name="make_assumptions", - output_files=("003-3-make_assumptions_raw.json", "003-4-make_assumptions.json", "003-5-make_assumptions.md"), - primary_output="003-5-make_assumptions.md", - upstream_stages=("setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks"), - source_code_files=( - "worker_plan_internal/plan/stages/make_assumptions.py", - "worker_plan_internal/assume/make_assumptions.py", - ), - ), - StageInfo( - name="distill_assumptions", - output_files=("003-6-distill_assumptions_raw.json", "003-7-distill_assumptions.md"), - primary_output="003-7-distill_assumptions.md", - upstream_stages=("setup", "identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "make_assumptions"), - source_code_files=( - "worker_plan_internal/plan/stages/distill_assumptions.py", - "worker_plan_internal/assume/distill_assumptions.py", - ), - ), - StageInfo( - name="review_assumptions", - output_files=("003-8-review_assumptions_raw.json", "003-9-review_assumptions.md"), - primary_output="003-9-review_assumptions.md", - upstream_stages=("identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions"), - source_code_files=( - "worker_plan_internal/plan/stages/review_assumptions.py", - "worker_plan_internal/assume/review_assumptions.py", - ), - ), - StageInfo( - name="consolidate_assumptions_markdown", - output_files=("003-10-consolidate_assumptions_full.md", "003-11-consolidate_assumptions_short.md"), 
- primary_output="003-10-consolidate_assumptions_full.md", - upstream_stages=("identify_purpose", "plan_type", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions", "review_assumptions"), - source_code_files=( - "worker_plan_internal/plan/stages/consolidate_assumptions_markdown.py", - "worker_plan_internal/assume/shorten_markdown.py", - ), - ), - # Phase 4: Pre-Project Assessment & Project Plan - StageInfo( - name="pre_project_assessment", - output_files=("004-1-pre_project_assessment_raw.json", "004-2-pre_project_assessment.json"), - primary_output="004-2-pre_project_assessment.json", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown"), - source_code_files=( - "worker_plan_internal/plan/stages/pre_project_assessment.py", - "worker_plan_internal/expert/pre_project_assessment.py", - ), - ), - StageInfo( - name="project_plan", - output_files=("005-1-project_plan_raw.json", "005-2-project_plan.md"), - primary_output="005-2-project_plan.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment"), - source_code_files=( - "worker_plan_internal/plan/stages/project_plan.py", - "worker_plan_internal/plan/project_plan.py", - ), - ), - # Phase 5: Governance - StageInfo( - name="governance_phase1_audit", - output_files=("006-1-governance_phase1_audit_raw.json", "006-2-governance_phase1_audit.md"), - primary_output="006-2-governance_phase1_audit.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"), - source_code_files=( - "worker_plan_internal/plan/stages/governance_phase1_audit.py", - "worker_plan_internal/governance/governance_phase1_audit.py", - ), - ), - StageInfo( - name="governance_phase2_bodies", - output_files=("006-3-governance_phase2_bodies_raw.json", 
"006-4-governance_phase2_bodies.md"), - primary_output="006-4-governance_phase2_bodies.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit"), - source_code_files=( - "worker_plan_internal/plan/stages/governance_phase2_bodies.py", - "worker_plan_internal/governance/governance_phase2_bodies.py", - ), - ), - StageInfo( - name="governance_phase3_impl_plan", - output_files=("006-5-governance_phase3_impl_plan_raw.json", "006-6-governance_phase3_impl_plan.md"), - primary_output="006-6-governance_phase3_impl_plan.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies"), - source_code_files=( - "worker_plan_internal/plan/stages/governance_phase3_impl_plan.py", - "worker_plan_internal/governance/governance_phase3_impl_plan.py", - ), - ), - StageInfo( - name="governance_phase4_decision_escalation_matrix", - output_files=("006-7-governance_phase4_decision_escalation_matrix_raw.json", "006-8-governance_phase4_decision_escalation_matrix.md"), - primary_output="006-8-governance_phase4_decision_escalation_matrix.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan"), - source_code_files=( - "worker_plan_internal/plan/stages/governance_phase4_decision_escalation_matrix.py", - "worker_plan_internal/governance/governance_phase4_decision_escalation_matrix.py", - ), - ), - StageInfo( - name="governance_phase5_monitoring_progress", - output_files=("006-9-governance_phase5_monitoring_progress_raw.json", "006-10-governance_phase5_monitoring_progress.md"), - primary_output="006-10-governance_phase5_monitoring_progress.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", 
"consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix"), - source_code_files=( - "worker_plan_internal/plan/stages/governance_phase5_monitoring_progress.py", - "worker_plan_internal/governance/governance_phase5_monitoring_progress.py", - ), - ), - StageInfo( - name="governance_phase6_extra", - output_files=("006-11-governance_phase6_extra_raw.json", "006-12-governance_phase6_extra.md"), - primary_output="006-12-governance_phase6_extra.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress"), - source_code_files=( - "worker_plan_internal/plan/stages/governance_phase6_extra.py", - "worker_plan_internal/governance/governance_phase6_extra.py", - ), - ), - StageInfo( - name="consolidate_governance", - output_files=("006-13-consolidate_governance.md",), - primary_output="006-13-consolidate_governance.md", - upstream_stages=("governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress", "governance_phase6_extra"), - source_code_files=("worker_plan_internal/plan/stages/consolidate_governance.py",), - ), - # Phase 6: Resources & Team - StageInfo( - name="related_resources", - output_files=("007-1-related_resources_raw.json", "007-8-related_resources.md"), - primary_output="007-8-related_resources.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"), - source_code_files=( - "worker_plan_internal/plan/stages/related_resources.py", - "worker_plan_internal/plan/related_resources.py", - ), - ), - StageInfo( - 
name="find_team_members", - output_files=("008-1-find_team_members_raw.json", "008-2-find_team_members.json"), - primary_output="008-2-find_team_members.json", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/find_team_members.py", - "worker_plan_internal/team/find_team_members.py", - ), - ), - StageInfo( - name="enrich_team_contract_type", - output_files=("009-1-enrich_team_members_contract_type_raw.json", "009-2-enrich_team_members_contract_type.json"), - primary_output="009-2-enrich_team_members_contract_type.json", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "find_team_members", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/enrich_team_contract_type.py", - "worker_plan_internal/team/enrich_team_members_with_contract_type.py", - ), - ), - StageInfo( - name="enrich_team_background_story", - output_files=("010-1-enrich_team_members_background_story_raw.json", "010-2-enrich_team_members_background_story.json"), - primary_output="010-2-enrich_team_members_background_story.json", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_contract_type", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/enrich_team_background_story.py", - "worker_plan_internal/team/enrich_team_members_with_background_story.py", - ), - ), - StageInfo( - name="enrich_team_environment_info", - output_files=("011-1-enrich_team_members_environment_info_raw.json", "011-2-enrich_team_members_environment_info.json"), - primary_output="011-2-enrich_team_members_environment_info.json", - upstream_stages=("setup", 
"strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_background_story", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/enrich_team_environment_info.py", - "worker_plan_internal/team/enrich_team_members_with_environment_info.py", - ), - ), - StageInfo( - name="review_team", - output_files=("012-review_team_raw.json",), - primary_output="012-review_team_raw.json", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_environment_info", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/review_team.py", - "worker_plan_internal/team/review_team.py", - ), - ), - StageInfo( - name="team_markdown", - output_files=("013-team.md",), - primary_output="013-team.md", - upstream_stages=("enrich_team_environment_info", "review_team"), - source_code_files=( - "worker_plan_internal/plan/stages/team_markdown.py", - "worker_plan_internal/team/team_markdown_document.py", - ), - ), - # Phase 7: Analysis & Experts - StageInfo( - name="swot_analysis", - output_files=("014-1-swot_analysis_raw.json", "014-2-swot_analysis.md"), - primary_output="014-2-swot_analysis.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "identify_purpose", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/swot_analysis.py", - "worker_plan_internal/swot/swot_analysis.py", - ), - ), - StageInfo( - name="expert_review", - output_files=("015-1-experts_raw.json", "015-2-experts.json", "016-2-expert_criticism.md"), - primary_output="016-2-expert_criticism.md", - upstream_stages=("setup", "strategic_decisions_markdown", "scenarios_markdown", "pre_project_assessment", "project_plan", "swot_analysis"), - 
source_code_files=( - "worker_plan_internal/plan/stages/expert_review.py", - "worker_plan_internal/expert/expert_finder.py", - "worker_plan_internal/expert/expert_criticism.py", - ), - ), - # Phase 8: Data & Documents - StageInfo( - name="data_collection", - output_files=("017-1-data_collection_raw.json", "017-2-data_collection.md"), - primary_output="017-2-data_collection.md", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"), - source_code_files=( - "worker_plan_internal/plan/stages/data_collection.py", - "worker_plan_internal/plan/data_collection.py", - ), - ), - StageInfo( - name="identify_documents", - output_files=("017-3-identified_documents_raw.json", "017-4-identified_documents.md", "017-5-identified_documents_to_find.json", "017-6-identified_documents_to_create.json"), - primary_output="017-4-identified_documents.md", - upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"), - source_code_files=( - "worker_plan_internal/plan/stages/identify_documents.py", - "worker_plan_internal/document/identify_documents.py", - ), - ), - StageInfo( - name="filter_documents_to_find", - output_files=("017-7-filter_documents_to_find_raw.json", "017-8-filter_documents_to_find_clean.json"), - primary_output="017-8-filter_documents_to_find_clean.json", - upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"), - source_code_files=( - "worker_plan_internal/plan/stages/filter_documents_to_find.py", - "worker_plan_internal/document/filter_documents_to_find.py", - ), - ), - StageInfo( - name="filter_documents_to_create", - 
output_files=("017-9-filter_documents_to_create_raw.json", "017-10-filter_documents_to_create_clean.json"), - primary_output="017-10-filter_documents_to_create_clean.json", - upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"), - source_code_files=( - "worker_plan_internal/plan/stages/filter_documents_to_create.py", - "worker_plan_internal/document/filter_documents_to_create.py", - ), - ), - StageInfo( - name="draft_documents_to_find", - output_files=("017-12-draft_documents_to_find.json",), - primary_output="017-12-draft_documents_to_find.json", - upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_find"), - source_code_files=( - "worker_plan_internal/plan/stages/draft_documents_to_find.py", - "worker_plan_internal/document/draft_document_to_find.py", - ), - ), - StageInfo( - name="draft_documents_to_create", - output_files=("017-14-draft_documents_to_create.json",), - primary_output="017-14-draft_documents_to_create.json", - upstream_stages=("identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_create"), - source_code_files=( - "worker_plan_internal/plan/stages/draft_documents_to_create.py", - "worker_plan_internal/document/draft_document_to_create.py", - ), - ), - StageInfo( - name="markdown_documents", - output_files=("017-15-documents_to_create_and_find.md",), - primary_output="017-15-documents_to_create_and_find.md", - upstream_stages=("draft_documents_to_create", "draft_documents_to_find"), - source_code_files=( - "worker_plan_internal/plan/stages/markdown_documents.py", - "worker_plan_internal/document/markdown_with_document.py", - ), - ), - # Phase 9: WBS - StageInfo( - name="create_wbs_level1", - output_files=("018-1-wbs_level1_raw.json", 
"018-2-wbs_level1.json", "018-3-wbs_level1_project_title.json"), - primary_output="018-2-wbs_level1.json", - upstream_stages=("project_plan",), - source_code_files=( - "worker_plan_internal/plan/stages/create_wbs_level1.py", - "worker_plan_internal/plan/create_wbs_level1.py", - ), - ), - StageInfo( - name="create_wbs_level2", - output_files=("018-4-wbs_level2_raw.json", "018-5-wbs_level2.json"), - primary_output="018-5-wbs_level2.json", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level1", "data_collection"), - source_code_files=( - "worker_plan_internal/plan/stages/create_wbs_level2.py", - "worker_plan_internal/plan/create_wbs_level2.py", - ), - ), - StageInfo( - name="wbs_project_level1_and_level2", - output_files=("019-wbs_project_level1_and_level2.json",), - primary_output="019-wbs_project_level1_and_level2.json", - upstream_stages=("create_wbs_level1", "create_wbs_level2"), - source_code_files=( - "worker_plan_internal/plan/stages/wbs_project_level1_and_level2.py", - "worker_plan_internal/wbs/wbs_populate.py", - ), - ), - # Phase 10: Pitch & Dependencies - StageInfo( - name="create_pitch", - output_files=("020-1-pitch_raw.json",), - primary_output="020-1-pitch_raw.json", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "project_plan", "wbs_project_level1_and_level2", "related_resources"), - source_code_files=( - "worker_plan_internal/plan/stages/create_pitch.py", - "worker_plan_internal/pitch/create_pitch.py", - ), - ), - StageInfo( - name="convert_pitch_to_markdown", - output_files=("020-2-pitch_to_markdown_raw.json", "020-3-pitch.md"), - primary_output="020-3-pitch.md", - upstream_stages=("create_pitch",), - source_code_files=( - "worker_plan_internal/plan/stages/convert_pitch_to_markdown.py", - "worker_plan_internal/pitch/convert_pitch_to_markdown.py", - ), - ), - StageInfo( - name="identify_task_dependencies", - output_files=("021-task_dependencies_raw.json",), - 
primary_output="021-task_dependencies_raw.json", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level2", "data_collection"), - source_code_files=( - "worker_plan_internal/plan/stages/identify_task_dependencies.py", - "worker_plan_internal/plan/identify_wbs_task_dependencies.py", - ), - ), - StageInfo( - name="estimate_task_durations", - output_files=("022-2-task_durations.json",), - primary_output="022-2-task_durations.json", - upstream_stages=("project_plan", "wbs_project_level1_and_level2"), - source_code_files=( - "worker_plan_internal/plan/stages/estimate_task_durations.py", - "worker_plan_internal/plan/estimate_wbs_task_durations.py", - ), - ), - # Phase 11: WBS Level 3 - StageInfo( - name="create_wbs_level3", - output_files=("023-2-wbs_level3.json",), - primary_output="023-2-wbs_level3.json", - upstream_stages=("project_plan", "wbs_project_level1_and_level2", "estimate_task_durations", "data_collection"), - source_code_files=( - "worker_plan_internal/plan/stages/create_wbs_level3.py", - "worker_plan_internal/plan/create_wbs_level3.py", - ), - ), - StageInfo( - name="wbs_project_level1_level2_level3", - output_files=("023-3-wbs_project_level1_and_level2_and_level3.json", "023-4-wbs_project_level1_and_level2_and_level3.csv"), - primary_output="023-3-wbs_project_level1_and_level2_and_level3.json", - upstream_stages=("wbs_project_level1_and_level2", "create_wbs_level3"), - source_code_files=( - "worker_plan_internal/plan/stages/wbs_project_level1_level2_level3.py", - "worker_plan_internal/wbs/wbs_populate.py", - ), - ), - # Phase 12: Schedule & Reviews - StageInfo( - name="create_schedule", - output_files=("026-2-schedule_gantt_dhtmlx.html", "026-3-schedule_gantt_machai.csv"), - primary_output="026-2-schedule_gantt_dhtmlx.html", - upstream_stages=("start_time", "create_wbs_level1", "identify_task_dependencies", "estimate_task_durations", "wbs_project_level1_level2_level3"), - source_code_files=( - 
"worker_plan_internal/plan/stages/create_schedule.py", - "worker_plan_internal/schedule/project_schedule_populator.py", - ), - ), - StageInfo( - name="review_plan", - output_files=("024-1-review_plan_raw.json", "024-2-review_plan.md"), - primary_output="024-2-review_plan.md", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3"), - source_code_files=( - "worker_plan_internal/plan/stages/review_plan.py", - "worker_plan_internal/plan/review_plan.py", - ), - ), - StageInfo( - name="executive_summary", - output_files=("025-1-executive_summary_raw.json", "025-2-executive_summary.md"), - primary_output="025-2-executive_summary.md", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3", "review_plan"), - source_code_files=( - "worker_plan_internal/plan/stages/executive_summary.py", - "worker_plan_internal/plan/executive_summary.py", - ), - ), - StageInfo( - name="questions_and_answers", - output_files=("027-1-questions_and_answers_raw.json", "027-2-questions_and_answers.md", "027-3-questions_and_answers.html"), - primary_output="027-2-questions_and_answers.md", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan"), - source_code_files=( - "worker_plan_internal/plan/stages/questions_and_answers.py", - 
"worker_plan_internal/questions_answers/questions_answers.py", - ), - ), - StageInfo( - name="premortem", - output_files=("028-1-premortem_raw.json", "028-2-premortem.md"), - primary_output="028-2-premortem.md", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers"), - source_code_files=( - "worker_plan_internal/plan/stages/premortem.py", - "worker_plan_internal/diagnostics/premortem.py", - ), - ), - StageInfo( - name="self_audit", - output_files=("029-1-self_audit_raw.json", "029-2-self_audit.md"), - primary_output="029-2-self_audit.md", - upstream_stages=("strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers", "premortem"), - source_code_files=( - "worker_plan_internal/plan/stages/self_audit.py", - "worker_plan_internal/self_audit/self_audit.py", - ), - ), - # Phase 13: Final Report - StageInfo( - name="report", - output_files=("030-report.html",), - primary_output="030-report.html", - upstream_stages=( - "setup", "screen_planning_prompt", "redline_gate", "premise_attack", - "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", - "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", - "convert_pitch_to_markdown", "data_collection", "markdown_documents", - "create_wbs_level1", "wbs_project_level1_level2_level3", "expert_review", - "project_plan", "review_plan", "executive_summary", "create_schedule", - 
"questions_and_answers", "premortem", "self_audit", - ), - source_code_files=( - "worker_plan_internal/plan/stages/report.py", - "worker_plan_internal/report/report_generator.py", - ), - ), -) +# ── Build once at import time ────────────────────────────────────────── -# ── Lookup indexes (built once at import time) ────────────────────────── +STAGES: tuple[StageInfo, ...] = _build_registry() _STAGE_BY_NAME: dict[str, StageInfo] = {s.name: s for s in STAGES} _STAGE_BY_FILENAME: dict[str, StageInfo] = {} @@ -762,7 +80,7 @@ def get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Pat return [] result = [] - for upstream_name in stage.upstream_stages: + for upstream_name in stage.depends_on: upstream_stage = _STAGE_BY_NAME.get(upstream_name) if upstream_stage is None: continue diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py index d3c0abe67..c6c930ace 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -20,7 +20,7 @@ def test_all_stages_have_required_fields(self): self.assertIsInstance(stage.name, str, f"{stage.name} name") self.assertIsInstance(stage.output_files, tuple, f"{stage.name} output_files") self.assertTrue(len(stage.output_files) > 0, f"{stage.name} has no output_files") - self.assertIsInstance(stage.upstream_stages, tuple, f"{stage.name} upstream_stages") + self.assertIsInstance(stage.depends_on, tuple, f"{stage.name} depends_on") self.assertIsInstance(stage.source_code_files, tuple, f"{stage.name} source_code_files") self.assertIsInstance(stage.primary_output, str, f"{stage.name} primary_output") self.assertIn(stage.primary_output, stage.output_files, f"{stage.name} primary_output not in output_files") @@ -32,7 +32,7 @@ def test_no_duplicate_stage_names(self): def test_upstream_references_are_valid(self): valid_names = {s.name for s 
in STAGES} for stage in STAGES: - for upstream in stage.upstream_stages: + for upstream in stage.depends_on: self.assertIn(upstream, valid_names, f"{stage.name} references unknown upstream '{upstream}'") From dffb9afeda82bd39193222c56123c0cfb0ed0406 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 7 Apr 2026 19:35:26 +0200 Subject: [PATCH 25/37] refactor: rename StageInfo to NodeInfo Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/flaw_tracer/registry.py | 16 ++++++++-------- .../flaw_tracer/tests/test_registry.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index a664a8142..e3280abc7 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -3,7 +3,7 @@ Replaces the former hand-maintained static registry with data extracted from the actual pipeline via extract_dag. The public API is unchanged: - - find_stage_by_filename(filename) -> StageInfo | None + - find_stage_by_filename(filename) -> NodeInfo | None - get_upstream_files(stage_name, output_dir) -> list[tuple[str, Path]] - get_source_code_paths(stage_name) -> list[Path] """ @@ -17,7 +17,7 @@ @dataclass(frozen=True) -class StageInfo: +class NodeInfo: """One pipeline node.""" name: str output_files: tuple[str, ...] 
@@ -41,13 +41,13 @@ def _pick_primary_output(filenames: list[str]) -> str: return filenames[0] if filenames else "" -def _build_registry() -> tuple[StageInfo, ...]: +def _build_registry() -> tuple[NodeInfo, ...]: """Build the registry from Luigi task introspection.""" dag = extract_dag() stages = [] for node in dag["nodes"]: output_files = tuple(node["output_files"]) - stages.append(StageInfo( + stages.append(NodeInfo( name=node["id"], output_files=output_files, primary_output=_pick_primary_output(node["output_files"]), @@ -59,16 +59,16 @@ def _build_registry() -> tuple[StageInfo, ...]: # ── Build once at import time ────────────────────────────────────────── -STAGES: tuple[StageInfo, ...] = _build_registry() +STAGES: tuple[NodeInfo, ...] = _build_registry() -_STAGE_BY_NAME: dict[str, StageInfo] = {s.name: s for s in STAGES} -_STAGE_BY_FILENAME: dict[str, StageInfo] = {} +_STAGE_BY_NAME: dict[str, NodeInfo] = {s.name: s for s in STAGES} +_STAGE_BY_FILENAME: dict[str, NodeInfo] = {} for _stage in STAGES: for _fname in _stage.output_files: _STAGE_BY_FILENAME[_fname] = _stage -def find_stage_by_filename(filename: str) -> StageInfo | None: +def find_stage_by_filename(filename: str) -> NodeInfo | None: """Given an output filename, return the stage that produced it.""" return _STAGE_BY_FILENAME.get(filename) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py index c6c930ace..f1247454a 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -3,7 +3,7 @@ from pathlib import Path from tempfile import TemporaryDirectory from worker_plan_internal.flaw_tracer.registry import ( - StageInfo, + NodeInfo, STAGES, find_stage_by_filename, get_upstream_files, @@ -11,7 +11,7 @@ ) -class TestStageInfo(unittest.TestCase): +class TestNodeInfo(unittest.TestCase): def 
test_stages_is_nonempty(self): self.assertGreater(len(STAGES), 40) From 7e699b442fe8890384463303394f37f1214aaac1 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 7 Apr 2026 19:52:53 +0200 Subject: [PATCH 26/37] =?UTF-8?q?refactor:=20rename=20stage=E2=86=92node?= =?UTF-8?q?=20throughout=20flaw=5Ftracer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - StageInfo → NodeInfo (previous commit) - STAGES → NODES - find_stage_by_filename → find_node_by_filename - TraceEntry.stage → TraceEntry.node - OriginInfo.stage → OriginInfo.node - origin_stage → origin_node - All local variables, comments, docstrings, prompts, and test data Co-Authored-By: Claude Opus 4.6 (1M context) --- .../flaw_tracer/AGENTS.md | 8 +- .../flaw_tracer/README.md | 10 +- .../flaw_tracer/__main__.py | 2 +- .../flaw_tracer/output.py | 26 +++--- .../flaw_tracer/prompts.py | 10 +- .../flaw_tracer/registry.py | 70 +++++++------- .../flaw_tracer/tests/test_output.py | 18 ++-- .../flaw_tracer/tests/test_prompts.py | 2 +- .../flaw_tracer/tests/test_registry.py | 92 +++++++++---------- .../flaw_tracer/tests/test_tracer.py | 20 ++-- .../flaw_tracer/tracer.py | 76 +++++++-------- 11 files changed, 167 insertions(+), 167 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md index e132d67f9..9e49a4619 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -2,12 +2,12 @@ ## What works well -- **DAG traversal is correct.** The registry maps all 70 stages, upstream resolution works, dedup prevents redundant checks, depth limiting works. +- **DAG traversal is correct.** The registry maps all 70 nodes, upstream resolution works, dedup prevents redundant checks, depth limiting works. 
- **Phase 1 anchors to the user's flaw.** The user's specific flaw is always the first result, with additional flaws limited to the same problem family. - **Upstream checks require causal links.** The prompt requires the LLM to explain *how* upstream content caused the downstream flaw, not just topical overlap. This produces tighter, more accurate traces. - **Phase 3 classifies root causes.** Each origin is categorized as `prompt_fixable`, `domain_complexity`, or `missing_input`. Verified: the India census caste enumeration flaw is correctly classified as `domain_complexity`, while the workforce feasibility flaw is `prompt_fixable`. - **Evidence quotes are concise.** Both Phase 1 and Phase 2 prompts instruct the LLM to keep quotes under 200 characters. -- **Source code filenames are disambiguated.** Shows `stages/identify_purpose.py` and `assume/identify_purpose.py` instead of duplicate bare filenames. +- **Source code filenames are disambiguated.** Shows `nodes/identify_purpose.py` and `assume/identify_purpose.py` instead of duplicate bare filenames. - **Depth sorting is useful.** Deepest root causes appear first, matching the user's intent of finding the earliest upstream origin. - **Events.jsonl enables live monitoring.** Users can `tail -f events.jsonl` to watch progress instead of waiting blindly. - **Focused output.** A typical run finds 2-3 flaws in the same problem family and makes 15-30 LLM calls (down from 17 flaws / 153 calls before prompt improvements). @@ -37,7 +37,7 @@ Before the fix, every suggestion was "modify the system prompt" even when the re ### Duplicate source code filenames (was LOW, fixed) -Source code paths now include the parent directory (`stages/identify_purpose.py`) to disambiguate files with the same name in different packages. +Source code paths now include the parent directory (`nodes/identify_purpose.py`) to disambiguate files with the same name in different packages. ## Open issues @@ -47,7 +47,7 @@ This is LLM judging LLM output. 
Every upstream check is a subjective call. Two r ### MEDIUM: Static registry will drift -The DAG mapping in `registry.py` is a hand-maintained copy of the pipeline topology. Adding, removing, or renaming stages requires a manual update — the registry won't auto-detect changes. If the registry falls out of sync, traces will silently miss stages or follow wrong paths. +The DAG mapping in `registry.py` is a hand-maintained copy of the pipeline topology. Adding, removing, or renaming nodes requires a manual update — the registry won't auto-detect changes. If the registry falls out of sync, traces will silently miss nodes or follow wrong paths. **Fix direction:** Generate the registry from Luigi task introspection at startup, or add a CI check that compares the registry against the actual task classes. diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/flaw_tracer/README.md index a97c24040..e64e163ab 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/README.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/README.md @@ -4,13 +4,13 @@ Root-cause analysis tool for PlanExe reports. Given a flaw observed in a pipelin ## How it works -PlanExe runs a DAG of ~70 tasks. Each task reads upstream files, calls an LLM, and writes output files (prefixed `001-` through `030-`). Flaws introduced early propagate downstream into later stages and the final report. +PlanExe runs a DAG of ~70 tasks. Each task reads upstream files, calls an LLM, and writes output files (prefixed `001-` through `030-`). Flaws introduced early propagate downstream into later nodes and the final report. The flaw tracer performs a recursive depth-first search: 1. **Phase 1 — Identify flaws.** Reads the starting file and locates the specific flaw you described, plus any closely related flaws in the same problem family. -2. 
**Phase 2 — Trace upstream.** For each flaw, walks upstream through the DAG one hop at a time, asking the LLM whether the flaw was *caused by* content in each input file (requires causal link, not just topical overlap). Continues until it finds a stage where the flaw exists in the output but not in any inputs. -3. **Phase 3 — Analyze source code and classify.** At the origin stage, reads the Python source code and classifies the root cause: +2. **Phase 2 — Trace upstream.** For each flaw, walks upstream through the DAG one hop at a time, asking the LLM whether the flaw was *caused by* content in each input file (requires causal link, not just topical overlap). Continues until it finds a node where the flaw exists in the output but not in any inputs. +3. **Phase 3 — Analyze source code and classify.** At the origin node, reads the Python source code and classifies the root cause: - **Prompt fixable** — the prompt has a gap that can be fixed by editing it - **Domain complexity** — the topic is inherently uncertain or contentious, no prompt change resolves it - **Missing input** — the user's plan prompt didn't provide enough detail @@ -111,7 +111,7 @@ A typical run finds 2-3 focused flaws and makes 15-30 LLM calls. ## Tips - **Start from `029-2-self_audit.md`.** This file already contains identified issues, so you're tracing *known* problems upstream rather than asking the LLM to find flaws from scratch. -- **Trust the trace chains more than the suggestions.** The upstream path (which stages the flaw passed through) is mechanically grounded in the DAG. The suggestions are LLM opinions — useful starting points, not patches. +- **Trust the trace chains more than the suggestions.** The upstream path (which nodes the flaw passed through) is mechanically grounded in the DAG. The suggestions are LLM opinions — useful starting points, not patches. - **Check the category before acting.** If the origin is `domain_complexity`, don't spend time tweaking the prompt. 
If it's `prompt_fixable`, the suggestion is likely actionable. - **Results are non-deterministic.** This is LLM judging LLM output. Two runs on the same input may produce slightly different traces. If a finding matters, run it twice. @@ -119,7 +119,7 @@ A typical run finds 2-3 focused flaws and makes 15-30 LLM calls. - **LLM subjectivity.** Every hop in the trace is a judgment call by the LLM ("did this upstream file cause the downstream flaw?"). The causal-link requirement helps, but it's still one LLM's opinion. - **First-match-wins.** When a flaw has precursors in multiple parallel upstream branches, only the first branch found is followed. Real flaws often have multiple contributing causes. -- **Static registry.** The DAG mapping is hand-maintained in `registry.py`. Adding, removing, or renaming pipeline stages requires a manual registry update — it won't auto-detect changes. +- **Static registry.** The DAG mapping is hand-maintained in `registry.py`. Adding, removing, or renaming pipeline nodes requires a manual registry update — it won't auto-detect changes. - **Text-only.** The tracer can only catch flaws that leave textual evidence in intermediary files. Timing issues, model-specific quirks, or structural DAG problems are invisible to it. - **Diagnostic, not prescriptive.** It tells you *where* and *why*, but someone still has to decide what to do about it. 
diff --git a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py b/worker_plan/worker_plan_internal/flaw_tracer/__main__.py index 3d47ccfb6..e9fadccff 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/__main__.py @@ -99,7 +99,7 @@ def main() -> None: print(f"\nFlaws found: {len(result.flaws)}", file=sys.stderr) if result.flaws: deepest = max(result.flaws, key=lambda f: f.depth) - print(f"Deepest origin: {deepest.origin_stage} (depth {deepest.depth})", file=sys.stderr) + print(f"Deepest origin: {deepest.origin_node} (depth {deepest.depth})", file=sys.stderr) print(f"LLM calls made: {result.llm_calls_made}", file=sys.stderr) print(f"\nReports written:", file=sys.stderr) print(f" JSON: {json_path}", file=sys.stderr) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/output.py b/worker_plan/worker_plan_internal/flaw_tracer/output.py index 205a28964..dfec61856 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/output.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/output.py @@ -19,14 +19,14 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: "flaws": [], "summary": { "total_flaws": len(result.flaws), - "deepest_origin_stage": None, + "deepest_origin_node": None, "deepest_origin_depth": 0, "llm_calls_made": result.llm_calls_made, }, } max_depth = 0 - deepest_stage = None + deepest_node = None for flaw in result.flaws: flaw_data = { @@ -36,7 +36,7 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: "starting_evidence": flaw.starting_evidence, "trace": [ { - "stage": entry.stage, + "node": entry.node, "file": entry.file, "evidence": entry.evidence, "is_origin": entry.is_origin, @@ -50,7 +50,7 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: if flaw.origin: flaw_data["origin"] = { - "stage": flaw.origin.stage, + "node": flaw.origin.node, "file": flaw.origin.file, "source_code_files": 
flaw.origin.source_code_files, "category": flaw.origin.category, @@ -60,12 +60,12 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: if flaw.depth > max_depth: max_depth = flaw.depth - deepest_stage = flaw.origin_stage + deepest_node = flaw.origin_node data["flaws"].append(flaw_data) data["flaws"].sort(key=lambda f: f["depth"], reverse=True) - data["summary"]["deepest_origin_stage"] = deepest_stage + data["summary"]["deepest_origin_node"] = deepest_node data["summary"]["deepest_origin_depth"] = max_depth output_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") @@ -81,7 +81,7 @@ def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: if result.flaws: deepest = max(result.flaws, key=lambda f: f.depth) - lines.append(f"**Deepest origin:** {deepest.origin_stage} (depth {deepest.depth})") + lines.append(f"**Deepest origin:** {deepest.origin_node} (depth {deepest.depth})") lines.append(f"**LLM calls:** {result.llm_calls_made}") lines.append("") @@ -93,10 +93,10 @@ def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: lines.append("") # Trace chain summary - stage_names = [entry.stage for entry in flaw.trace] + node_names = [entry.node for entry in flaw.trace] chain_parts = [] - for name in stage_names: - if name == flaw.origin_stage: + for name in node_names: + if name == flaw.origin_node: chain_parts.append(f"**{name}** (origin)") else: chain_parts.append(name) @@ -108,12 +108,12 @@ def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: lines.append("") # Trace table - lines.append("| Stage | File | Evidence |") + lines.append("| Node | File | Evidence |") lines.append("|-------|------|----------|") for entry in flaw.trace: - stage_cell = f"**{entry.stage}**" if entry.is_origin else entry.stage + node_cell = f"**{entry.node}**" if entry.is_origin else entry.node evidence_cell = _escape_table_cell(entry.evidence) - lines.append(f"| 
{stage_cell} | {entry.file} | {evidence_cell} |") + lines.append(f"| {node_cell} | {entry.file} | {evidence_cell} |") lines.append("") # Origin analysis diff --git a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py index 600d59d77..b510970e8 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/prompts.py @@ -29,7 +29,7 @@ class UpstreamCheckResult(BaseModel): class SourceCodeAnalysisResult(BaseModel): - """Result of analyzing source code at a flaw's origin stage.""" + """Result of analyzing source code at a flaw's origin node.""" category: Literal["prompt_fixable", "domain_complexity", "missing_input"] = Field( description=( "prompt_fixable: the prompt forgot to ask for something or has a gap that can be fixed by editing the prompt. " @@ -88,7 +88,7 @@ def build_upstream_check_messages( system = ( "You are tracing a flaw through a project planning pipeline to find where it originated.\n" "A downstream file contains a flaw. You are examining an upstream file that was an input " - "to the stage that produced the flawed output.\n\n" + "to the node that produced the flawed output.\n\n" "Determine if this upstream file CAUSED or CONTRIBUTED to the downstream flaw.\n" "This means the upstream file contains content that was carried forward, transformed, " "or amplified into the downstream flaw. Merely discussing a related topic is NOT enough.\n\n" @@ -120,8 +120,8 @@ def build_source_code_analysis_messages( source_code_contents: list of (filename, content) tuples """ system = ( - "A flaw was introduced at this pipeline stage. The flaw exists in its output " - "but NOT in any of its inputs, so this stage created it.\n\n" + "A flaw was introduced at this pipeline node. 
The flaw exists in its output " + "but NOT in any of its inputs, so this node created it.\n\n" "First, classify the root cause into one of three categories:\n" "- prompt_fixable: The prompt has a gap or oversight that can be fixed by editing " "the prompt text. Example: the prompt asks for budget estimates but doesn't require " @@ -131,7 +131,7 @@ def build_source_code_analysis_messages( "Example: caste enumeration in India is politically contentious regardless of how " "the prompt is worded.\n" "- missing_input: The user's original plan description didn't provide enough detail " - "for this stage to produce quality output. Example: the plan says 'open a shop' " + "for this node to produce quality output. Example: the plan says 'open a shop' " "without specifying location, budget, or target market.\n\n" "Then examine the source code to identify the specific cause. Be specific — point " "to lines or prompt phrases. Focus on the system prompt text and data transformation logic." diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index e3280abc7..66bf7fdec 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -3,9 +3,9 @@ Replaces the former hand-maintained static registry with data extracted from the actual pipeline via extract_dag. 
The public API is unchanged: - - find_stage_by_filename(filename) -> NodeInfo | None - - get_upstream_files(stage_name, output_dir) -> list[tuple[str, Path]] - - get_source_code_paths(stage_name) -> list[Path] + - find_node_by_filename(filename) -> NodeInfo | None + - get_upstream_files(node_name, output_dir) -> list[tuple[str, Path]] + - get_source_code_paths(node_name) -> list[Path] """ from dataclasses import dataclass from pathlib import Path @@ -27,7 +27,7 @@ class NodeInfo: def _pick_primary_output(filenames: list[str]) -> str: - """Pick the best file to read when checking a stage for flaws. + """Pick the best file to read when checking a node for flaws. Preference: .md > .html > non-raw file > first file. """ @@ -44,55 +44,55 @@ def _pick_primary_output(filenames: list[str]) -> str: def _build_registry() -> tuple[NodeInfo, ...]: """Build the registry from Luigi task introspection.""" dag = extract_dag() - stages = [] - for node in dag["nodes"]: - output_files = tuple(node["output_files"]) - stages.append(NodeInfo( - name=node["id"], + nodes = [] + for entry in dag["nodes"]: + output_files = tuple(entry["output_files"]) + nodes.append(NodeInfo( + name=entry["id"], output_files=output_files, - primary_output=_pick_primary_output(node["output_files"]), - depends_on=tuple(node["depends_on"]), - source_code_files=tuple(node["source_files"]), + primary_output=_pick_primary_output(entry["output_files"]), + depends_on=tuple(entry["depends_on"]), + source_code_files=tuple(entry["source_files"]), )) - return tuple(stages) + return tuple(nodes) # ── Build once at import time ────────────────────────────────────────── -STAGES: tuple[NodeInfo, ...] = _build_registry() +NODES: tuple[NodeInfo, ...] 
= _build_registry() -_STAGE_BY_NAME: dict[str, NodeInfo] = {s.name: s for s in STAGES} -_STAGE_BY_FILENAME: dict[str, NodeInfo] = {} -for _stage in STAGES: - for _fname in _stage.output_files: - _STAGE_BY_FILENAME[_fname] = _stage +_NODE_BY_NAME: dict[str, NodeInfo] = {n.name: n for n in NODES} +_NODE_BY_FILENAME: dict[str, NodeInfo] = {} +for _node in NODES: + for _fname in _node.output_files: + _NODE_BY_FILENAME[_fname] = _node -def find_stage_by_filename(filename: str) -> NodeInfo | None: - """Given an output filename, return the stage that produced it.""" - return _STAGE_BY_FILENAME.get(filename) +def find_node_by_filename(filename: str) -> NodeInfo | None: + """Given an output filename, return the node that produced it.""" + return _NODE_BY_FILENAME.get(filename) -def get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Path]]: - """Return (stage_name, file_path) pairs for upstream stages whose primary output exists on disk.""" - stage = _STAGE_BY_NAME.get(stage_name) - if stage is None: +def get_upstream_files(node_name: str, output_dir: Path) -> list[tuple[str, Path]]: + """Return (node_name, file_path) pairs for upstream nodes whose primary output exists on disk.""" + node = _NODE_BY_NAME.get(node_name) + if node is None: return [] result = [] - for upstream_name in stage.depends_on: - upstream_stage = _STAGE_BY_NAME.get(upstream_name) - if upstream_stage is None: + for upstream_name in node.depends_on: + upstream_node = _NODE_BY_NAME.get(upstream_name) + if upstream_node is None: continue - primary_path = output_dir / upstream_stage.primary_output + primary_path = output_dir / upstream_node.primary_output if primary_path.exists(): result.append((upstream_name, primary_path)) return result -def get_source_code_paths(stage_name: str) -> list[Path]: - """Return absolute paths to source code files for a stage.""" - stage = _STAGE_BY_NAME.get(stage_name) - if stage is None: +def get_source_code_paths(node_name: str) -> list[Path]: + 
"""Return absolute paths to source code files for a node.""" + node = _NODE_BY_NAME.get(node_name) + if node is None: return [] - return [_SOURCE_BASE / f for f in stage.source_code_files] + return [_SOURCE_BASE / f for f in node.source_code_files] diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py index 5bfe535ae..00e326bea 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py @@ -26,13 +26,13 @@ def _make_sample_result() -> FlawTraceResult: severity="HIGH", starting_evidence="CZK 500,000", trace=[ - TraceEntry(stage="executive_summary", file="025-2-executive_summary.md", evidence="CZK 500,000", is_origin=False), - TraceEntry(stage="project_plan", file="005-2-project_plan.md", evidence="Budget: 500k", is_origin=False), - TraceEntry(stage="make_assumptions", file="003-5-make_assumptions.md", evidence="Assume budget of 500k", is_origin=True), + TraceEntry(node="executive_summary", file="025-2-executive_summary.md", evidence="CZK 500,000", is_origin=False), + TraceEntry(node="project_plan", file="005-2-project_plan.md", evidence="Budget: 500k", is_origin=False), + TraceEntry(node="make_assumptions", file="003-5-make_assumptions.md", evidence="Assume budget of 500k", is_origin=True), ], - origin_stage="make_assumptions", + origin_node="make_assumptions", origin=OriginInfo( - stage="make_assumptions", + node="make_assumptions", file="003-5-make_assumptions.md", source_code_files=["make_assumptions.py"], category="prompt_fixable", @@ -47,9 +47,9 @@ def _make_sample_result() -> FlawTraceResult: severity="MEDIUM", starting_evidence="growing Czech market", trace=[ - TraceEntry(stage="executive_summary", file="025-2-executive_summary.md", evidence="growing Czech market", is_origin=True), + TraceEntry(node="executive_summary", file="025-2-executive_summary.md", evidence="growing Czech 
market", is_origin=True), ], - origin_stage="executive_summary", + origin_node="executive_summary", depth=1, ), ], @@ -79,7 +79,7 @@ def test_json_contains_correct_summary(self): data = json.loads(output_path.read_text(encoding="utf-8")) summary = data["summary"] self.assertEqual(summary["total_flaws"], 2) - self.assertEqual(summary["deepest_origin_stage"], "make_assumptions") + self.assertEqual(summary["deepest_origin_node"], "make_assumptions") self.assertEqual(summary["deepest_origin_depth"], 3) self.assertEqual(summary["llm_calls_made"], 8) @@ -123,7 +123,7 @@ def test_markdown_contains_trace_table(self): write_markdown_report(result, output_path) content = output_path.read_text(encoding="utf-8") - self.assertIn("| Stage |", content) + self.assertIn("| Node |", content) self.assertIn("| File |", content) def test_empty_result_produces_valid_markdown(self): diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py index 3262e8372..9770ad894 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py @@ -118,7 +118,7 @@ def test_returns_chat_messages(self): flaw_description="Budget fabricated", evidence_quote="CZK 500,000", source_code_contents=[ - ("stages/make_assumptions.py", "class MakeAssumptionsTask: ..."), + ("nodes/make_assumptions.py", "class MakeAssumptionsTask: ..."), ("assume/make_assumptions.py", "def execute(llm, query): ..."), ], ) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py index f1247454a..76d110848 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -4,62 +4,62 @@ from tempfile import TemporaryDirectory from worker_plan_internal.flaw_tracer.registry import 
( NodeInfo, - STAGES, - find_stage_by_filename, + NODES, + find_node_by_filename, get_upstream_files, get_source_code_paths, ) class TestNodeInfo(unittest.TestCase): - def test_stages_is_nonempty(self): - self.assertGreater(len(STAGES), 40) - - def test_all_stages_have_required_fields(self): - for stage in STAGES: - self.assertIsInstance(stage.name, str, f"{stage.name} name") - self.assertIsInstance(stage.output_files, tuple, f"{stage.name} output_files") - self.assertTrue(len(stage.output_files) > 0, f"{stage.name} has no output_files") - self.assertIsInstance(stage.depends_on, tuple, f"{stage.name} depends_on") - self.assertIsInstance(stage.source_code_files, tuple, f"{stage.name} source_code_files") - self.assertIsInstance(stage.primary_output, str, f"{stage.name} primary_output") - self.assertIn(stage.primary_output, stage.output_files, f"{stage.name} primary_output not in output_files") - - def test_no_duplicate_stage_names(self): - names = [s.name for s in STAGES] + def test_nodes_is_nonempty(self): + self.assertGreater(len(NODES), 40) + + def test_all_nodes_have_required_fields(self): + for node in NODES: + self.assertIsInstance(node.name, str, f"{node.name} name") + self.assertIsInstance(node.output_files, tuple, f"{node.name} output_files") + self.assertTrue(len(node.output_files) > 0, f"{node.name} has no output_files") + self.assertIsInstance(node.depends_on, tuple, f"{node.name} depends_on") + self.assertIsInstance(node.source_code_files, tuple, f"{node.name} source_code_files") + self.assertIsInstance(node.primary_output, str, f"{node.name} primary_output") + self.assertIn(node.primary_output, node.output_files, f"{node.name} primary_output not in output_files") + + def test_no_duplicate_node_names(self): + names = [n.name for n in NODES] self.assertEqual(len(names), len(set(names))) def test_upstream_references_are_valid(self): - valid_names = {s.name for s in STAGES} - for stage in STAGES: - for upstream in stage.depends_on: - self.assertIn(upstream, 
valid_names, f"{stage.name} references unknown upstream '{upstream}'") + valid_names = {n.name for n in NODES} + for node in NODES: + for upstream in node.depends_on: + self.assertIn(upstream, valid_names, f"{node.name} references unknown upstream '{upstream}'") -class TestFindStageByFilename(unittest.TestCase): +class TestFindNodeByFilename(unittest.TestCase): def test_find_report(self): - stage = find_stage_by_filename("030-report.html") - self.assertIsNotNone(stage) - self.assertEqual(stage.name, "report") + node = find_node_by_filename("030-report.html") + self.assertIsNotNone(node) + self.assertEqual(node.name, "report") def test_find_potential_levers_clean(self): - stage = find_stage_by_filename("002-10-potential_levers.json") - self.assertIsNotNone(stage) - self.assertEqual(stage.name, "potential_levers") + node = find_node_by_filename("002-10-potential_levers.json") + self.assertIsNotNone(node) + self.assertEqual(node.name, "potential_levers") def test_find_potential_levers_raw(self): - stage = find_stage_by_filename("002-9-potential_levers_raw.json") - self.assertIsNotNone(stage) - self.assertEqual(stage.name, "potential_levers") + node = find_node_by_filename("002-9-potential_levers_raw.json") + self.assertIsNotNone(node) + self.assertEqual(node.name, "potential_levers") def test_find_executive_summary(self): - stage = find_stage_by_filename("025-2-executive_summary.md") - self.assertIsNotNone(stage) - self.assertEqual(stage.name, "executive_summary") + node = find_node_by_filename("025-2-executive_summary.md") + self.assertIsNotNone(node) + self.assertEqual(node.name, "executive_summary") def test_unknown_filename_returns_none(self): - stage = find_stage_by_filename("zzz-unknown.txt") - self.assertIsNone(stage) + node = find_node_by_filename("zzz-unknown.txt") + self.assertIsNone(node) class TestGetUpstreamFiles(unittest.TestCase): @@ -78,11 +78,11 @@ def test_potential_levers_upstream(self): (output_dir / 
"002-0-extract_constraints.md").write_text("constraints", encoding="utf-8") result = get_upstream_files("potential_levers", output_dir) - stage_names = [name for name, _ in result] - self.assertIn("setup", stage_names) - self.assertIn("identify_purpose", stage_names) - self.assertIn("plan_type", stage_names) - self.assertIn("extract_constraints", stage_names) + node_names = [name for name, _ in result] + self.assertIn("setup", node_names) + self.assertIn("identify_purpose", node_names) + self.assertIn("plan_type", node_names) + self.assertIn("extract_constraints", node_names) def test_missing_files_are_skipped(self): with TemporaryDirectory() as d: @@ -91,10 +91,10 @@ def test_missing_files_are_skipped(self): (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") result = get_upstream_files("potential_levers", output_dir) - stage_names = [name for name, _ in result] - self.assertIn("setup", stage_names) + node_names = [name for name, _ in result] + self.assertIn("setup", node_names) # The others should be skipped because their files don't exist - self.assertNotIn("identify_purpose", stage_names) + self.assertNotIn("identify_purpose", node_names) class TestGetSourceCodePaths(unittest.TestCase): @@ -104,6 +104,6 @@ def test_potential_levers_source(self): self.assertIn("potential_levers.py", filenames) self.assertIn("identify_potential_levers.py", filenames) - def test_unknown_stage_returns_empty(self): - paths = get_source_code_paths("nonexistent_stage") + def test_unknown_node_returns_empty(self): + paths = get_source_code_paths("nonexistent_node") self.assertEqual(paths, []) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py index 18c9d6ea8..d4e89a40c 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py @@ -64,7 +64,7 @@ def test_dataclass_with_flaws(self): 
description="Budget fabricated", severity="HIGH", starting_evidence="CZK 500,000", - trace=[TraceEntry(stage="test", file="test.md", evidence="ev")], + trace=[TraceEntry(node="test", file="test.md", evidence="ev")], ) result = FlawTraceResult( starting_file="test.md", @@ -86,7 +86,7 @@ def test_defaults(self): starting_evidence="ev", trace=[], ) - self.assertIsNone(flaw.origin_stage) + self.assertIsNone(flaw.origin_node) self.assertIsNone(flaw.origin) self.assertEqual(flaw.depth, 0) self.assertTrue(flaw.trace_complete) @@ -191,11 +191,11 @@ def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content self.assertEqual(len(result.flaws), 1) flaw = result.flaws[0] # The trace should include at least executive_summary and project_plan - trace_stages = [entry.stage for entry in flaw.trace] - self.assertIn("executive_summary", trace_stages) - self.assertIn("project_plan", trace_stages) + trace_nodes = [entry.node for entry in flaw.trace] + self.assertIn("executive_summary", trace_nodes) + self.assertIn("project_plan", trace_nodes) # Origin should be project_plan (flaw found there but not in its upstream 'setup') - self.assertEqual(flaw.origin_stage, "project_plan") + self.assertEqual(flaw.origin_node, "project_plan") def test_deduplication_works(self): """Stages already checked for the same flaw should be skipped.""" @@ -300,7 +300,7 @@ def always_found(flaw_desc, evidence, upstream_filename, upstream_content): class TestFlawTracerSourceCodeAnalysis(unittest.TestCase): - """Test that Phase 3 source code analysis is invoked at the origin stage.""" + """Test that Phase 3 source code analysis is invoked at the origin node.""" def test_source_code_analysis_called_at_origin(self): with TemporaryDirectory() as d: @@ -322,11 +322,11 @@ def test_source_code_analysis_called_at_origin(self): # _analyze_source_code should have been called once for the origin mock_analyze.assert_called_once() args = mock_analyze.call_args - # First positional arg is the 
TracedFlaw, second is the stage name + # First positional arg is the TracedFlaw, second is the node name self.assertEqual(args[0][1], "executive_summary") def test_source_code_analysis_called_at_deep_origin(self): - """Phase 3 should run when the origin is found at a deeper upstream stage.""" + """Phase 3 should run when the origin is found at a deeper upstream node.""" with TemporaryDirectory() as d: output_dir = Path(d) # Create files for a chain: executive_summary -> project_plan (origin) @@ -361,7 +361,7 @@ def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content # Phase 3 should have been called at the deep origin (project_plan) mock_analyze.assert_called_once() args = mock_analyze.call_args - # Second positional arg is the origin stage name + # Second positional arg is the origin node name self.assertEqual(args[0][1], "project_plan") diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py index e0585fc3d..f1daf1465 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tracer.py @@ -12,7 +12,7 @@ from llama_index.core.llms.llm import LLM from worker_plan_internal.flaw_tracer.registry import ( - find_stage_by_filename, + find_node_by_filename, get_upstream_files, get_source_code_paths, ) @@ -32,7 +32,7 @@ @dataclass class TraceEntry: """One hop in a flaw's upstream trace.""" - stage: str + node: str file: str evidence: str is_origin: bool = False @@ -40,8 +40,8 @@ class TraceEntry: @dataclass class OriginInfo: - """Source code analysis at a flaw's origin stage.""" - stage: str + """Source code analysis at a flaw's origin node.""" + node: str file: str source_code_files: list[str] category: str # "prompt_fixable", "domain_complexity", or "missing_input" @@ -57,7 +57,7 @@ class TracedFlaw: severity: str starting_evidence: str trace: list[TraceEntry] - origin_stage: str | None = None + origin_node: str | None 
= None origin: OriginInfo | None = None depth: int = 0 trace_complete: bool = True @@ -114,7 +114,7 @@ def __init__( self.max_depth = max_depth self.verbose = verbose self._llm_calls = 0 - self._checked: set[tuple[str, str]] = set() # (stage_name, flaw_description) dedup + self._checked: set[tuple[str, str]] = set() # (node_name, flaw_description) dedup self._events = EventLogger(events_path) def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: @@ -127,12 +127,12 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: raise FileNotFoundError(f"Starting file not found: {file_path}") file_content = file_path.read_text(encoding="utf-8") - stage = find_stage_by_filename(starting_file) - stage_name = stage.name if stage else "unknown" + found_node = find_node_by_filename(starting_file) + node_name = found_node.name if found_node else "unknown" # Phase 1: Identify flaws self._log(f"Phase 1: Identifying flaws in {starting_file}") - self._events.log("phase1_start", file=starting_file, stage=stage_name) + self._events.log("phase1_start", file=starting_file, node=node_name) identified = self._identify_flaws(starting_file, file_content, flaw_description) self._log(f" Found {len(identified.flaws)} flaw(s)") self._events.log("phase1_done", flaws_found=len(identified.flaws), @@ -147,7 +147,7 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: description=flaw.description, severity=flaw.severity) starting_entry = TraceEntry( - stage=stage_name, + node=node_name, file=starting_file, evidence=flaw.evidence, is_origin=False, @@ -161,26 +161,26 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: trace=[starting_entry], ) - if stage and self.max_depth > 0: - self._trace_upstream(traced, stage_name, flaw.description, flaw.evidence, depth=0) + if found_node and self.max_depth > 0: + self._trace_upstream(traced, node_name, flaw.description, flaw.evidence, depth=0) # Mark the 
last trace entry as origin if no deeper origin was found - if traced.origin_stage is None and traced.trace: + if traced.origin_node is None and traced.trace: last = traced.trace[-1] last.is_origin = True - traced.origin_stage = last.stage + traced.origin_node = last.node traced.depth = len(traced.trace) - 1 # Phase 3: Source code analysis at origin (always, when origin is known) - if traced.origin_stage is not None: - self._events.log("phase3_start", flaw_id=flaw_id, origin_stage=traced.origin_stage) + if traced.origin_node is not None: + self._events.log("phase3_start", flaw_id=flaw_id, origin_node=traced.origin_node) self._analyze_source_code( - traced, traced.origin_stage, flaw.description, - next((e.evidence for e in traced.trace if e.stage == traced.origin_stage), flaw.evidence) + traced, traced.origin_node, flaw.description, + next((e.evidence for e in traced.trace if e.node == traced.origin_node), flaw.evidence) ) self._events.log("trace_flaw_done", flaw_id=flaw_id, - origin_stage=traced.origin_stage, depth=traced.depth) + origin_node=traced.origin_node, depth=traced.depth) traced_flaws.append(traced) # Sort by depth (deepest origin first) @@ -224,18 +224,18 @@ def execute(llm: LLM) -> UpstreamCheckResult: def _trace_upstream( self, traced: TracedFlaw, - current_stage: str, + current_node: str, flaw_description: str, evidence: str, depth: int, ) -> None: - """Recursively trace a flaw through upstream stages.""" + """Recursively trace a flaw through upstream nodes.""" if depth >= self.max_depth: traced.trace_complete = False - self._log(f" Max depth {self.max_depth} reached at {current_stage}") + self._log(f" Max depth {self.max_depth} reached at {current_node}") return - upstream_files = get_upstream_files(current_stage, self.output_dir) + upstream_files = get_upstream_files(current_node, self.output_dir) if not upstream_files: return # No upstream = this is the origin @@ -252,18 +252,18 @@ def _trace_upstream( upstream_content = 
upstream_path.read_text(encoding="utf-8") self._log(f" Checking upstream: {upstream_name} ({upstream_path.name})") - self._events.log("upstream_check", stage=upstream_name, + self._events.log("upstream_check", node=upstream_name, file=upstream_path.name, depth=depth) result = self._check_upstream(flaw_description, evidence, upstream_path.name, upstream_content) if result.found: self._log(f" -> FOUND in {upstream_name}") - self._events.log("upstream_found", stage=upstream_name, + self._events.log("upstream_found", node=upstream_name, file=upstream_path.name, depth=depth) found_upstream = True entry = TraceEntry( - stage=upstream_name, + node=upstream_name, file=upstream_path.name, evidence=result.evidence or "", is_origin=False, @@ -277,22 +277,22 @@ def _trace_upstream( ) # First-match-wins: once an origin is found in one upstream # branch, stop exploring others. - if traced.origin_stage is not None: + if traced.origin_node is not None: return if not found_upstream: - # Current stage is the origin — flaw exists here but not in any upstream - traced.origin_stage = current_stage + # Current node is the origin — flaw exists here but not in any upstream + traced.origin_node = current_node traced.depth = len(traced.trace) - self._events.log("origin_found", stage=current_stage, depth=traced.depth) - # Mark the current stage entry as origin + self._events.log("origin_found", node=current_node, depth=traced.depth) + # Mark the current node entry as origin for entry in traced.trace: - if entry.stage == current_stage: + if entry.node == current_node: entry.is_origin = True - def _analyze_source_code(self, traced: TracedFlaw, stage_name: str, flaw_description: str, evidence: str) -> None: - """Phase 3: Analyze source code at the origin stage.""" - source_paths = get_source_code_paths(stage_name) + def _analyze_source_code(self, traced: TracedFlaw, node_name: str, flaw_description: str, evidence: str) -> None: + """Phase 3: Analyze source code at the origin node.""" + 
source_paths = get_source_code_paths(node_name) if not source_paths: return @@ -306,7 +306,7 @@ def _analyze_source_code(self, traced: TracedFlaw, stage_name: str, flaw_descrip if not source_contents: return - self._log(f" Phase 3: Analyzing source code for {stage_name}") + self._log(f" Phase 3: Analyzing source code for {node_name}") messages = build_source_code_analysis_messages(flaw_description, evidence, source_contents) def execute(llm: LLM) -> SourceCodeAnalysisResult: @@ -319,7 +319,7 @@ def execute(llm: LLM) -> SourceCodeAnalysisResult: analysis = self.llm_executor.run(execute) source_file_names = [name for name, _ in source_contents] traced.origin = OriginInfo( - stage=stage_name, + node=node_name, file=traced.trace[-1].file if traced.trace else "", source_code_files=source_file_names, category=analysis.category, @@ -327,7 +327,7 @@ def execute(llm: LLM) -> SourceCodeAnalysisResult: suggestion=analysis.suggestion, ) except Exception as e: - logger.warning(f"Source code analysis failed for {stage_name}: {e}") + logger.warning(f"Source code analysis failed for {node_name}: {e}") def _log(self, message: str) -> None: """Print to stderr if verbose mode is enabled.""" From a2b7df52066574904f9f50e78e468639c46c7a09 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Tue, 7 Apr 2026 21:11:27 +0200 Subject: [PATCH 27/37] refactor: remove primary_output from NodeInfo The heuristic is now applied inline in get_upstream_files() instead of being stored on the dataclass. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/flaw_tracer/registry.py | 7 ++++--- .../flaw_tracer/tests/test_registry.py | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index 66bf7fdec..c65933366 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -21,7 +21,6 @@ class NodeInfo: """One pipeline node.""" name: str output_files: tuple[str, ...] - primary_output: str # preferred file to read when checking for flaws depends_on: tuple[str, ...] = () source_code_files: tuple[str, ...] = () @@ -50,7 +49,6 @@ def _build_registry() -> tuple[NodeInfo, ...]: nodes.append(NodeInfo( name=entry["id"], output_files=output_files, - primary_output=_pick_primary_output(entry["output_files"]), depends_on=tuple(entry["depends_on"]), source_code_files=tuple(entry["source_files"]), )) @@ -84,7 +82,10 @@ def get_upstream_files(node_name: str, output_dir: Path) -> list[tuple[str, Path upstream_node = _NODE_BY_NAME.get(upstream_name) if upstream_node is None: continue - primary_path = output_dir / upstream_node.primary_output + primary = _pick_primary_output(list(upstream_node.output_files)) + if not primary: + continue + primary_path = output_dir / primary if primary_path.exists(): result.append((upstream_name, primary_path)) return result diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py index 76d110848..cf437947c 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -22,8 +22,6 @@ def test_all_nodes_have_required_fields(self): self.assertTrue(len(node.output_files) > 0, f"{node.name} has no output_files") 
self.assertIsInstance(node.depends_on, tuple, f"{node.name} depends_on") self.assertIsInstance(node.source_code_files, tuple, f"{node.name} source_code_files") - self.assertIsInstance(node.primary_output, str, f"{node.name} primary_output") - self.assertIn(node.primary_output, node.output_files, f"{node.name} primary_output not in output_files") def test_no_duplicate_node_names(self): names = [n.name for n in NODES] From 91d6058cca4679c92d2dc58eeec16618f6ac58b5 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 01:09:37 +0200 Subject: [PATCH 28/37] refactor: replace flat source_files with structured implementation object Each node now has implementation.files with role (workflow_node or business_logic) and path, instead of a flat source_files list. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/extract_dag.py | 32 ++++++++++++------- .../flaw_tracer/registry.py | 2 +- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/worker_plan/worker_plan_internal/extract_dag.py b/worker_plan/worker_plan_internal/extract_dag.py index 642e27272..29a8038b0 100644 --- a/worker_plan/worker_plan_internal/extract_dag.py +++ b/worker_plan/worker_plan_internal/extract_dag.py @@ -135,24 +135,32 @@ def _detect_implementation_files(cls: type) -> list[str]: return files -def _extract_source_files(task: luigi.Task) -> list[str]: - """Get source files: task's own file + auto-detected implementation files.""" +def _extract_implementation(task: luigi.Task) -> dict[str, Any]: + """Get implementation info: workflow node file + auto-detected business logic files.""" cls = type(task) + files: list[dict[str, str]] = [] - # The task's own file - result: list[str] = [] + # The task's own file (workflow node) try: task_file = Path(inspect.getfile(cls)).resolve() - result.append(str(task_file.relative_to(_WORKER_PLAN_DIR))) + files.append({ + "role": "workflow_node", + "path": str(task_file.relative_to(_WORKER_PLAN_DIR)), + }) except 
(TypeError, ValueError, OSError): pass - # Supplement with auto-detected implementation files - for f in _detect_implementation_files(cls): - if f not in result: - result.append(f) + # Auto-detected implementation files (business logic) + seen = {f["path"] for f in files} + for path in _detect_implementation_files(cls): + if path not in seen: + files.append({ + "role": "business_logic", + "path": path, + }) + seen.add(path) - return result + return {"files": files} def _output_sort_key(stage: dict[str, Any]) -> tuple[int, int, str]: @@ -198,7 +206,7 @@ def _walk(task: luigi.Task) -> None: stage_name = _class_name_to_stage_name(class_name) description = cls.description() if hasattr(cls, "description") else "" output_files = _extract_output_filenames(task) - source_files = _extract_source_files(task) + implementation = _extract_implementation(task) depends_on_names = sorted(set( _class_name_to_stage_name(dep.__class__.__name__) for dep in upstream_tasks @@ -209,7 +217,7 @@ def _walk(task: luigi.Task) -> None: "description": description, "output_files": output_files, "depends_on": depends_on_names, - "source_files": source_files, + "implementation": implementation, }) _walk(root) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index c65933366..9d9ac5db3 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -50,7 +50,7 @@ def _build_registry() -> tuple[NodeInfo, ...]: name=entry["id"], output_files=output_files, depends_on=tuple(entry["depends_on"]), - source_code_files=tuple(entry["source_files"]), + source_code_files=tuple(f["path"] for f in entry["implementation"]["files"]), )) return tuple(nodes) From 1f2de491ec3585e821d718cdcb4948ba7ae9c51e Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 01:23:09 +0200 Subject: [PATCH 29/37] refactor: rename output_files to artifacts with path 
objects Each artifact is now {"path": "filename"} instead of a flat string. Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/extract_dag.py | 6 +++--- worker_plan/worker_plan_internal/flaw_tracer/registry.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/worker_plan/worker_plan_internal/extract_dag.py b/worker_plan/worker_plan_internal/extract_dag.py index 29a8038b0..171ca3cd1 100644 --- a/worker_plan/worker_plan_internal/extract_dag.py +++ b/worker_plan/worker_plan_internal/extract_dag.py @@ -165,7 +165,7 @@ def _extract_implementation(task: luigi.Task) -> dict[str, Any]: def _output_sort_key(stage: dict[str, Any]) -> tuple[int, int, str]: """Sort key: numeric prefix from the first output filename, then name.""" - filename = stage["output_files"][0] if stage.get("output_files") else "" + filename = stage["artifacts"][0]["path"] if stage.get("artifacts") else "" match = re.match(r"(\d+)-?(\d+)?", filename) if match: major = int(match.group(1)) @@ -205,7 +205,7 @@ def _walk(task: luigi.Task) -> None: cls = type(task) stage_name = _class_name_to_stage_name(class_name) description = cls.description() if hasattr(cls, "description") else "" - output_files = _extract_output_filenames(task) + artifacts = [{"path": f} for f in _extract_output_filenames(task)] implementation = _extract_implementation(task) depends_on_names = sorted(set( _class_name_to_stage_name(dep.__class__.__name__) @@ -215,7 +215,7 @@ def _walk(task: luigi.Task) -> None: stages.append({ "id": stage_name, "description": description, - "output_files": output_files, + "artifacts": artifacts, "depends_on": depends_on_names, "implementation": implementation, }) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index 9d9ac5db3..a13fbfa98 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -45,7 
+45,7 @@ def _build_registry() -> tuple[NodeInfo, ...]: dag = extract_dag() nodes = [] for entry in dag["nodes"]: - output_files = tuple(entry["output_files"]) + output_files = tuple(a["path"] for a in entry["artifacts"]) nodes.append(NodeInfo( name=entry["id"], output_files=output_files, From 03a3c1d03cb7aa3843d3e7b5d05f5067c47d1b4e Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 01:27:14 +0200 Subject: [PATCH 30/37] refactor: flatten implementation.files to top-level source_files array Replace nested "implementation": {"files": [...]} with a flat "source_files": [...] array of {role, path} objects. Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/extract_dag.py | 10 +++++----- .../worker_plan_internal/flaw_tracer/registry.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/worker_plan/worker_plan_internal/extract_dag.py b/worker_plan/worker_plan_internal/extract_dag.py index 171ca3cd1..88a7d9fa5 100644 --- a/worker_plan/worker_plan_internal/extract_dag.py +++ b/worker_plan/worker_plan_internal/extract_dag.py @@ -135,8 +135,8 @@ def _detect_implementation_files(cls: type) -> list[str]: return files -def _extract_implementation(task: luigi.Task) -> dict[str, Any]: - """Get implementation info: workflow node file + auto-detected business logic files.""" +def _extract_source_files(task: luigi.Task) -> list[dict[str, str]]: + """Get source files: workflow node file + auto-detected business logic files.""" cls = type(task) files: list[dict[str, str]] = [] @@ -160,7 +160,7 @@ def _extract_implementation(task: luigi.Task) -> dict[str, Any]: }) seen.add(path) - return {"files": files} + return files def _output_sort_key(stage: dict[str, Any]) -> tuple[int, int, str]: @@ -206,7 +206,7 @@ def _walk(task: luigi.Task) -> None: stage_name = _class_name_to_stage_name(class_name) description = cls.description() if hasattr(cls, "description") else "" artifacts = [{"path": f} for f in 
_extract_output_filenames(task)] - implementation = _extract_implementation(task) + source_files = _extract_source_files(task) depends_on_names = sorted(set( _class_name_to_stage_name(dep.__class__.__name__) for dep in upstream_tasks @@ -217,7 +217,7 @@ def _walk(task: luigi.Task) -> None: "description": description, "artifacts": artifacts, "depends_on": depends_on_names, - "implementation": implementation, + "source_files": source_files, }) _walk(root) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index a13fbfa98..57b0db58a 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -50,7 +50,7 @@ def _build_registry() -> tuple[NodeInfo, ...]: name=entry["id"], output_files=output_files, depends_on=tuple(entry["depends_on"]), - source_code_files=tuple(f["path"] for f in entry["implementation"]["files"]), + source_code_files=tuple(f["path"] for f in entry["source_files"]), )) return tuple(nodes) From 677f7c6c5b42375d8aa1fafc39102f8953fe0c3b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 01:46:26 +0200 Subject: [PATCH 31/37] refactor: replace depends_on with inputs array containing from_node and artifact_path Each input now specifies which upstream node it reads from and which specific artifact file it consumes, instead of a flat list of node names. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/extract_dag.py | 45 ++++++++++++++++-- .../flaw_tracer/registry.py | 46 ++++++++----------- .../flaw_tracer/tests/test_registry.py | 8 ++-- 3 files changed, 62 insertions(+), 37 deletions(-) diff --git a/worker_plan/worker_plan_internal/extract_dag.py b/worker_plan/worker_plan_internal/extract_dag.py index 88a7d9fa5..eb0665bf2 100644 --- a/worker_plan/worker_plan_internal/extract_dag.py +++ b/worker_plan/worker_plan_internal/extract_dag.py @@ -163,6 +163,44 @@ def _extract_source_files(task: luigi.Task) -> list[dict[str, str]]: return files +def _pick_primary_output(filenames: list[str]) -> str: + """Pick the most likely file to be read from a node's outputs. + + Preference: .md > .html > non-raw file > first file. + """ + for ext in (".md", ".html"): + for f in filenames: + if f.endswith(ext): + return f + non_raw = [f for f in filenames if "_raw" not in f] + if non_raw: + return non_raw[0] + return filenames[0] if filenames else "" + + +def _extract_inputs(upstream_tasks: list[luigi.Task]) -> list[dict[str, str]]: + """Build inputs list: for each upstream task, identify the primary artifact it provides.""" + inputs: list[dict[str, str]] = [] + seen: set[str] = set() + + for dep in upstream_tasks: + node_name = _class_name_to_stage_name(dep.__class__.__name__) + if node_name in seen: + continue + seen.add(node_name) + + output_files = _extract_output_filenames(dep) + primary = _pick_primary_output(output_files) + if primary: + inputs.append({ + "from_node": node_name, + "artifact_path": primary, + }) + + inputs.sort(key=lambda x: x["from_node"]) + return inputs + + def _output_sort_key(stage: dict[str, Any]) -> tuple[int, int, str]: """Sort key: numeric prefix from the first output filename, then name.""" filename = stage["artifacts"][0]["path"] if stage.get("artifacts") else "" @@ -206,17 +244,14 @@ def _walk(task: luigi.Task) -> None: stage_name = _class_name_to_stage_name(class_name) 
description = cls.description() if hasattr(cls, "description") else "" artifacts = [{"path": f} for f in _extract_output_filenames(task)] + inputs = _extract_inputs(upstream_tasks) source_files = _extract_source_files(task) - depends_on_names = sorted(set( - _class_name_to_stage_name(dep.__class__.__name__) - for dep in upstream_tasks - )) stages.append({ "id": stage_name, "description": description, "artifacts": artifacts, - "depends_on": depends_on_names, + "inputs": inputs, "source_files": source_files, }) diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/flaw_tracer/registry.py index 57b0db58a..a0c990b1b 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/flaw_tracer/registry.py @@ -16,40 +16,36 @@ _SOURCE_BASE = Path(__file__).resolve().parent.parent.parent # worker_plan/ +@dataclass(frozen=True) +class NodeInput: + """One input to a pipeline node: the upstream node name and the artifact it provides.""" + from_node: str + artifact_path: str + + @dataclass(frozen=True) class NodeInfo: """One pipeline node.""" name: str output_files: tuple[str, ...] - depends_on: tuple[str, ...] = () + inputs: tuple[NodeInput, ...] = () source_code_files: tuple[str, ...] = () -def _pick_primary_output(filenames: list[str]) -> str: - """Pick the best file to read when checking a node for flaws. - - Preference: .md > .html > non-raw file > first file. 
- """ - for ext in (".md", ".html"): - for f in filenames: - if f.endswith(ext): - return f - non_raw = [f for f in filenames if "_raw" not in f] - if non_raw: - return non_raw[0] - return filenames[0] if filenames else "" - - def _build_registry() -> tuple[NodeInfo, ...]: """Build the registry from Luigi task introspection.""" dag = extract_dag() nodes = [] for entry in dag["nodes"]: output_files = tuple(a["path"] for a in entry["artifacts"]) + inputs = tuple( + NodeInput(from_node=inp["from_node"], artifact_path=inp["artifact_path"]) + for inp in entry["inputs"] + ) nodes.append(NodeInfo( name=entry["id"], output_files=output_files, - depends_on=tuple(entry["depends_on"]), + inputs=inputs, source_code_files=tuple(f["path"] for f in entry["source_files"]), )) return tuple(nodes) @@ -72,22 +68,16 @@ def find_node_by_filename(filename: str) -> NodeInfo | None: def get_upstream_files(node_name: str, output_dir: Path) -> list[tuple[str, Path]]: - """Return (node_name, file_path) pairs for upstream nodes whose primary output exists on disk.""" + """Return (node_name, file_path) pairs for upstream nodes whose artifact exists on disk.""" node = _NODE_BY_NAME.get(node_name) if node is None: return [] result = [] - for upstream_name in node.depends_on: - upstream_node = _NODE_BY_NAME.get(upstream_name) - if upstream_node is None: - continue - primary = _pick_primary_output(list(upstream_node.output_files)) - if not primary: - continue - primary_path = output_dir / primary - if primary_path.exists(): - result.append((upstream_name, primary_path)) + for inp in node.inputs: + artifact_path = output_dir / inp.artifact_path + if artifact_path.exists(): + result.append((inp.from_node, artifact_path)) return result diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py index cf437947c..48072280d 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ 
b/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py @@ -20,18 +20,18 @@ def test_all_nodes_have_required_fields(self): self.assertIsInstance(node.name, str, f"{node.name} name") self.assertIsInstance(node.output_files, tuple, f"{node.name} output_files") self.assertTrue(len(node.output_files) > 0, f"{node.name} has no output_files") - self.assertIsInstance(node.depends_on, tuple, f"{node.name} depends_on") + self.assertIsInstance(node.inputs, tuple, f"{node.name} inputs") self.assertIsInstance(node.source_code_files, tuple, f"{node.name} source_code_files") def test_no_duplicate_node_names(self): names = [n.name for n in NODES] self.assertEqual(len(names), len(set(names))) - def test_upstream_references_are_valid(self): + def test_input_references_are_valid(self): valid_names = {n.name for n in NODES} for node in NODES: - for upstream in node.depends_on: - self.assertIn(upstream, valid_names, f"{node.name} references unknown upstream '{upstream}'") + for inp in node.inputs: + self.assertIn(inp.from_node, valid_names, f"{node.name} references unknown node '{inp.from_node}'") class TestFindNodeByFilename(unittest.TestCase): From b06c936d0d5d087ef29bdafb2c46bc33dd66e438 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 03:03:15 +0200 Subject: [PATCH 32/37] =?UTF-8?q?docs:=20add=20proposal=20133=20=E2=80=94?= =?UTF-8?q?=20DAG=20format=20insights=20and=20RCA=20strategy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Analyzes the current DAG JSON format's strengths for root cause analysis and identifies gaps (claim-level provenance, runtime context, artifact semantics). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-04-05-flaw-tracer.md | 11 ++- .../specs/2026-04-05-flaw-tracer-design.md | 7 +- .../flaw_tracer/AGENTS.md | 65 ++++++++------- .../flaw_tracer/README.md | 83 +++++++++++++------ 4 files changed, 107 insertions(+), 59 deletions(-) diff --git a/docs/superpowers/plans/2026-04-05-flaw-tracer.md b/docs/superpowers/plans/2026-04-05-flaw-tracer.md index 97b49ddb8..ab4872fa4 100644 --- a/docs/superpowers/plans/2026-04-05-flaw-tracer.md +++ b/docs/superpowers/plans/2026-04-05-flaw-tracer.md @@ -1,10 +1,15 @@ -# Flaw Tracer Implementation Plan +# Root Cause Analysis (RCA) Implementation Plan + +> **Historical note:** This plan was written under the name "flaw tracer". The module +> may be renamed to `rca`, `trace`, `provenance`, or `upstream_tracer` in a future PR. +> The static DAG registry described here has since been replaced by `extract_dag.py` +> which introspects the Luigi task graph at import time. > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. -**Goal:** Build a CLI tool that traces flaws in PlanExe reports upstream through the pipeline DAG to find root causes. +**Goal:** Build a CLI tool that traces problems in PlanExe reports upstream through the pipeline DAG to find root causes. -**Architecture:** Recursive depth-first search through a static DAG registry. Three LLM prompts (flaw identification, upstream check, source code analysis) use Pydantic structured output via LLMExecutor. Produces JSON + markdown reports. +**Architecture:** Recursive depth-first search using a DAG registry built from Luigi task introspection. Three LLM prompts (problem identification, upstream check, source code analysis) use Pydantic structured output via LLMExecutor. Produces JSON + markdown reports. 
**Tech Stack:** Python 3.13, llama-index LLM infrastructure, Pydantic v2, argparse, pytest diff --git a/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md b/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md index cd77ac537..7b5ad0e4f 100644 --- a/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md +++ b/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md @@ -1,4 +1,9 @@ -# Flaw Tracer — Root-Cause Analysis for PlanExe Reports +# Root Cause Analysis (RCA) for PlanExe Reports + +> **Historical note:** This spec was written under the name "flaw tracer". The module +> may be renamed to `rca`, `trace`, `provenance`, or `upstream_tracer` in a future PR. +> The static DAG registry described here has since been replaced by `extract_dag.py` +> which introspects the Luigi task graph at import time. ## Goal diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md index 9e49a4619..be38c5afe 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md @@ -1,26 +1,30 @@ -# Flaw Tracer — Status and Known Issues +# Root Cause Analysis (RCA) — Status and Known Issues + +> **Naming note:** This module is currently `flaw_tracer`. Candidates for rename: +> `rca`, `trace`, `provenance`, `upstream_tracer`. See README.md for rationale. ## What works well -- **DAG traversal is correct.** The registry maps all 70 nodes, upstream resolution works, dedup prevents redundant checks, depth limiting works. -- **Phase 1 anchors to the user's flaw.** The user's specific flaw is always the first result, with additional flaws limited to the same problem family. -- **Upstream checks require causal links.** The prompt requires the LLM to explain *how* upstream content caused the downstream flaw, not just topical overlap. This produces tighter, more accurate traces. 
-- **Phase 3 classifies root causes.** Each origin is categorized as `prompt_fixable`, `domain_complexity`, or `missing_input`. Verified: the India census caste enumeration flaw is correctly classified as `domain_complexity`, while the workforce feasibility flaw is `prompt_fixable`. +- **DAG is auto-generated.** The registry builds from `extract_dag.py` via Luigi task introspection at import time — no hand-maintained mapping. Adding, removing, or renaming pipeline nodes requires zero manual updates. +- **Phase 1 anchors to the user's problem.** The user's specific problem is always the first result, with additional problems limited to the same family. +- **Upstream checks require causal links.** The prompt requires the LLM to explain *how* upstream content caused the downstream problem, not just topical overlap. This produces tighter, more accurate traces. +- **Phase 3 classifies root causes.** Each origin is categorized as `prompt_fixable`, `domain_complexity`, or `missing_input`. Verified: the India census caste enumeration problem is correctly classified as `domain_complexity`, while the workforce feasibility problem is `prompt_fixable`. - **Evidence quotes are concise.** Both Phase 1 and Phase 2 prompts instruct the LLM to keep quotes under 200 characters. - **Source code filenames are disambiguated.** Shows `nodes/identify_purpose.py` and `assume/identify_purpose.py` instead of duplicate bare filenames. - **Depth sorting is useful.** Deepest root causes appear first, matching the user's intent of finding the earliest upstream origin. - **Events.jsonl enables live monitoring.** Users can `tail -f events.jsonl` to watch progress instead of waiting blindly. -- **Focused output.** A typical run finds 2-3 flaws in the same problem family and makes 15-30 LLM calls (down from 17 flaws / 153 calls before prompt improvements). 
+- **Focused output.** A typical run finds 2-3 problems in the same family and makes 15-30 LLM calls (down from 17 problems / 153 calls before prompt improvements). +- **DAG schema is rich.** Each node has artifacts, inputs (with from_node + artifact_path), and source_files (with role: workflow_node or business_logic). This enables artifact-level provenance tracing. ## Fixed issues -### Phase 1 didn't anchor to user's flaw (was HIGH, fixed) +### Phase 1 didn't anchor to user's problem (was HIGH, fixed) -The Phase 1 prompt now requires the user's specific flaw as the first item, with additional flaws limited to the same problem family. Before the fix, the LLM would ignore the user's flaw and identify unrelated issues. +The Phase 1 prompt now requires the user's specific problem as the first item, with additional problems limited to the same family. Before the fix, the LLM would ignore the user's problem and identify unrelated issues. ### Upstream checks were too loose (was MEDIUM, fixed) -The Phase 2 prompt now requires a causal mechanism ("how did this upstream content lead to the downstream flaw?") and explicitly rejects topical overlap. Before the fix, the LLM would say "found" whenever an upstream file discussed a related topic. +The Phase 2 prompt now requires a causal mechanism ("how did this upstream content lead to the downstream problem?") and explicitly rejects topical overlap. Before the fix, the LLM would say "found" whenever an upstream file discussed a related topic. ### Evidence quotes were too long (was MEDIUM, fixed) @@ -39,55 +43,58 @@ Before the fix, every suggestion was "modify the system prompt" even when the re Source code paths now include the parent directory (`nodes/identify_purpose.py`) to disambiguate files with the same name in different packages. +### Static registry drifted (was MEDIUM, fixed) + +The DAG registry was a 780-line hand-maintained copy of the pipeline topology. 
Now replaced with `extract_dag.py` which introspects the Luigi task graph at import time. Zero maintenance needed when pipeline changes. + ## Open issues ### MEDIUM: Non-determinism untested This is LLM judging LLM output. Every upstream check is a subjective call. Two runs on the same input may produce different traces. We haven't tested reproducibility — run the same input 3 times and compare. If traces diverge significantly, consider requiring higher-confidence matches or running multiple passes and intersecting results. -### MEDIUM: Static registry will drift - -The DAG mapping in `registry.py` is a hand-maintained copy of the pipeline topology. Adding, removing, or renaming nodes requires a manual update — the registry won't auto-detect changes. If the registry falls out of sync, traces will silently miss nodes or follow wrong paths. +### LOW: First-match-wins may miss parallel origins -**Fix direction:** Generate the registry from Luigi task introspection at startup, or add a CI check that compares the registry against the actual task classes. +The `_trace_upstream` method follows only the first upstream branch where the problem is found. Real problems often have multiple contributing causes from parallel branches, but only one is traced. The trace looks clean and linear, but reality is messier. -### LOW: First-match-wins may miss parallel origins +**Fix direction:** Add a `--thorough` mode that follows all branches where the problem is found, producing a tree instead of a chain. -The `_trace_upstream` method follows only the first upstream branch where the flaw is found. Real flaws often have multiple contributing causes from parallel branches, but only one is traced. The trace looks clean and linear, but reality is messier. +### LOW: Problem convergence on same origin -**Fix direction:** Add a `--thorough` mode that follows all branches where the flaw is found, producing a tree instead of a chain. 
+After prompt tightening, convergence makes sense — problems in the same family naturally trace to the same origin. Monitor across more diverse runs. -### LOW: Flaw convergence on same origin +### LOW: Artifact-level only, not claim-level -After prompt tightening, convergence makes sense — flaws in the same problem family naturally trace to the same origin. Monitor across more diverse runs. +The tool traces at the artifact level (which file introduced the problem) but cannot yet attribute individual sentences to specific input spans. See `docs/proposals/133-dag-and-rca.md` for the gap analysis and future directions. ## Test runs completed -1. **India census v1** (`20250101_india_census`): Old prompts. 17 flaws, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Flaws not anchored to user input, traces loose, evidence bloated. +1. **India census v1** (`20250101_india_census`): Old prompts. 17 problems, 153 LLM calls, deepest origin: `potential_levers` (depth 6). Problems not anchored to user input, traces loose, evidence bloated. -2. **Minecraft escape v1** (`20251016_minecraft_escape`): Old prompts. Flaw about zoning/permits. 5 flaws, 43 LLM calls. User's flaw not identified. Exposed Phase 1 anchoring problem. +2. **Minecraft escape v1** (`20251016_minecraft_escape`): Old prompts. Problem about zoning/permits. 5 problems, 43 LLM calls. User's problem not identified. Exposed Phase 1 anchoring problem. -3. **Minecraft escape v2** (`20251016_minecraft_escape`): New prompts. 3 flaws, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's flaw correctly identified as flaw_001. All flaws in same problem family (regulatory gaps). +3. **Minecraft escape v2** (`20251016_minecraft_escape`): New prompts. 3 problems, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's problem correctly identified as flaw_001. All problems in same family (regulatory gaps). -4. **India census v2** (`20250101_india_census`): New prompts. 
2 flaws (down from 17), 17 LLM calls (down from 153), deepest origin: `potential_levers` (depth 6). User's flaw correctly identified. Exposed Phase 3 "always blames prompt" limitation. +4. **India census v2** (`20250101_india_census`): New prompts. 2 problems (down from 17), 17 LLM calls (down from 153), deepest origin: `potential_levers` (depth 6). User's problem correctly identified. Exposed Phase 3 "always blames prompt" limitation. -5. **India census v3** (`20250101_india_census`): New prompts + Phase 3 classification. 2 flaws, 17 LLM calls. Caste enumeration correctly classified as `domain_complexity`. Workforce feasibility correctly classified as `prompt_fixable`. All fixes verified working. +5. **India census v3** (`20250101_india_census`): New prompts + Phase 3 classification. 2 problems, 17 LLM calls. Caste enumeration correctly classified as `domain_complexity`. Workforce feasibility correctly classified as `prompt_fixable`. All fixes verified working. ## Honest assessment -The tool is a useful diagnostic prototype. The trace chains are the most trustworthy part — they're mechanically grounded in the DAG structure. The suggestions are LLM opinions — useful starting points, not patches. +The tool is a useful diagnostic prototype for root cause analysis. The trace chains are the most trustworthy part — they're mechanically grounded in the DAG structure. The suggestions are LLM opinions — useful starting points, not patches. -The category classification (`prompt_fixable` / `domain_complexity` / `missing_input`) turned out to be the most valuable feature. It prevents wasted effort on flaws that can't be fixed by prompt editing. +The category classification (`prompt_fixable` / `domain_complexity` / `missing_input`) turned out to be the most valuable feature. It prevents wasted effort on problems that can't be fixed by prompt editing. -The tool is diagnostic, not prescriptive. 
It tells you *where* a flaw originated and *why*, but someone still has to decide what to do. It can't catch flaws that don't leave textual evidence — timing issues, model-specific quirks, or structural DAG problems are invisible. +The tool is diagnostic, not prescriptive. It tells you *where* a problem originated and *why*, but someone still has to decide what to do. It can't catch problems that don't leave textual evidence — timing issues, model-specific quirks, or structural DAG problems are invisible. -Starting from `029-2-self_audit.md` is the sweet spot. That file already contains identified issues, so the tracer is tracing known problems upstream rather than discovering flaws from scratch. +Starting from `029-2-self_audit.md` is the sweet spot. That file already contains identified issues, so the tracer is tracing known problems upstream rather than discovering problems from scratch. Before relying on this for automated decisions (e.g., in the self-improve loop), it needs more diverse test runs (10+ plans) and reproducibility testing. ## Architecture notes - The tool runs from `worker_plan/` directory using Python 3.11. +- The DAG registry is built from `extract_dag.py` at import time — no static data. - LLM calls go through `LLMExecutor` with the active model profile (`PLANEXE_MODEL_PROFILE`). -- The `record_usage_metric called but no usage metrics path is set` warnings are harmless — the flaw tracer doesn't set up the metrics path since it's a standalone CLI tool, not a pipeline task. -- The first-match-wins strategy in `_trace_upstream` means only one upstream branch is followed per flaw. If the flaw exists in multiple upstream branches, only the first one encountered is traced. +- The `record_usage_metric called but no usage metrics path is set` warnings are harmless — the RCA tool doesn't set up the metrics path since it's a standalone CLI tool, not a pipeline task. 
+- The first-match-wins strategy in `_trace_upstream` means only one upstream branch is followed per problem. If the problem exists in multiple upstream branches, only the first one encountered is traced. diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/flaw_tracer/README.md index e64e163ab..5148bafb1 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/README.md +++ b/worker_plan/worker_plan_internal/flaw_tracer/README.md @@ -1,15 +1,25 @@ -# Flaw Tracer +# Root Cause Analysis (RCA) for PlanExe -Root-cause analysis tool for PlanExe reports. Given a flaw observed in a pipeline output, it traces upstream through the DAG of intermediary files to find where the flaw originated. +> **Naming note:** This module is currently named `flaw_tracer`. Candidate names under consideration: +> - `rca` — direct, matches the goal (root cause analysis) +> - `trace` — short, verb-oriented +> - `provenance` — emphasizes the artifact lineage aspect +> - `upstream_tracer` — describes the direction of analysis +> +> The module may be renamed in a future PR. + +Given a problem observed in a pipeline output, this tool traces upstream through the DAG of intermediary artifacts to find where the problem originated and classify its root cause. ## How it works -PlanExe runs a DAG of ~70 tasks. Each task reads upstream files, calls an LLM, and writes output files (prefixed `001-` through `030-`). Flaws introduced early propagate downstream into later nodes and the final report. +PlanExe runs a DAG of ~70 nodes. Each node reads upstream artifacts, calls an LLM, and writes output artifacts (prefixed `001-` through `030-`). Problems introduced early propagate downstream into later nodes and the final report. + +The DAG structure is extracted automatically from the Luigi task graph by `extract_dag.py` — no hand-maintained registry needed. The registry builds at import time via Luigi task introspection. 
-The flaw tracer performs a recursive depth-first search: +The tool performs a recursive depth-first search: -1. **Phase 1 — Identify flaws.** Reads the starting file and locates the specific flaw you described, plus any closely related flaws in the same problem family. -2. **Phase 2 — Trace upstream.** For each flaw, walks upstream through the DAG one hop at a time, asking the LLM whether the flaw was *caused by* content in each input file (requires causal link, not just topical overlap). Continues until it finds a node where the flaw exists in the output but not in any inputs. +1. **Phase 1 — Identify problems.** Reads the starting artifact and locates the specific problem you described, plus any closely related problems in the same family. +2. **Phase 2 — Trace upstream.** For each problem, walks upstream through the DAG one hop at a time, asking the LLM whether the problem was *caused by* content in each input artifact (requires causal link, not just topical overlap). Continues until it finds a node where the problem exists in the output but not in any inputs. 3. **Phase 3 — Analyze source code and classify.** At the origin node, reads the Python source code and classifies the root cause: - **Prompt fixable** — the prompt has a gap that can be fixed by editing it - **Domain complexity** — the topic is inherently uncertain or contentious, no prompt change resolves it @@ -17,6 +27,15 @@ The flaw tracer performs a recursive depth-first search: Output is a JSON file (`flaw_trace.json`), a markdown report (`flaw_trace.md`), and a live event log (`events.jsonl`), sorted by trace depth so the deepest root cause appears first. +## DAG integration + +The pipeline DAG is defined in `extract_dag.py` which introspects the actual Luigi task graph at import time. 
Each node in the DAG provides: +- **artifacts** — output files the node produces +- **inputs** — which upstream node and specific artifact each node reads +- **source_files** — the workflow node file and business logic files + +This means the RCA tool always stays in sync with the pipeline — no manual registry updates needed when nodes are added, removed, or renamed. + ## Prerequisites - Python 3.11 (`/opt/homebrew/bin/python3.11` on macOS with Homebrew) @@ -37,7 +56,7 @@ Basic usage: /opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ --dir /path/to/output \ --file 030-report.html \ - --flaw "Description of the flaw you observed" \ + --flaw "Description of the problem you observed" \ --verbose ``` @@ -45,20 +64,20 @@ Basic usage: | Argument | Required | Description | |----------|----------|-------------| -| `--dir` | Yes | Path to the output directory containing intermediary files | -| `--file` | Yes | Starting file to analyze (relative to `--dir`) | -| `--flaw` | Yes | Text description of the observed flaw(s) | +| `--dir` | Yes | Path to the output directory containing intermediary artifacts | +| `--file` | Yes | Starting artifact to analyze (relative to `--dir`) | +| `--flaw` | Yes | Text description of the observed problem(s) | | `--output-dir` | No | Where to write reports (defaults to `--dir`) | -| `--max-depth` | No | Maximum upstream hops per flaw (default: 15) | +| `--max-depth` | No | Maximum upstream hops per problem (default: 15) | | `--verbose` | No | Print each LLM call to stderr as the trace runs | ### Starting files -You can start from any intermediary file. Common starting points: +You can start from any intermediary artifact. 
Common starting points: | File | What it is | |------|------------| -| `030-report.html` | The final HTML report (largest, most flaws to find) | +| `030-report.html` | The final HTML report (largest, most problems to find) | | `029-2-self_audit.md` | Self-audit (already identifies issues — good for tracing them back) | | `025-2-executive_summary.md` | Executive summary | | `024-2-review_plan.md` | Plan review | @@ -66,25 +85,25 @@ You can start from any intermediary file. Common starting points: ### Examples -Trace a flaw from the self-audit: +Trace a problem from the self-audit: ```bash /opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ --dir /path/to/output/20250101_india_census \ --file 029-2-self_audit.md \ --flaw "No Real-World Proof. The plan combines a digital census with caste enumeration at an unprecedented scale, lacking independent evidence of success." \ - --output-dir /tmp/flaw-analysis \ + --output-dir /tmp/rca-analysis \ --verbose ``` -Trace a zoning/permits flaw: +Trace a zoning/permits problem: ```bash /opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ --dir /path/to/output/20251016_minecraft_escape \ --file 029-2-self_audit.md \ --flaw "Infeasible Constraints Rated MEDIUM because the plan mentions zoning and permits but lacks specifics for the Shanghai location." \ - --output-dir /tmp/flaw-analysis2 \ + --output-dir /tmp/rca-analysis2 \ --verbose ``` @@ -93,7 +112,7 @@ Trace a zoning/permits flaw: While the tracer runs, watch the live event log in another terminal: ```bash -tail -f /tmp/flaw-analysis/events.jsonl +tail -f /tmp/rca-analysis/events.jsonl ``` ### Output @@ -104,23 +123,35 @@ Each run produces three files in `--output-dir` (or `--dir` if not specified): - `flaw_trace.md` — human-readable report with trace tables - `events.jsonl` — live event log for monitoring progress -Flaws are sorted by trace depth (deepest root cause first). 
Each flaw's origin includes a **category** (`prompt_fixable`, `domain_complexity`, or `missing_input`) so you know whether the fix is a prompt edit, a domain limitation to accept, or a need for more detail in the plan input. +Problems are sorted by trace depth (deepest root cause first). Each problem's origin includes a **category** (`prompt_fixable`, `domain_complexity`, or `missing_input`) so you know whether the fix is a prompt edit, a domain limitation to accept, or a need for more detail in the plan input. + +A typical run finds 2-3 focused problems and makes 15-30 LLM calls. + +## RCA investigation strategy + +The tool implements the investigation strategy described in `docs/proposals/133-dag-and-rca.md`: -A typical run finds 2-3 focused flaws and makes 15-30 LLM calls. +1. Start from the final artifact (e.g., `030-report.html`) +2. Inspect direct input artifacts to the producing node +3. Search those artifacts for the false claim or problem +4. When found upstream, recurse into that node's inputs +5. Continue until reaching the earliest artifact containing the problem +6. Inspect the producing node's source files +7. Classify the failure mode ## Tips -- **Start from `029-2-self_audit.md`.** This file already contains identified issues, so you're tracing *known* problems upstream rather than asking the LLM to find flaws from scratch. -- **Trust the trace chains more than the suggestions.** The upstream path (which nodes the flaw passed through) is mechanically grounded in the DAG. The suggestions are LLM opinions — useful starting points, not patches. +- **Start from `029-2-self_audit.md`.** This file already contains identified issues, so you're tracing *known* problems upstream rather than asking the LLM to find problems from scratch. +- **Trust the trace chains more than the suggestions.** The upstream path (which nodes the problem passed through) is mechanically grounded in the DAG. The suggestions are LLM opinions — useful starting points, not patches. 
- **Check the category before acting.** If the origin is `domain_complexity`, don't spend time tweaking the prompt. If it's `prompt_fixable`, the suggestion is likely actionable. - **Results are non-deterministic.** This is LLM judging LLM output. Two runs on the same input may produce slightly different traces. If a finding matters, run it twice. ## Limitations -- **LLM subjectivity.** Every hop in the trace is a judgment call by the LLM ("did this upstream file cause the downstream flaw?"). The causal-link requirement helps, but it's still one LLM's opinion. -- **First-match-wins.** When a flaw has precursors in multiple parallel upstream branches, only the first branch found is followed. Real flaws often have multiple contributing causes. -- **Static registry.** The DAG mapping is hand-maintained in `registry.py`. Adding, removing, or renaming pipeline nodes requires a manual registry update — it won't auto-detect changes. -- **Text-only.** The tracer can only catch flaws that leave textual evidence in intermediary files. Timing issues, model-specific quirks, or structural DAG problems are invisible to it. +- **LLM subjectivity.** Every hop in the trace is a judgment call by the LLM ("did this upstream artifact cause the downstream problem?"). The causal-link requirement helps, but it's still one LLM's opinion. +- **First-match-wins.** When a problem has precursors in multiple parallel upstream branches, only the first branch found is followed. Real problems often have multiple contributing causes. +- **Text-only.** The tracer can only catch problems that leave textual evidence in intermediary artifacts. Timing issues, model-specific quirks, or structural DAG problems are invisible to it. +- **Artifact-level, not claim-level.** The tool can identify which artifact and node likely introduced a problem, but cannot yet prove which exact sentence transformation introduced a specific false claim (see `docs/proposals/133-dag-and-rca.md` for the gap analysis). 
- **Diagnostic, not prescriptive.** It tells you *where* and *why*, but someone still has to decide what to do about it. ## Running tests From 7827d8455ad189794959d6617df2d92b1aceb575 Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 15:10:23 +0200 Subject: [PATCH 33/37] refactor: rename flaw_tracer to rca and FlawTracer to RootCauseAnalyzer Co-Authored-By: Claude Opus 4.6 (1M context) --- ...04-05-flaw-tracer.md => 2026-04-05-rca.md} | 3 ++- ...cer-design.md => 2026-04-05-rca-design.md} | 2 +- .../flaw_tracer/__init__.py | 1 - .../{flaw_tracer => rca}/AGENTS.md | 3 --- .../{flaw_tracer => rca}/README.md | 16 +++--------- .../worker_plan_internal/rca/__init__.py | 1 + .../{flaw_tracer => rca}/__main__.py | 12 ++++----- .../{flaw_tracer => rca}/output.py | 4 +-- .../{flaw_tracer => rca}/prompts.py | 4 +-- .../{flaw_tracer => rca}/registry.py | 4 +-- .../{flaw_tracer => rca}/tests/__init__.py | 0 .../{flaw_tracer => rca}/tests/test_output.py | 6 ++--- .../tests/test_prompts.py | 4 +-- .../tests/test_registry.py | 4 +-- .../{flaw_tracer => rca}/tests/test_tracer.py | 26 +++++++++---------- .../{flaw_tracer => rca}/tracer.py | 8 +++--- 16 files changed, 44 insertions(+), 54 deletions(-) rename docs/superpowers/plans/{2026-04-05-flaw-tracer.md => 2026-04-05-rca.md} (99%) rename docs/superpowers/specs/{2026-04-05-flaw-tracer-design.md => 2026-04-05-rca-design.md} (99%) delete mode 100644 worker_plan/worker_plan_internal/flaw_tracer/__init__.py rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/AGENTS.md (98%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/README.md (92%) create mode 100644 worker_plan/worker_plan_internal/rca/__init__.py rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/__main__.py (91%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/output.py (97%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/prompts.py (98%) rename worker_plan/worker_plan_internal/{flaw_tracer => 
rca}/registry.py (95%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/tests/__init__.py (100%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/tests/test_output.py (96%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/tests/test_prompts.py (97%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/tests/test_registry.py (97%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/tests/test_tracer.py (96%) rename worker_plan/worker_plan_internal/{flaw_tracer => rca}/tracer.py (98%) diff --git a/docs/superpowers/plans/2026-04-05-flaw-tracer.md b/docs/superpowers/plans/2026-04-05-rca.md similarity index 99% rename from docs/superpowers/plans/2026-04-05-flaw-tracer.md rename to docs/superpowers/plans/2026-04-05-rca.md index ab4872fa4..fed9dae2b 100644 --- a/docs/superpowers/plans/2026-04-05-flaw-tracer.md +++ b/docs/superpowers/plans/2026-04-05-rca.md @@ -1,7 +1,8 @@ # Root Cause Analysis (RCA) Implementation Plan > **Historical note:** This plan was written under the name "flaw tracer". The module -> may be renamed to `rca`, `trace`, `provenance`, or `upstream_tracer` in a future PR. +> has been renamed to `rca` (root cause analysis) — all paths referencing `flaw_tracer` +> in this document now correspond to `rca`. > The static DAG registry described here has since been replaced by `extract_dag.py` > which introspects the Luigi task graph at import time. diff --git a/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md b/docs/superpowers/specs/2026-04-05-rca-design.md similarity index 99% rename from docs/superpowers/specs/2026-04-05-flaw-tracer-design.md rename to docs/superpowers/specs/2026-04-05-rca-design.md index 7b5ad0e4f..e8dd344b3 100644 --- a/docs/superpowers/specs/2026-04-05-flaw-tracer-design.md +++ b/docs/superpowers/specs/2026-04-05-rca-design.md @@ -1,7 +1,7 @@ # Root Cause Analysis (RCA) for PlanExe Reports > **Historical note:** This spec was written under the name "flaw tracer". 
The module -> may be renamed to `rca`, `trace`, `provenance`, or `upstream_tracer` in a future PR. +> has been renamed to `rca` (root cause analysis). > The static DAG registry described here has since been replaced by `extract_dag.py` > which introspects the Luigi task graph at import time. diff --git a/worker_plan/worker_plan_internal/flaw_tracer/__init__.py b/worker_plan/worker_plan_internal/flaw_tracer/__init__.py deleted file mode 100644 index e6ca3af64..000000000 --- a/worker_plan/worker_plan_internal/flaw_tracer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Flaw Tracer — Root-cause analysis for PlanExe reports.""" diff --git a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md b/worker_plan/worker_plan_internal/rca/AGENTS.md similarity index 98% rename from worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md rename to worker_plan/worker_plan_internal/rca/AGENTS.md index be38c5afe..de8d0df32 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/AGENTS.md +++ b/worker_plan/worker_plan_internal/rca/AGENTS.md @@ -1,8 +1,5 @@ # Root Cause Analysis (RCA) — Status and Known Issues -> **Naming note:** This module is currently `flaw_tracer`. Candidates for rename: -> `rca`, `trace`, `provenance`, `upstream_tracer`. See README.md for rationale. - ## What works well - **DAG is auto-generated.** The registry builds from `extract_dag.py` via Luigi task introspection at import time — no hand-maintained mapping. Adding, removing, or renaming pipeline nodes requires zero manual updates. 
diff --git a/worker_plan/worker_plan_internal/flaw_tracer/README.md b/worker_plan/worker_plan_internal/rca/README.md similarity index 92% rename from worker_plan/worker_plan_internal/flaw_tracer/README.md rename to worker_plan/worker_plan_internal/rca/README.md index 5148bafb1..9b5986cfa 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/README.md +++ b/worker_plan/worker_plan_internal/rca/README.md @@ -1,13 +1,5 @@ # Root Cause Analysis (RCA) for PlanExe -> **Naming note:** This module is currently named `flaw_tracer`. Candidate names under consideration: -> - `rca` — direct, matches the goal (root cause analysis) -> - `trace` — short, verb-oriented -> - `provenance` — emphasizes the artifact lineage aspect -> - `upstream_tracer` — describes the direction of analysis -> -> The module may be renamed in a future PR. - Given a problem observed in a pipeline output, this tool traces upstream through the DAG of intermediary artifacts to find where the problem originated and classify its root cause. ## How it works @@ -53,7 +45,7 @@ cd worker_plan Basic usage: ```bash -/opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ +/opt/homebrew/bin/python3.11 -m worker_plan_internal.rca \ --dir /path/to/output \ --file 030-report.html \ --flaw "Description of the problem you observed" \ @@ -88,7 +80,7 @@ You can start from any intermediary artifact. Common starting points: Trace a problem from the self-audit: ```bash -/opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ +/opt/homebrew/bin/python3.11 -m worker_plan_internal.rca \ --dir /path/to/output/20250101_india_census \ --file 029-2-self_audit.md \ --flaw "No Real-World Proof. The plan combines a digital census with caste enumeration at an unprecedented scale, lacking independent evidence of success." 
\ @@ -99,7 +91,7 @@ Trace a problem from the self-audit: Trace a zoning/permits problem: ```bash -/opt/homebrew/bin/python3.11 -m worker_plan_internal.flaw_tracer \ +/opt/homebrew/bin/python3.11 -m worker_plan_internal.rca \ --dir /path/to/output/20251016_minecraft_escape \ --file 029-2-self_audit.md \ --flaw "Infeasible Constraints Rated MEDIUM because the plan mentions zoning and permits but lacks specifics for the Shanghai location." \ @@ -158,5 +150,5 @@ The tool implements the investigation strategy described in `docs/proposals/133- ```bash cd worker_plan -/opt/homebrew/bin/python3.11 -m pytest worker_plan_internal/flaw_tracer/tests/ -v +/opt/homebrew/bin/python3.11 -m pytest worker_plan_internal/rca/tests/ -v ``` diff --git a/worker_plan/worker_plan_internal/rca/__init__.py b/worker_plan/worker_plan_internal/rca/__init__.py new file mode 100644 index 000000000..adec8b493 --- /dev/null +++ b/worker_plan/worker_plan_internal/rca/__init__.py @@ -0,0 +1 @@ +"""RCA — Root-cause analysis for PlanExe reports.""" diff --git a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py b/worker_plan/worker_plan_internal/rca/__main__.py similarity index 91% rename from worker_plan/worker_plan_internal/flaw_tracer/__main__.py rename to worker_plan/worker_plan_internal/rca/__main__.py index e9fadccff..c40eb73ae 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/__main__.py +++ b/worker_plan/worker_plan_internal/rca/__main__.py @@ -1,8 +1,8 @@ -# worker_plan/worker_plan_internal/flaw_tracer/__main__.py -"""CLI entry point for the flaw tracer. +# worker_plan/worker_plan_internal/rca/__main__.py +"""CLI entry point for RCA (root cause analysis). Usage: - python -m worker_plan_internal.flaw_tracer \ + python -m worker_plan_internal.rca \ --dir /path/to/output \ --file 030-report.html \ --flaw "The budget appears unvalidated..." 
\ @@ -14,8 +14,8 @@ import sys from pathlib import Path -from worker_plan_internal.flaw_tracer.tracer import FlawTracer -from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report +from worker_plan_internal.rca.tracer import RootCauseAnalyzer +from worker_plan_internal.rca.output import write_json_report, write_markdown_report from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelFromName, RetryConfig from worker_plan_internal.llm_factory import get_llm_names_by_priority @@ -78,7 +78,7 @@ def main() -> None: events_path = report_dir / "events.jsonl" - tracer = FlawTracer( + tracer = RootCauseAnalyzer( output_dir=output_dir, llm_executor=executor, max_depth=args.max_depth, diff --git a/worker_plan/worker_plan_internal/flaw_tracer/output.py b/worker_plan/worker_plan_internal/rca/output.py similarity index 97% rename from worker_plan/worker_plan_internal/flaw_tracer/output.py rename to worker_plan/worker_plan_internal/rca/output.py index dfec61856..cf8d78826 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/output.py +++ b/worker_plan/worker_plan_internal/rca/output.py @@ -1,10 +1,10 @@ -# worker_plan/worker_plan_internal/flaw_tracer/output.py +# worker_plan/worker_plan_internal/rca/output.py """JSON and markdown report generation for flaw trace results.""" import json from datetime import datetime, UTC from pathlib import Path -from worker_plan_internal.flaw_tracer.tracer import FlawTraceResult +from worker_plan_internal.rca.tracer import FlawTraceResult def write_json_report(result: FlawTraceResult, output_path: Path) -> None: diff --git a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py b/worker_plan/worker_plan_internal/rca/prompts.py similarity index 98% rename from worker_plan/worker_plan_internal/flaw_tracer/prompts.py rename to worker_plan/worker_plan_internal/rca/prompts.py index b510970e8..89a79444e 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/prompts.py +++ 
b/worker_plan/worker_plan_internal/rca/prompts.py @@ -1,5 +1,5 @@ -# worker_plan/worker_plan_internal/flaw_tracer/prompts.py -"""Pydantic models and prompt builders for the flaw tracer.""" +# worker_plan/worker_plan_internal/rca/prompts.py +"""Pydantic models and prompt builders for RCA.""" from typing import Literal from pydantic import BaseModel, Field from llama_index.core.llms import ChatMessage, MessageRole diff --git a/worker_plan/worker_plan_internal/flaw_tracer/registry.py b/worker_plan/worker_plan_internal/rca/registry.py similarity index 95% rename from worker_plan/worker_plan_internal/flaw_tracer/registry.py rename to worker_plan/worker_plan_internal/rca/registry.py index a0c990b1b..3db157087 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/registry.py +++ b/worker_plan/worker_plan_internal/rca/registry.py @@ -1,5 +1,5 @@ -# worker_plan/worker_plan_internal/flaw_tracer/registry.py -"""DAG registry for the flaw tracer, built from Luigi task introspection. +# worker_plan/worker_plan_internal/rca/registry.py +"""DAG registry for RCA, built from Luigi task introspection. Replaces the former hand-maintained static registry with data extracted from the actual pipeline via extract_dag. 
The public API is unchanged: diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py b/worker_plan/worker_plan_internal/rca/tests/__init__.py similarity index 100% rename from worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py rename to worker_plan/worker_plan_internal/rca/tests/__init__.py diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py b/worker_plan/worker_plan_internal/rca/tests/test_output.py similarity index 96% rename from worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py rename to worker_plan/worker_plan_internal/rca/tests/test_output.py index 00e326bea..aaaeb8b7e 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_output.py @@ -1,16 +1,16 @@ -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +# worker_plan/worker_plan_internal/rca/tests/test_output.py import json import unittest from pathlib import Path from tempfile import TemporaryDirectory -from worker_plan_internal.flaw_tracer.tracer import ( +from worker_plan_internal.rca.tracer import ( FlawTraceResult, TracedFlaw, TraceEntry, OriginInfo, ) -from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report +from worker_plan_internal.rca.output import write_json_report, write_markdown_report def _make_sample_result() -> FlawTraceResult: diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py b/worker_plan/worker_plan_internal/rca/tests/test_prompts.py similarity index 97% rename from worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py rename to worker_plan/worker_plan_internal/rca/tests/test_prompts.py index 9770ad894..f9b606282 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_prompts.py @@ -1,7 +1,7 @@ -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +# 
worker_plan/worker_plan_internal/rca/tests/test_prompts.py import unittest from llama_index.core.llms import ChatMessage, MessageRole -from worker_plan_internal.flaw_tracer.prompts import ( +from worker_plan_internal.rca.prompts import ( IdentifiedFlaw, FlawIdentificationResult, UpstreamCheckResult, diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py b/worker_plan/worker_plan_internal/rca/tests/test_registry.py similarity index 97% rename from worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py rename to worker_plan/worker_plan_internal/rca/tests/test_registry.py index 48072280d..1ca79cf4a 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_registry.py @@ -1,8 +1,8 @@ -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +# worker_plan/worker_plan_internal/rca/tests/test_registry.py import unittest from pathlib import Path from tempfile import TemporaryDirectory -from worker_plan_internal.flaw_tracer.registry import ( +from worker_plan_internal.rca.registry import ( NodeInfo, NODES, find_node_by_filename, diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py b/worker_plan/worker_plan_internal/rca/tests/test_tracer.py similarity index 96% rename from worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py rename to worker_plan/worker_plan_internal/rca/tests/test_tracer.py index d4e89a40c..b597b6725 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_tracer.py @@ -1,4 +1,4 @@ -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +# worker_plan/worker_plan_internal/rca/tests/test_tracer.py """Tests for the flaw tracer recursive algorithm. 
Since ResponseMockLLM does NOT support as_structured_llm(), we mock the three @@ -11,14 +11,14 @@ from tempfile import TemporaryDirectory from unittest.mock import patch -from worker_plan_internal.flaw_tracer.tracer import ( - FlawTracer, +from worker_plan_internal.rca.tracer import ( + RootCauseAnalyzer, FlawTraceResult, TracedFlaw, TraceEntry, OriginInfo, ) -from worker_plan_internal.flaw_tracer.prompts import ( +from worker_plan_internal.rca.prompts import ( FlawIdentificationResult, IdentifiedFlaw, UpstreamCheckResult, @@ -34,10 +34,10 @@ def _make_executor() -> LLMExecutor: return LLMExecutor(llm_models=llm_models) -def _make_tracer(output_dir: Path, max_depth: int = 15, verbose: bool = False) -> FlawTracer: - """Create a FlawTracer with a dummy executor.""" +def _make_tracer(output_dir: Path, max_depth: int = 15, verbose: bool = False) -> RootCauseAnalyzer: + """Create a RootCauseAnalyzer with a dummy executor.""" executor = _make_executor() - return FlawTracer( + return RootCauseAnalyzer( output_dir=output_dir, llm_executor=executor, max_depth=max_depth, @@ -92,7 +92,7 @@ def test_defaults(self): self.assertTrue(flaw.trace_complete) -class TestFlawTracerPhase1(unittest.TestCase): +class TestRootCauseAnalyzerPhase1(unittest.TestCase): """Test flaw identification (Phase 1) with mocked LLM methods.""" def test_identify_flaws_returns_flaws(self): @@ -134,7 +134,7 @@ def test_file_not_found_raises(self): tracer.trace("nonexistent.md", "test") -class TestFlawTracerUpstreamTrace(unittest.TestCase): +class TestRootCauseAnalyzerUpstreamTrace(unittest.TestCase): """Test upstream tracing (Phase 2) with a simple two-level chain.""" def test_traces_flaw_upstream(self): @@ -240,7 +240,7 @@ def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content f"Dedup failed: checked {checked_stages}") -class TestFlawTracerMaxDepth(unittest.TestCase): +class TestRootCauseAnalyzerMaxDepth(unittest.TestCase): def test_respects_max_depth_zero(self): """With 
max_depth=0, no upstream tracing happens.""" with TemporaryDirectory() as d: @@ -299,7 +299,7 @@ def always_found(flaw_desc, evidence, upstream_filename, upstream_content): self.assertFalse(flaw.trace_complete) -class TestFlawTracerSourceCodeAnalysis(unittest.TestCase): +class TestRootCauseAnalyzerSourceCodeAnalysis(unittest.TestCase): """Test that Phase 3 source code analysis is invoked at the origin node.""" def test_source_code_analysis_called_at_origin(self): @@ -365,7 +365,7 @@ def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content self.assertEqual(args[0][1], "project_plan") -class TestFlawTracerMultipleFlaws(unittest.TestCase): +class TestRootCauseAnalyzerMultipleFlaws(unittest.TestCase): """Test that multiple flaws are traced independently.""" def test_traces_multiple_flaws(self): @@ -395,7 +395,7 @@ def test_traces_multiple_flaws(self): self.assertEqual(len(ids), len(set(ids))) -class TestFlawTracerSortsByDepth(unittest.TestCase): +class TestRootCauseAnalyzerSortsByDepth(unittest.TestCase): """Test that results are sorted by depth (deepest origin first).""" def test_flaws_sorted_by_depth_descending(self): diff --git a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py b/worker_plan/worker_plan_internal/rca/tracer.py similarity index 98% rename from worker_plan/worker_plan_internal/flaw_tracer/tracer.py rename to worker_plan/worker_plan_internal/rca/tracer.py index f1daf1465..28c6aaacb 100644 --- a/worker_plan/worker_plan_internal/flaw_tracer/tracer.py +++ b/worker_plan/worker_plan_internal/rca/tracer.py @@ -1,4 +1,4 @@ -# worker_plan/worker_plan_internal/flaw_tracer/tracer.py +# worker_plan/worker_plan_internal/rca/tracer.py """Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" from __future__ import annotations @@ -11,12 +11,12 @@ from llama_index.core.llms.llm import LLM -from worker_plan_internal.flaw_tracer.registry import ( +from worker_plan_internal.rca.registry import ( find_node_by_filename, 
get_upstream_files, get_source_code_paths, ) -from worker_plan_internal.flaw_tracer.prompts import ( +from worker_plan_internal.rca.prompts import ( FlawIdentificationResult, UpstreamCheckResult, SourceCodeAnalysisResult, @@ -98,7 +98,7 @@ def log(self, event_type: str, **data: object) -> None: f.write(json.dumps(entry, ensure_ascii=False) + "\n") -class FlawTracer: +class RootCauseAnalyzer: """Traces flaws upstream through the PlanExe pipeline DAG.""" def __init__( From 109b7abfc415a982ca7133e74fa1ed9e664d61dd Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 15:23:05 +0200 Subject: [PATCH 34/37] refactor: rename --flaw to --problem and output files to root_cause_analysis Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/rca/README.md | 14 +++++++------- worker_plan/worker_plan_internal/rca/__main__.py | 14 +++++++------- .../worker_plan_internal/rca/tests/test_output.py | 14 +++++++------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/worker_plan/worker_plan_internal/rca/README.md b/worker_plan/worker_plan_internal/rca/README.md index 9b5986cfa..4f303b763 100644 --- a/worker_plan/worker_plan_internal/rca/README.md +++ b/worker_plan/worker_plan_internal/rca/README.md @@ -17,7 +17,7 @@ The tool performs a recursive depth-first search: - **Domain complexity** — the topic is inherently uncertain or contentious, no prompt change resolves it - **Missing input** — the user's plan prompt didn't provide enough detail -Output is a JSON file (`flaw_trace.json`), a markdown report (`flaw_trace.md`), and a live event log (`events.jsonl`), sorted by trace depth so the deepest root cause appears first. +Output is a JSON file (`root_cause_analysis.json`), a markdown report (`root_cause_analysis.md`), and a live event log (`events.jsonl`), sorted by trace depth so the deepest root cause appears first. 
## DAG integration @@ -48,7 +48,7 @@ Basic usage: /opt/homebrew/bin/python3.11 -m worker_plan_internal.rca \ --dir /path/to/output \ --file 030-report.html \ - --flaw "Description of the problem you observed" \ + --problem "Description of the problem you observed" \ --verbose ``` @@ -58,7 +58,7 @@ Basic usage: |----------|----------|-------------| | `--dir` | Yes | Path to the output directory containing intermediary artifacts | | `--file` | Yes | Starting artifact to analyze (relative to `--dir`) | -| `--flaw` | Yes | Text description of the observed problem(s) | +| `--problem` | Yes | Text description of the observed problem(s) | | `--output-dir` | No | Where to write reports (defaults to `--dir`) | | `--max-depth` | No | Maximum upstream hops per problem (default: 15) | | `--verbose` | No | Print each LLM call to stderr as the trace runs | @@ -83,7 +83,7 @@ Trace a problem from the self-audit: /opt/homebrew/bin/python3.11 -m worker_plan_internal.rca \ --dir /path/to/output/20250101_india_census \ --file 029-2-self_audit.md \ - --flaw "No Real-World Proof. The plan combines a digital census with caste enumeration at an unprecedented scale, lacking independent evidence of success." \ + --problem "No Real-World Proof. The plan combines a digital census with caste enumeration at an unprecedented scale, lacking independent evidence of success." \ --output-dir /tmp/rca-analysis \ --verbose ``` @@ -94,7 +94,7 @@ Trace a zoning/permits problem: /opt/homebrew/bin/python3.11 -m worker_plan_internal.rca \ --dir /path/to/output/20251016_minecraft_escape \ --file 029-2-self_audit.md \ - --flaw "Infeasible Constraints Rated MEDIUM because the plan mentions zoning and permits but lacks specifics for the Shanghai location." \ + --problem "Infeasible Constraints Rated MEDIUM because the plan mentions zoning and permits but lacks specifics for the Shanghai location." 
\ --output-dir /tmp/rca-analysis2 \ --verbose ``` @@ -111,8 +111,8 @@ tail -f /tmp/rca-analysis/events.jsonl Each run produces three files in `--output-dir` (or `--dir` if not specified): -- `flaw_trace.json` — machine-readable trace with full details -- `flaw_trace.md` — human-readable report with trace tables +- `root_cause_analysis.json` — machine-readable trace with full details +- `root_cause_analysis.md` — human-readable report with trace tables - `events.jsonl` — live event log for monitoring progress Problems are sorted by trace depth (deepest root cause first). Each problem's origin includes a **category** (`prompt_fixable`, `domain_complexity`, or `missing_input`) so you know whether the fix is a prompt edit, a domain limitation to accept, or a need for more detail in the plan input. diff --git a/worker_plan/worker_plan_internal/rca/__main__.py b/worker_plan/worker_plan_internal/rca/__main__.py index c40eb73ae..d5ba79294 100644 --- a/worker_plan/worker_plan_internal/rca/__main__.py +++ b/worker_plan/worker_plan_internal/rca/__main__.py @@ -5,7 +5,7 @@ python -m worker_plan_internal.rca \ --dir /path/to/output \ --file 030-report.html \ - --flaw "The budget appears unvalidated..." \ + --problem "The budget appears unvalidated..." 
\ --output-dir /path/to/output \ --max-depth 15 \ --verbose @@ -33,12 +33,12 @@ def main() -> None: help="Starting file to analyze (relative to --dir)", ) parser.add_argument( - "--flaw", required=True, - help="Text description of the observed flaw(s)", + "--problem", required=True, + help="Text description of the observed problem(s)", ) parser.add_argument( "--output-dir", type=Path, default=None, - help="Where to write flaw_trace.json and flaw_trace.md (defaults to --dir)", + help="Where to write root_cause_analysis.json and root_cause_analysis.md (defaults to --dir)", ) parser.add_argument( "--max-depth", type=int, default=15, @@ -87,11 +87,11 @@ def main() -> None: ) print(f"Tracing flaws in {starting_file}...", file=sys.stderr) - result = tracer.trace(starting_file, args.flaw) + result = tracer.trace(starting_file, args.problem) # Write reports - json_path = report_dir / "flaw_trace.json" - md_path = report_dir / "flaw_trace.md" + json_path = report_dir / "root_cause_analysis.json" + md_path = report_dir / "root_cause_analysis.md" write_json_report(result, json_path) write_markdown_report(result, md_path) diff --git a/worker_plan/worker_plan_internal/rca/tests/test_output.py b/worker_plan/worker_plan_internal/rca/tests/test_output.py index aaaeb8b7e..0840df3d0 100644 --- a/worker_plan/worker_plan_internal/rca/tests/test_output.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_output.py @@ -60,7 +60,7 @@ def _make_sample_result() -> FlawTraceResult: class TestWriteJsonReport(unittest.TestCase): def test_writes_valid_json(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.json" + output_path = Path(d) / "root_cause_analysis.json" result = _make_sample_result() write_json_report(result, output_path) @@ -72,7 +72,7 @@ def test_writes_valid_json(self): def test_json_contains_correct_summary(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.json" + output_path = Path(d) / "root_cause_analysis.json" result = 
_make_sample_result() write_json_report(result, output_path) @@ -85,7 +85,7 @@ def test_json_contains_correct_summary(self): def test_json_flaws_sorted_by_depth(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.json" + output_path = Path(d) / "root_cause_analysis.json" result = _make_sample_result() write_json_report(result, output_path) @@ -97,7 +97,7 @@ def test_json_flaws_sorted_by_depth(self): class TestWriteMarkdownReport(unittest.TestCase): def test_writes_markdown_file(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() write_markdown_report(result, output_path) @@ -107,7 +107,7 @@ def test_writes_markdown_file(self): def test_markdown_contains_flaw_details(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() write_markdown_report(result, output_path) @@ -118,7 +118,7 @@ def test_markdown_contains_flaw_details(self): def test_markdown_contains_trace_table(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() write_markdown_report(result, output_path) @@ -128,7 +128,7 @@ def test_markdown_contains_trace_table(self): def test_empty_result_produces_valid_markdown(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = FlawTraceResult( starting_file="030-report.html", flaw_description="test", From 411629b612259e4194d5b7193fbbbed5596a8ffe Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 15:31:36 +0200 Subject: [PATCH 35/37] refactor: replace "flaw" terminology with "problem" throughout rca module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames classes 
(TracedFlaw→TracedProblem, FlawTraceResult→RCAResult, IdentifiedFlaw→IdentifiedProblem, FlawIdentificationResult→ ProblemIdentificationResult), fields, methods, JSON keys, LLM prompts, and markdown report output. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../worker_plan_internal/rca/AGENTS.md | 2 +- .../worker_plan_internal/rca/__main__.py | 12 +- .../worker_plan_internal/rca/output.py | 94 ++++---- .../worker_plan_internal/rca/prompts.py | 78 +++---- .../rca/tests/test_output.py | 42 ++-- .../rca/tests/test_prompts.py | 44 ++-- .../rca/tests/test_tracer.py | 212 +++++++++--------- .../worker_plan_internal/rca/tracer.py | 130 +++++------ 8 files changed, 307 insertions(+), 307 deletions(-) diff --git a/worker_plan/worker_plan_internal/rca/AGENTS.md b/worker_plan/worker_plan_internal/rca/AGENTS.md index de8d0df32..f7ec4f645 100644 --- a/worker_plan/worker_plan_internal/rca/AGENTS.md +++ b/worker_plan/worker_plan_internal/rca/AGENTS.md @@ -70,7 +70,7 @@ The tool traces at the artifact level (which file introduced the problem) but ca 2. **Minecraft escape v1** (`20251016_minecraft_escape`): Old prompts. Problem about zoning/permits. 5 problems, 43 LLM calls. User's problem not identified. Exposed Phase 1 anchoring problem. -3. **Minecraft escape v2** (`20251016_minecraft_escape`): New prompts. 3 problems, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's problem correctly identified as flaw_001. All problems in same family (regulatory gaps). +3. **Minecraft escape v2** (`20251016_minecraft_escape`): New prompts. 3 problems, 31 LLM calls, deepest origin: `identify_risks` (depth 5). User's problem correctly identified as problem_001. All problems in same family (regulatory gaps). 4. **India census v2** (`20250101_india_census`): New prompts. 2 problems (down from 17), 17 LLM calls (down from 153), deepest origin: `potential_levers` (depth 6). User's problem correctly identified. Exposed Phase 3 "always blames prompt" limitation. 
diff --git a/worker_plan/worker_plan_internal/rca/__main__.py b/worker_plan/worker_plan_internal/rca/__main__.py index d5ba79294..aea03cf94 100644 --- a/worker_plan/worker_plan_internal/rca/__main__.py +++ b/worker_plan/worker_plan_internal/rca/__main__.py @@ -22,7 +22,7 @@ def main() -> None: parser = argparse.ArgumentParser( - description="Trace flaws in PlanExe reports upstream to their root cause.", + description="Trace problems in PlanExe reports upstream to their root cause.", ) parser.add_argument( "--dir", required=True, type=Path, @@ -42,7 +42,7 @@ def main() -> None: ) parser.add_argument( "--max-depth", type=int, default=15, - help="Maximum upstream hops per flaw (default: 15)", + help="Maximum upstream hops per problem (default: 15)", ) parser.add_argument( "--verbose", action="store_true", @@ -86,7 +86,7 @@ def main() -> None: events_path=events_path, ) - print(f"Tracing flaws in {starting_file}...", file=sys.stderr) + print(f"Tracing problems in {starting_file}...", file=sys.stderr) result = tracer.trace(starting_file, args.problem) # Write reports @@ -96,9 +96,9 @@ def main() -> None: write_markdown_report(result, md_path) # Print summary - print(f"\nFlaws found: {len(result.flaws)}", file=sys.stderr) - if result.flaws: - deepest = max(result.flaws, key=lambda f: f.depth) + print(f"\nProblems found: {len(result.problems)}", file=sys.stderr) + if result.problems: + deepest = max(result.problems, key=lambda p: p.depth) print(f"Deepest origin: {deepest.origin_node} (depth {deepest.depth})", file=sys.stderr) print(f"LLM calls made: {result.llm_calls_made}", file=sys.stderr) print(f"\nReports written:", file=sys.stderr) diff --git a/worker_plan/worker_plan_internal/rca/output.py b/worker_plan/worker_plan_internal/rca/output.py index cf8d78826..7f3c2dbfa 100644 --- a/worker_plan/worker_plan_internal/rca/output.py +++ b/worker_plan/worker_plan_internal/rca/output.py @@ -1,24 +1,24 @@ # worker_plan/worker_plan_internal/rca/output.py -"""JSON and markdown 
report generation for flaw trace results.""" +"""JSON and markdown report generation for root cause analysis results.""" import json from datetime import datetime, UTC from pathlib import Path -from worker_plan_internal.rca.tracer import FlawTraceResult +from worker_plan_internal.rca.tracer import RCAResult -def write_json_report(result: FlawTraceResult, output_path: Path) -> None: - """Write the flaw trace result as a JSON file.""" +def write_json_report(result: RCAResult, output_path: Path) -> None: + """Write the RCA result as a JSON file.""" data = { "input": { "starting_file": result.starting_file, - "flaw_description": result.flaw_description, + "problem_description": result.problem_description, "output_dir": result.output_dir, "timestamp": datetime.now(UTC).isoformat(), }, - "flaws": [], + "problems": [], "summary": { - "total_flaws": len(result.flaws), + "total_problems": len(result.problems), "deepest_origin_node": None, "deepest_origin_depth": 0, "llm_calls_made": result.llm_calls_made, @@ -28,12 +28,12 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: max_depth = 0 deepest_node = None - for flaw in result.flaws: - flaw_data = { - "id": flaw.id, - "description": flaw.description, - "severity": flaw.severity, - "starting_evidence": flaw.starting_evidence, + for problem in result.problems: + problem_data = { + "id": problem.id, + "description": problem.description, + "severity": problem.severity, + "starting_evidence": problem.starting_evidence, "trace": [ { "node": entry.node, @@ -41,96 +41,96 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: "evidence": entry.evidence, "is_origin": entry.is_origin, } - for entry in flaw.trace + for entry in problem.trace ], "origin": None, - "depth": flaw.depth, - "trace_complete": flaw.trace_complete, + "depth": problem.depth, + "trace_complete": problem.trace_complete, } - if flaw.origin: - flaw_data["origin"] = { - "node": flaw.origin.node, - "file": 
flaw.origin.file, - "source_code_files": flaw.origin.source_code_files, - "category": flaw.origin.category, - "likely_cause": flaw.origin.likely_cause, - "suggestion": flaw.origin.suggestion, + if problem.origin: + problem_data["origin"] = { + "node": problem.origin.node, + "file": problem.origin.file, + "source_code_files": problem.origin.source_code_files, + "category": problem.origin.category, + "likely_cause": problem.origin.likely_cause, + "suggestion": problem.origin.suggestion, } - if flaw.depth > max_depth: - max_depth = flaw.depth - deepest_node = flaw.origin_node + if problem.depth > max_depth: + max_depth = problem.depth + deepest_node = problem.origin_node - data["flaws"].append(flaw_data) + data["problems"].append(problem_data) - data["flaws"].sort(key=lambda f: f["depth"], reverse=True) + data["problems"].sort(key=lambda p: p["depth"], reverse=True) data["summary"]["deepest_origin_node"] = deepest_node data["summary"]["deepest_origin_depth"] = max_depth output_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") -def write_markdown_report(result: FlawTraceResult, output_path: Path) -> None: - """Write the flaw trace result as a markdown report.""" +def write_markdown_report(result: RCAResult, output_path: Path) -> None: + """Write the RCA result as a markdown report.""" lines: list[str] = [] - lines.append("# Flaw Trace Report") + lines.append("# Root Cause Analysis Report") lines.append("") lines.append(f"**Input:** {result.starting_file}") - lines.append(f"**Flaws found:** {len(result.flaws)}") + lines.append(f"**Problems found:** {len(result.problems)}") - if result.flaws: - deepest = max(result.flaws, key=lambda f: f.depth) + if result.problems: + deepest = max(result.problems, key=lambda p: p.depth) lines.append(f"**Deepest origin:** {deepest.origin_node} (depth {deepest.depth})") lines.append(f"**LLM calls:** {result.llm_calls_made}") lines.append("") - sorted_flaws = sorted(result.flaws, key=lambda f: f.depth, 
reverse=True) - for flaw in sorted_flaws: + sorted_problems = sorted(result.problems, key=lambda p: p.depth, reverse=True) + for problem in sorted_problems: lines.append("---") lines.append("") - lines.append(f"## {flaw.id.replace('_', ' ').title()} ({flaw.severity}): {flaw.description}") + lines.append(f"## {problem.id.replace('_', ' ').title()} ({problem.severity}): {problem.description}") lines.append("") # Trace chain summary - node_names = [entry.node for entry in flaw.trace] + node_names = [entry.node for entry in problem.trace] chain_parts = [] for name in node_names: - if name == flaw.origin_node: + if name == problem.origin_node: chain_parts.append(f"**{name}** (origin)") else: chain_parts.append(name) lines.append(f"**Trace:** {' -> '.join(chain_parts)}") lines.append("") - if not flaw.trace_complete: + if not problem.trace_complete: lines.append("*Note: trace incomplete — max depth reached.*") lines.append("") # Trace table lines.append("| Node | File | Evidence |") lines.append("|-------|------|----------|") - for entry in flaw.trace: + for entry in problem.trace: node_cell = f"**{entry.node}**" if entry.is_origin else entry.node evidence_cell = _escape_table_cell(entry.evidence) lines.append(f"| {node_cell} | {entry.file} | {evidence_cell} |") lines.append("") # Origin analysis - if flaw.origin: + if problem.origin: category_labels = { "prompt_fixable": "Prompt fixable", "domain_complexity": "Domain complexity", "missing_input": "Missing input", } - category_label = category_labels.get(flaw.origin.category, flaw.origin.category) + category_label = category_labels.get(problem.origin.category, problem.origin.category) lines.append(f"**Category:** {category_label}") lines.append("") - lines.append(f"**Root cause:** {flaw.origin.likely_cause}") + lines.append(f"**Root cause:** {problem.origin.likely_cause}") lines.append("") - lines.append(f"**Source files:** {', '.join(flaw.origin.source_code_files)}") + lines.append(f"**Source files:** {', 
'.join(problem.origin.source_code_files)}") lines.append("") - lines.append(f"**Suggestion:** {flaw.origin.suggestion}") + lines.append(f"**Suggestion:** {problem.origin.suggestion}") lines.append("") output_path.write_text("\n".join(lines), encoding="utf-8") diff --git a/worker_plan/worker_plan_internal/rca/prompts.py b/worker_plan/worker_plan_internal/rca/prompts.py index 89a79444e..93173dc6b 100644 --- a/worker_plan/worker_plan_internal/rca/prompts.py +++ b/worker_plan/worker_plan_internal/rca/prompts.py @@ -7,29 +7,29 @@ # -- Pydantic models for structured LLM output -------------------------------- -class IdentifiedFlaw(BaseModel): - """A discrete flaw found in a pipeline output file.""" - description: str = Field(description="One-sentence description of the flaw") - evidence: str = Field(description="Direct quote from the file demonstrating the flaw") +class IdentifiedProblem(BaseModel): + """A discrete problem found in a pipeline output file.""" + description: str = Field(description="One-sentence description of the problem") + evidence: str = Field(description="Direct quote from the file demonstrating the problem") severity: Literal["HIGH", "MEDIUM", "LOW"] = Field( description="HIGH: fabricated data or missing critical analysis. MEDIUM: weak reasoning or vague claims. LOW: minor gaps." 
) -class FlawIdentificationResult(BaseModel): - """Result of analyzing a file for flaws.""" - flaws: list[IdentifiedFlaw] = Field(description="List of discrete flaws found in the file") +class ProblemIdentificationResult(BaseModel): + """Result of analyzing a file for problems.""" + problems: list[IdentifiedProblem] = Field(description="List of discrete problems found in the file") class UpstreamCheckResult(BaseModel): - """Result of checking an upstream file for a flaw precursor.""" - found: bool = Field(description="True if this file contains the flaw or a precursor to it") + """Result of checking an upstream file for a problem precursor.""" + found: bool = Field(description="True if this file contains the problem or a precursor to it") evidence: str | None = Field(description="Direct quote from the file if found, null otherwise") - explanation: str = Field(description="How this connects to the downstream flaw, or why this file is clean") + explanation: str = Field(description="How this connects to the downstream problem, or why this file is clean") class SourceCodeAnalysisResult(BaseModel): - """Result of analyzing source code at a flaw's origin node.""" + """Result of analyzing source code at a problem's origin node.""" category: Literal["prompt_fixable", "domain_complexity", "missing_input"] = Field( description=( "prompt_fixable: the prompt forgot to ask for something or has a gap that can be fixed by editing the prompt. " @@ -37,38 +37,38 @@ class SourceCodeAnalysisResult(BaseModel): "missing_input: the user's plan prompt didn't provide enough context for the pipeline to work with." 
) ) - likely_cause: str = Field(description="What in the prompt, logic, or domain caused the flaw") + likely_cause: str = Field(description="What in the prompt, logic, or domain caused the problem") relevant_code_section: str = Field(description="The specific code or prompt text responsible") - suggestion: str = Field(description="How to fix or prevent this flaw") + suggestion: str = Field(description="How to fix or prevent this problem") # -- Prompt builders ----------------------------------------------------------- -def build_flaw_identification_messages( +def build_problem_identification_messages( filename: str, file_content: str, - user_flaw_description: str, + user_problem_description: str, ) -> list[ChatMessage]: - """Build messages for Phase 1: identifying discrete flaws in a file.""" + """Build messages for Phase 1: identifying discrete problems in a file.""" system = ( "You are analyzing an intermediary file from a project planning pipeline.\n" - "The user has described a specific flaw they observed. Your job:\n\n" - "1. FIRST, locate the user's specific flaw in the file. Find the passage that " - "corresponds to what the user described. This flaw MUST be the first item in your list.\n" - "2. THEN, identify any additional discrete flaws that are closely related to the " - "user's concern (e.g., other instances of the same problem pattern, or flaws that " - "share the same root cause). Do NOT list every possible flaw in the file — only " + "The user has described a specific problem they observed. Your job:\n\n" + "1. FIRST, locate the user's specific problem in the file. Find the passage that " + "corresponds to what the user described. This problem MUST be the first item in your list.\n" + "2. THEN, identify any additional discrete problems that are closely related to the " + "user's concern (e.g., other instances of the same problem pattern, or problems that " + "share the same root cause). 
Do NOT list every possible problem in the file — only " "those connected to what the user raised.\n\n" - "For each flaw, provide a short description (one sentence), a direct quote " + "For each problem, provide a short description (one sentence), a direct quote " "from the file as evidence (keep quotes under 200 characters), and a severity level.\n" - "Only identify real flaws — do not flag stylistic preferences or minor formatting issues.\n" + "Only identify real problems — do not flag stylistic preferences or minor formatting issues.\n" "Severity levels:\n" "- HIGH: fabricated data, invented statistics, or missing critical analysis\n" "- MEDIUM: weak reasoning, vague unsupported claims, or shallow treatment\n" "- LOW: minor gaps that don't significantly impact the plan" ) user = ( - f"User's flaw description:\n{user_flaw_description}\n\n" + f"User's problem description:\n{user_problem_description}\n\n" f"Filename: {filename}\n" f"File content:\n{file_content}" ) @@ -79,26 +79,26 @@ def build_flaw_identification_messages( def build_upstream_check_messages( - flaw_description: str, + problem_description: str, evidence_quote: str, upstream_filename: str, upstream_file_content: str, ) -> list[ChatMessage]: - """Build messages for Phase 2: checking if a flaw exists in an upstream file.""" + """Build messages for Phase 2: checking if a problem exists in an upstream file.""" system = ( - "You are tracing a flaw through a project planning pipeline to find where it originated.\n" - "A downstream file contains a flaw. You are examining an upstream file that was an input " - "to the node that produced the flawed output.\n\n" - "Determine if this upstream file CAUSED or CONTRIBUTED to the downstream flaw.\n" + "You are tracing a problem through a project planning pipeline to find where it originated.\n" + "A downstream file contains a problem. 
You are examining an upstream file that was an input " + "to the node that produced the problematic output.\n\n" + "Determine if this upstream file CAUSED or CONTRIBUTED to the downstream problem.\n" "This means the upstream file contains content that was carried forward, transformed, " - "or amplified into the downstream flaw. Merely discussing a related topic is NOT enough.\n\n" + "or amplified into the downstream problem. Merely discussing a related topic is NOT enough.\n\n" "If YES: quote the specific sentence or phrase (under 200 characters) and explain " - "the causal mechanism — how this upstream content led to the downstream flaw.\n" - "If NO: explain why this file is clean regarding this specific flaw.\n\n" + "the causal mechanism — how this upstream content led to the downstream problem.\n" + "If NO: explain why this file is clean regarding this specific problem.\n\n" "Be strict. Only say YES if you can identify a clear causal link, not just topical overlap." ) user = ( - f"Flaw: {flaw_description}\n" + f"Problem: {problem_description}\n" f"Evidence from downstream: {evidence_quote}\n\n" f"Upstream filename: {upstream_filename}\n" f"Upstream file content:\n{upstream_file_content}" @@ -110,17 +110,17 @@ def build_upstream_check_messages( def build_source_code_analysis_messages( - flaw_description: str, + problem_description: str, evidence_quote: str, source_code_contents: list[tuple[str, str]], ) -> list[ChatMessage]: - """Build messages for Phase 3: analyzing source code at flaw origin. + """Build messages for Phase 3: analyzing source code at problem origin. Args: source_code_contents: list of (filename, content) tuples """ system = ( - "A flaw was introduced at this pipeline node. The flaw exists in its output " + "A problem was introduced at this pipeline node. 
The problem exists in its output " "but NOT in any of its inputs, so this node created it.\n\n" "First, classify the root cause into one of three categories:\n" "- prompt_fixable: The prompt has a gap or oversight that can be fixed by editing " @@ -142,7 +142,7 @@ def build_source_code_analysis_messages( source_text = "\n\n".join(source_sections) user = ( - f"Flaw: {flaw_description}\n" + f"Problem: {problem_description}\n" f"Evidence from output: {evidence_quote}\n\n" f"Source code files:\n{source_text}" ) diff --git a/worker_plan/worker_plan_internal/rca/tests/test_output.py b/worker_plan/worker_plan_internal/rca/tests/test_output.py index 0840df3d0..315b4a409 100644 --- a/worker_plan/worker_plan_internal/rca/tests/test_output.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_output.py @@ -5,23 +5,23 @@ from tempfile import TemporaryDirectory from worker_plan_internal.rca.tracer import ( - FlawTraceResult, - TracedFlaw, + RCAResult, + TracedProblem, TraceEntry, OriginInfo, ) from worker_plan_internal.rca.output import write_json_report, write_markdown_report -def _make_sample_result() -> FlawTraceResult: - """Create a sample FlawTraceResult for testing.""" - return FlawTraceResult( +def _make_sample_result() -> RCAResult: + """Create a sample RCAResult for testing.""" + return RCAResult( starting_file="025-2-executive_summary.md", - flaw_description="Budget is unvalidated", + problem_description="Budget is unvalidated", output_dir="/tmp/test_output", - flaws=[ - TracedFlaw( - id="flaw_001", + problems=[ + TracedProblem( + id="problem_001", description="Budget of CZK 500,000 is unvalidated", severity="HIGH", starting_evidence="CZK 500,000", @@ -41,8 +41,8 @@ def _make_sample_result() -> FlawTraceResult: ), depth=3, ), - TracedFlaw( - id="flaw_002", + TracedProblem( + id="problem_002", description="Missing market sizing", severity="MEDIUM", starting_evidence="growing Czech market", @@ -67,7 +67,7 @@ def test_writes_valid_json(self): 
self.assertTrue(output_path.exists()) data = json.loads(output_path.read_text(encoding="utf-8")) self.assertIn("input", data) - self.assertIn("flaws", data) + self.assertIn("problems", data) self.assertIn("summary", data) def test_json_contains_correct_summary(self): @@ -78,19 +78,19 @@ def test_json_contains_correct_summary(self): data = json.loads(output_path.read_text(encoding="utf-8")) summary = data["summary"] - self.assertEqual(summary["total_flaws"], 2) + self.assertEqual(summary["total_problems"], 2) self.assertEqual(summary["deepest_origin_node"], "make_assumptions") self.assertEqual(summary["deepest_origin_depth"], 3) self.assertEqual(summary["llm_calls_made"], 8) - def test_json_flaws_sorted_by_depth(self): + def test_json_problems_sorted_by_depth(self): with TemporaryDirectory() as d: output_path = Path(d) / "root_cause_analysis.json" result = _make_sample_result() write_json_report(result, output_path) data = json.loads(output_path.read_text(encoding="utf-8")) - depths = [f["depth"] for f in data["flaws"]] + depths = [f["depth"] for f in data["problems"]] self.assertEqual(depths, sorted(depths, reverse=True)) @@ -103,9 +103,9 @@ def test_writes_markdown_file(self): self.assertTrue(output_path.exists()) content = output_path.read_text(encoding="utf-8") - self.assertIn("# Flaw Trace Report", content) + self.assertIn("# Root Cause Analysis Report", content) - def test_markdown_contains_flaw_details(self): + def test_markdown_contains_problem_details(self): with TemporaryDirectory() as d: output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() @@ -129,14 +129,14 @@ def test_markdown_contains_trace_table(self): def test_empty_result_produces_valid_markdown(self): with TemporaryDirectory() as d: output_path = Path(d) / "root_cause_analysis.md" - result = FlawTraceResult( + result = RCAResult( starting_file="030-report.html", - flaw_description="test", + problem_description="test", output_dir="/tmp", - flaws=[], + problems=[], 
llm_calls_made=1, ) write_markdown_report(result, output_path) content = output_path.read_text(encoding="utf-8") - self.assertIn("Flaws found:** 0", content) + self.assertIn("Problems found:** 0", content) diff --git a/worker_plan/worker_plan_internal/rca/tests/test_prompts.py b/worker_plan/worker_plan_internal/rca/tests/test_prompts.py index f9b606282..45fb4fae8 100644 --- a/worker_plan/worker_plan_internal/rca/tests/test_prompts.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_prompts.py @@ -2,38 +2,38 @@ import unittest from llama_index.core.llms import ChatMessage, MessageRole from worker_plan_internal.rca.prompts import ( - IdentifiedFlaw, - FlawIdentificationResult, + IdentifiedProblem, + ProblemIdentificationResult, UpstreamCheckResult, SourceCodeAnalysisResult, - build_flaw_identification_messages, + build_problem_identification_messages, build_upstream_check_messages, build_source_code_analysis_messages, ) class TestPydanticModels(unittest.TestCase): - def test_identified_flaw_valid(self): - flaw = IdentifiedFlaw( + def test_identified_problem_valid(self): + problem = IdentifiedProblem( description="Budget figure is fabricated", evidence="The budget is CZK 500,000", severity="HIGH", ) - self.assertEqual(flaw.severity, "HIGH") + self.assertEqual(problem.severity, "HIGH") - def test_identified_flaw_rejects_invalid_severity(self): + def test_identified_problem_rejects_invalid_severity(self): with self.assertRaises(Exception): - IdentifiedFlaw( + IdentifiedProblem( description="test", evidence="test", severity="CRITICAL", ) - def test_flaw_identification_result(self): - result = FlawIdentificationResult(flaws=[ - IdentifiedFlaw(description="test", evidence="quote", severity="LOW"), + def test_problem_identification_result(self): + result = ProblemIdentificationResult(problems=[ + IdentifiedProblem(description="test", evidence="quote", severity="LOW"), ]) - self.assertEqual(len(result.flaws), 1) + self.assertEqual(len(result.problems), 1) def 
test_upstream_check_result_found(self): result = UpstreamCheckResult(found=True, evidence="quote", explanation="precursor") @@ -64,12 +64,12 @@ def test_source_code_analysis_rejects_invalid_category(self): ) -class TestBuildFlawIdentificationMessages(unittest.TestCase): +class TestBuildProblemIdentificationMessages(unittest.TestCase): def test_returns_chat_messages(self): - messages = build_flaw_identification_messages( + messages = build_problem_identification_messages( filename="030-report.html", file_content="report content", - user_flaw_description="budget is wrong", + user_problem_description="budget is wrong", ) self.assertIsInstance(messages, list) self.assertEqual(len(messages), 2) @@ -77,10 +77,10 @@ def test_returns_chat_messages(self): self.assertEqual(messages[1].role, MessageRole.USER) def test_user_message_contains_inputs(self): - messages = build_flaw_identification_messages( + messages = build_problem_identification_messages( filename="025-2-executive_summary.md", file_content="# Summary\nBudget: 500k", - user_flaw_description="fabricated budget", + user_problem_description="fabricated budget", ) user_content = messages[1].content self.assertIn("025-2-executive_summary.md", user_content) @@ -91,7 +91,7 @@ def test_user_message_contains_inputs(self): class TestBuildUpstreamCheckMessages(unittest.TestCase): def test_returns_chat_messages(self): messages = build_upstream_check_messages( - flaw_description="Budget is fabricated", + problem_description="Budget is fabricated", evidence_quote="CZK 500,000", upstream_filename="005-2-project_plan.md", upstream_file_content="# Project Plan\nBudget: 500k", @@ -99,9 +99,9 @@ def test_returns_chat_messages(self): self.assertIsInstance(messages, list) self.assertEqual(len(messages), 2) - def test_user_message_contains_flaw_and_upstream(self): + def test_user_message_contains_problem_and_upstream(self): messages = build_upstream_check_messages( - flaw_description="Missing market sizing", + 
problem_description="Missing market sizing", evidence_quote="growing Czech market", upstream_filename="003-5-make_assumptions.md", upstream_file_content="# Assumptions\nMarket is growing", @@ -115,7 +115,7 @@ def test_user_message_contains_flaw_and_upstream(self): class TestBuildSourceCodeAnalysisMessages(unittest.TestCase): def test_returns_chat_messages(self): messages = build_source_code_analysis_messages( - flaw_description="Budget fabricated", + problem_description="Budget fabricated", evidence_quote="CZK 500,000", source_code_contents=[ ("nodes/make_assumptions.py", "class MakeAssumptionsTask: ..."), @@ -127,7 +127,7 @@ def test_returns_chat_messages(self): def test_user_message_contains_source_code(self): messages = build_source_code_analysis_messages( - flaw_description="Missing analysis", + problem_description="Missing analysis", evidence_quote="no data", source_code_contents=[ ("my_stage.py", "SYSTEM_PROMPT = 'Generate assumptions'"), diff --git a/worker_plan/worker_plan_internal/rca/tests/test_tracer.py b/worker_plan/worker_plan_internal/rca/tests/test_tracer.py index b597b6725..aca22d407 100644 --- a/worker_plan/worker_plan_internal/rca/tests/test_tracer.py +++ b/worker_plan/worker_plan_internal/rca/tests/test_tracer.py @@ -1,8 +1,8 @@ # worker_plan/worker_plan_internal/rca/tests/test_tracer.py -"""Tests for the flaw tracer recursive algorithm. +"""Tests for the root cause analyzer recursive algorithm. Since ResponseMockLLM does NOT support as_structured_llm(), we mock the three -private LLM-calling methods (_identify_flaws, _check_upstream, +private LLM-calling methods (_identify_problems, _check_upstream, _analyze_source_code) directly. This tests the tracing logic — recursion, deduplication, max depth — which is the important part. 
""" @@ -13,14 +13,14 @@ from worker_plan_internal.rca.tracer import ( RootCauseAnalyzer, - FlawTraceResult, - TracedFlaw, + RCAResult, + TracedProblem, TraceEntry, OriginInfo, ) from worker_plan_internal.rca.prompts import ( - FlawIdentificationResult, - IdentifiedFlaw, + ProblemIdentificationResult, + IdentifiedProblem, UpstreamCheckResult, ) from worker_plan_internal.llm_util.response_mockllm import ResponseMockLLM @@ -45,58 +45,58 @@ def _make_tracer(output_dir: Path, max_depth: int = 15, verbose: bool = False) - ) -class TestFlawTraceResult(unittest.TestCase): +class TestRCAResult(unittest.TestCase): def test_dataclass_creation(self): - result = FlawTraceResult( + result = RCAResult( starting_file="030-report.html", - flaw_description="test", + problem_description="test", output_dir="/tmp/test", - flaws=[], + problems=[], llm_calls_made=0, ) self.assertEqual(result.starting_file, "030-report.html") - self.assertEqual(len(result.flaws), 0) + self.assertEqual(len(result.problems), 0) self.assertEqual(result.llm_calls_made, 0) - def test_dataclass_with_flaws(self): - flaw = TracedFlaw( - id="flaw_001", + def test_dataclass_with_problems(self): + problem = TracedProblem( + id="problem_001", description="Budget fabricated", severity="HIGH", starting_evidence="CZK 500,000", trace=[TraceEntry(node="test", file="test.md", evidence="ev")], ) - result = FlawTraceResult( + result = RCAResult( starting_file="test.md", - flaw_description="test", + problem_description="test", output_dir="/tmp/test", - flaws=[flaw], + problems=[problem], llm_calls_made=1, ) - self.assertEqual(len(result.flaws), 1) - self.assertEqual(result.flaws[0].severity, "HIGH") + self.assertEqual(len(result.problems), 1) + self.assertEqual(result.problems[0].severity, "HIGH") -class TestTracedFlaw(unittest.TestCase): +class TestTracedProblem(unittest.TestCase): def test_defaults(self): - flaw = TracedFlaw( - id="flaw_001", + problem = TracedProblem( + id="problem_001", description="test", severity="LOW", 
starting_evidence="ev", trace=[], ) - self.assertIsNone(flaw.origin_node) - self.assertIsNone(flaw.origin) - self.assertEqual(flaw.depth, 0) - self.assertTrue(flaw.trace_complete) + self.assertIsNone(problem.origin_node) + self.assertIsNone(problem.origin) + self.assertEqual(problem.depth, 0) + self.assertTrue(problem.trace_complete) class TestRootCauseAnalyzerPhase1(unittest.TestCase): - """Test flaw identification (Phase 1) with mocked LLM methods.""" + """Test problem identification (Phase 1) with mocked LLM methods.""" - def test_identify_flaws_returns_flaws(self): - """The tracer should produce TracedFlaw objects from Phase 1 identification.""" + def test_identify_problems(self): + """The analyzer should produce TracedProblem objects from Phase 1 identification.""" with TemporaryDirectory() as d: output_dir = Path(d) # Create a minimal output file @@ -105,10 +105,10 @@ def test_identify_flaws_returns_flaws(self): tracer = _make_tracer(output_dir) - # Mock Phase 1: identify flaws - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw( + # Mock Phase 1: identify problems + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem( description="Budget is unvalidated", evidence="CZK 500,000", severity="HIGH", @@ -116,15 +116,15 @@ def test_identify_flaws_returns_flaws(self): ] ) - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_analyze_source_code') as mock_analyze: result = tracer.trace("025-2-executive_summary.md", "budget is unvalidated") - self.assertIsInstance(result, FlawTraceResult) - self.assertGreaterEqual(len(result.flaws), 1) - flaw = result.flaws[0] - self.assertEqual(flaw.description, "Budget is unvalidated") - self.assertEqual(flaw.severity, "HIGH") + self.assertIsInstance(result, RCAResult) + self.assertGreaterEqual(len(result.problems), 1) + problem = 
result.problems[0] + self.assertEqual(problem.description, "Budget is unvalidated") + self.assertEqual(problem.severity, "HIGH") def test_file_not_found_raises(self): """The tracer should raise FileNotFoundError for missing starting files.""" @@ -137,7 +137,7 @@ def test_file_not_found_raises(self): class TestRootCauseAnalyzerUpstreamTrace(unittest.TestCase): """Test upstream tracing (Phase 2) with a simple two-level chain.""" - def test_traces_flaw_upstream(self): + def test_traces_problem_upstream(self): with TemporaryDirectory() as d: output_dir = Path(d) # Create files for a chain: executive_summary -> project_plan -> setup @@ -151,10 +151,10 @@ def test_traces_flaw_upstream(self): tracer = _make_tracer(output_dir) - # Mock Phase 1: identify flaws - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw( + # Mock Phase 1: identify problems + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem( description="Budget fabricated", evidence="CZK 500,000", severity="HIGH", @@ -166,10 +166,10 @@ def test_traces_flaw_upstream(self): upstream_call_count = 0 upstream_responses = {} - def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + def mock_check_upstream(problem_desc, evidence, upstream_filename, upstream_content): nonlocal upstream_call_count upstream_call_count += 1 - # project_plan has the flaw; others are clean + # project_plan has the problem; others are clean if "project_plan" in upstream_filename: return UpstreamCheckResult( found=True, @@ -183,22 +183,22 @@ def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content explanation="clean", ) - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ patch.object(tracer, '_analyze_source_code'): result = 
tracer.trace("025-2-executive_summary.md", "budget is fabricated") - self.assertEqual(len(result.flaws), 1) - flaw = result.flaws[0] + self.assertEqual(len(result.problems), 1) + problem = result.problems[0] # The trace should include at least executive_summary and project_plan - trace_nodes = [entry.node for entry in flaw.trace] + trace_nodes = [entry.node for entry in problem.trace] self.assertIn("executive_summary", trace_nodes) self.assertIn("project_plan", trace_nodes) - # Origin should be project_plan (flaw found there but not in its upstream 'setup') - self.assertEqual(flaw.origin_node, "project_plan") + # Origin should be project_plan (problem found there but not in its upstream 'setup') + self.assertEqual(problem.origin_node, "project_plan") def test_deduplication_works(self): - """Stages already checked for the same flaw should be skipped.""" + """Stages already checked for the same problem should be skipped.""" with TemporaryDirectory() as d: output_dir = Path(d) # executive_summary depends on strategic_decisions_markdown, scenarios_markdown, etc. 
@@ -213,21 +213,21 @@ def test_deduplication_works(self): tracer = _make_tracer(output_dir) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="Budget fabricated", evidence="500k", severity="HIGH") + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="Budget fabricated", evidence="500k", severity="HIGH") ] ) checked_stages = [] - def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + def mock_check_upstream(problem_desc, evidence, upstream_filename, upstream_content): checked_stages.append(upstream_filename) if "project_plan" in upstream_filename: return UpstreamCheckResult(found=True, evidence="500k", explanation="found here") return UpstreamCheckResult(found=False, evidence=None, explanation="clean") - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ patch.object(tracer, '_analyze_source_code'): result = tracer.trace("025-2-executive_summary.md", "budget fabricated") @@ -249,20 +249,20 @@ def test_respects_max_depth_zero(self): tracer = _make_tracer(output_dir, max_depth=0) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="test flaw", evidence="500k", severity="LOW") + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="test problem", evidence="500k", severity="LOW") ] ) - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_check_upstream') as mock_check, \ patch.object(tracer, '_analyze_source_code'): result = tracer.trace("025-2-executive_summary.md", "test") - self.assertEqual(len(result.flaws), 1) + 
self.assertEqual(len(result.problems), 1) # With max_depth=0, no upstream tracing happens - self.assertEqual(len(result.flaws[0].trace), 1) # only the starting file + self.assertEqual(len(result.problems[0].trace), 1) # only the starting file # _check_upstream should never have been called mock_check.assert_not_called() @@ -279,24 +279,24 @@ def test_max_depth_limits_recursion(self): tracer = _make_tracer(output_dir, max_depth=1) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="flaw", evidence="500k", severity="MEDIUM") + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="problem", evidence="500k", severity="MEDIUM") ] ) - def always_found(flaw_desc, evidence, upstream_filename, upstream_content): + def always_found(problem_desc, evidence, upstream_filename, upstream_content): return UpstreamCheckResult(found=True, evidence="500k", explanation="found") - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_check_upstream', side_effect=always_found), \ patch.object(tracer, '_analyze_source_code'): result = tracer.trace("025-2-executive_summary.md", "test") - self.assertEqual(len(result.flaws), 1) - flaw = result.flaws[0] + self.assertEqual(len(result.problems), 1) + problem = result.problems[0] # trace_complete should be False because max depth was hit - self.assertFalse(flaw.trace_complete) + self.assertFalse(problem.trace_complete) class TestRootCauseAnalyzerSourceCodeAnalysis(unittest.TestCase): @@ -309,20 +309,20 @@ def test_source_code_analysis_called_at_origin(self): tracer = _make_tracer(output_dir) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="flaw", evidence="500k", severity="HIGH") + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="problem", 
evidence="500k", severity="HIGH") ] ) - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_analyze_source_code') as mock_analyze: result = tracer.trace("025-2-executive_summary.md", "test") # _analyze_source_code should have been called once for the origin mock_analyze.assert_called_once() args = mock_analyze.call_args - # First positional arg is the TracedFlaw, second is the node name + # First positional arg is the TracedProblem, second is the node name self.assertEqual(args[0][1], "executive_summary") def test_source_code_analysis_called_at_deep_origin(self): @@ -339,21 +339,21 @@ def test_source_code_analysis_called_at_deep_origin(self): tracer = _make_tracer(output_dir) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="Budget fabricated", evidence="500k", severity="HIGH") + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="Budget fabricated", evidence="500k", severity="HIGH") ] ) - def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): - # project_plan has the flaw; others are clean + def mock_check_upstream(problem_desc, evidence, upstream_filename, upstream_content): + # project_plan has the problem; others are clean if "project_plan" in upstream_filename: return UpstreamCheckResult( found=True, evidence="Budget: 500k", explanation="Budget originates here" ) return UpstreamCheckResult(found=False, evidence=None, explanation="clean") - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ patch.object(tracer, '_analyze_source_code') as mock_analyze: result = tracer.trace("025-2-executive_summary.md", "budget fabricated") 
@@ -365,40 +365,40 @@ def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content self.assertEqual(args[0][1], "project_plan") -class TestRootCauseAnalyzerMultipleFlaws(unittest.TestCase): - """Test that multiple flaws are traced independently.""" +class TestRootCauseAnalyzerMultipleProblems(unittest.TestCase): + """Test that multiple problems are traced independently.""" - def test_traces_multiple_flaws(self): + def test_traces_multiple_problems(self): with TemporaryDirectory() as d: output_dir = Path(d) (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k\nTimeline: 2 months", encoding="utf-8") tracer = _make_tracer(output_dir) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="Budget fabricated", evidence="500k", severity="HIGH"), - IdentifiedFlaw(description="Timeline unrealistic", evidence="2 months", severity="MEDIUM"), + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="Budget fabricated", evidence="500k", severity="HIGH"), + IdentifiedProblem(description="Timeline unrealistic", evidence="2 months", severity="MEDIUM"), ] ) - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_analyze_source_code'): result = tracer.trace("025-2-executive_summary.md", "multiple issues") - self.assertEqual(len(result.flaws), 2) - descriptions = {f.description for f in result.flaws} + self.assertEqual(len(result.problems), 2) + descriptions = {f.description for f in result.problems} self.assertIn("Budget fabricated", descriptions) self.assertIn("Timeline unrealistic", descriptions) - # Each flaw should have a unique ID - ids = [f.id for f in result.flaws] + # Each problem should have a unique ID + ids = [f.id for f in result.problems] self.assertEqual(len(ids), len(set(ids))) class 
TestRootCauseAnalyzerSortsByDepth(unittest.TestCase): """Test that results are sorted by depth (deepest origin first).""" - def test_flaws_sorted_by_depth_descending(self): + def test_problems_sorted_by_depth_descending(self): with TemporaryDirectory() as d: output_dir = Path(d) (output_dir / "025-2-executive_summary.md").write_text("content", encoding="utf-8") @@ -409,31 +409,31 @@ def test_flaws_sorted_by_depth_descending(self): tracer = _make_tracer(output_dir) - mock_identification = FlawIdentificationResult( - flaws=[ - IdentifiedFlaw(description="shallow flaw", evidence="ev1", severity="LOW"), - IdentifiedFlaw(description="deep flaw", evidence="ev2", severity="HIGH"), + mock_identification = ProblemIdentificationResult( + problems=[ + IdentifiedProblem(description="shallow problem", evidence="ev1", severity="LOW"), + IdentifiedProblem(description="deep problem", evidence="ev2", severity="HIGH"), ] ) call_count = 0 - def mock_check_upstream(flaw_desc, evidence, upstream_filename, upstream_content): + def mock_check_upstream(problem_desc, evidence, upstream_filename, upstream_content): nonlocal call_count call_count += 1 - # For "deep flaw", find it in project_plan - if "deep flaw" in flaw_desc and "project_plan" in upstream_filename: + # For "deep problem", find it in project_plan + if "deep problem" in problem_desc and "project_plan" in upstream_filename: return UpstreamCheckResult(found=True, evidence="ev2", explanation="found") return UpstreamCheckResult(found=False, evidence=None, explanation="clean") - with patch.object(tracer, '_identify_flaws', return_value=mock_identification), \ + with patch.object(tracer, '_identify_problems', return_value=mock_identification), \ patch.object(tracer, '_check_upstream', side_effect=mock_check_upstream), \ patch.object(tracer, '_analyze_source_code'): result = tracer.trace("025-2-executive_summary.md", "test") - self.assertEqual(len(result.flaws), 2) + self.assertEqual(len(result.problems), 2) # Deepest origin should 
be first - self.assertGreaterEqual(result.flaws[0].depth, result.flaws[1].depth) + self.assertGreaterEqual(result.problems[0].depth, result.problems[1].depth) if __name__ == "__main__": diff --git a/worker_plan/worker_plan_internal/rca/tracer.py b/worker_plan/worker_plan_internal/rca/tracer.py index 28c6aaacb..bc17bf1fa 100644 --- a/worker_plan/worker_plan_internal/rca/tracer.py +++ b/worker_plan/worker_plan_internal/rca/tracer.py @@ -1,5 +1,5 @@ # worker_plan/worker_plan_internal/rca/tracer.py -"""Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" +"""Recursive depth-first root cause analyzer for PlanExe pipeline outputs.""" from __future__ import annotations import json @@ -17,10 +17,10 @@ get_source_code_paths, ) from worker_plan_internal.rca.prompts import ( - FlawIdentificationResult, + ProblemIdentificationResult, UpstreamCheckResult, SourceCodeAnalysisResult, - build_flaw_identification_messages, + build_problem_identification_messages, build_upstream_check_messages, build_source_code_analysis_messages, ) @@ -31,7 +31,7 @@ @dataclass class TraceEntry: - """One hop in a flaw's upstream trace.""" + """One hop in a problem's upstream trace.""" node: str file: str evidence: str @@ -40,7 +40,7 @@ class TraceEntry: @dataclass class OriginInfo: - """Source code analysis at a flaw's origin node.""" + """Source code analysis at a problem's origin node.""" node: str file: str source_code_files: list[str] @@ -50,8 +50,8 @@ class OriginInfo: @dataclass -class TracedFlaw: - """A fully traced flaw with its upstream chain.""" +class TracedProblem: + """A fully traced problem with its upstream chain.""" id: str description: str severity: str @@ -64,12 +64,12 @@ class TracedFlaw: @dataclass -class FlawTraceResult: - """Complete result of a flaw trace run.""" +class RCAResult: + """Complete result of a root cause analysis run.""" starting_file: str - flaw_description: str + problem_description: str output_dir: str - flaws: list[TracedFlaw] + problems: 
list[TracedProblem] llm_calls_made: int = 0 @@ -99,7 +99,7 @@ def log(self, event_type: str, **data: object) -> None: class RootCauseAnalyzer: - """Traces flaws upstream through the PlanExe pipeline DAG.""" + """Traces problems upstream through the PlanExe pipeline DAG.""" def __init__( self, @@ -114,11 +114,11 @@ def __init__( self.max_depth = max_depth self.verbose = verbose self._llm_calls = 0 - self._checked: set[tuple[str, str]] = set() # (node_name, flaw_description) dedup + self._checked: set[tuple[str, str]] = set() # (node_name, problem_description) dedup self._events = EventLogger(events_path) - def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: - """Main entry point. Identify flaws and trace each upstream.""" + def trace(self, starting_file: str, problem_description: str) -> RCAResult: + """Main entry point. Identify problems and trace each upstream.""" self._llm_calls = 0 self._checked.clear() @@ -130,39 +130,39 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: found_node = find_node_by_filename(starting_file) node_name = found_node.name if found_node else "unknown" - # Phase 1: Identify flaws - self._log(f"Phase 1: Identifying flaws in {starting_file}") + # Phase 1: Identify problems + self._log(f"Phase 1: Identifying problems in {starting_file}") self._events.log("phase1_start", file=starting_file, node=node_name) - identified = self._identify_flaws(starting_file, file_content, flaw_description) - self._log(f" Found {len(identified.flaws)} flaw(s)") - self._events.log("phase1_done", flaws_found=len(identified.flaws), - summaries=[f.description for f in identified.flaws]) - - traced_flaws: list[TracedFlaw] = [] - for i, flaw in enumerate(identified.flaws): - flaw_id = f"flaw_{i + 1:03d}" - self._log(f"\nTracing {flaw_id}: {flaw.description}") - self._events.log("trace_flaw_start", flaw_id=flaw_id, - flaw_index=i + 1, flaw_total=len(identified.flaws), - description=flaw.description, 
severity=flaw.severity) + identified = self._identify_problems(starting_file, file_content, problem_description) + self._log(f" Found {len(identified.problems)} problem(s)") + self._events.log("phase1_done", problems_found=len(identified.problems), + summaries=[p.description for p in identified.problems]) + + traced_problems: list[TracedProblem] = [] + for i, problem in enumerate(identified.problems): + problem_id = f"problem_{i + 1:03d}" + self._log(f"\nTracing {problem_id}: {problem.description}") + self._events.log("trace_problem_start", problem_id=problem_id, + problem_index=i + 1, problem_total=len(identified.problems), + description=problem.description, severity=problem.severity) starting_entry = TraceEntry( node=node_name, file=starting_file, - evidence=flaw.evidence, + evidence=problem.evidence, is_origin=False, ) - traced = TracedFlaw( - id=flaw_id, - description=flaw.description, - severity=flaw.severity, - starting_evidence=flaw.evidence, + traced = TracedProblem( + id=problem_id, + description=problem.description, + severity=problem.severity, + starting_evidence=problem.evidence, trace=[starting_entry], ) if found_node and self.max_depth > 0: - self._trace_upstream(traced, node_name, flaw.description, flaw.evidence, depth=0) + self._trace_upstream(traced, node_name, problem.description, problem.evidence, depth=0) # Mark the last trace entry as origin if no deeper origin was found if traced.origin_node is None and traced.trace: @@ -173,45 +173,45 @@ def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: # Phase 3: Source code analysis at origin (always, when origin is known) if traced.origin_node is not None: - self._events.log("phase3_start", flaw_id=flaw_id, origin_node=traced.origin_node) + self._events.log("phase3_start", problem_id=problem_id, origin_node=traced.origin_node) self._analyze_source_code( - traced, traced.origin_node, flaw.description, - next((e.evidence for e in traced.trace if e.node == traced.origin_node), 
flaw.evidence) + traced, traced.origin_node, problem.description, + next((e.evidence for e in traced.trace if e.node == traced.origin_node), problem.evidence) ) - self._events.log("trace_flaw_done", flaw_id=flaw_id, + self._events.log("trace_problem_done", problem_id=problem_id, origin_node=traced.origin_node, depth=traced.depth) - traced_flaws.append(traced) + traced_problems.append(traced) # Sort by depth (deepest origin first) - traced_flaws.sort(key=lambda f: f.depth, reverse=True) + traced_problems.sort(key=lambda f: f.depth, reverse=True) - self._events.log("trace_complete", total_flaws=len(traced_flaws), + self._events.log("trace_complete", total_problems=len(traced_problems), llm_calls=self._llm_calls) - return FlawTraceResult( + return RCAResult( starting_file=starting_file, - flaw_description=flaw_description, + problem_description=problem_description, output_dir=str(self.output_dir), - flaws=traced_flaws, + problems=traced_problems, llm_calls_made=self._llm_calls, ) - def _identify_flaws(self, filename: str, file_content: str, user_description: str) -> FlawIdentificationResult: - """Phase 1: Ask LLM to identify discrete flaws in the starting file.""" - messages = build_flaw_identification_messages(filename, file_content, user_description) + def _identify_problems(self, filename: str, file_content: str, user_description: str) -> ProblemIdentificationResult: + """Phase 1: Ask LLM to identify discrete problems in the starting file.""" + messages = build_problem_identification_messages(filename, file_content, user_description) - def execute(llm: LLM) -> FlawIdentificationResult: - sllm = llm.as_structured_llm(FlawIdentificationResult) + def execute(llm: LLM) -> ProblemIdentificationResult: + sllm = llm.as_structured_llm(ProblemIdentificationResult) response = sllm.chat(messages) return response.raw self._llm_calls += 1 return self.llm_executor.run(execute) - def _check_upstream(self, flaw_description: str, evidence: str, upstream_filename: str, 
upstream_content: str) -> UpstreamCheckResult: - """Phase 2: Ask LLM if a flaw exists in an upstream file.""" - messages = build_upstream_check_messages(flaw_description, evidence, upstream_filename, upstream_content) + def _check_upstream(self, problem_description: str, evidence: str, upstream_filename: str, upstream_content: str) -> UpstreamCheckResult: + """Phase 2: Ask LLM if a problem exists in an upstream file.""" + messages = build_upstream_check_messages(problem_description, evidence, upstream_filename, upstream_content) def execute(llm: LLM) -> UpstreamCheckResult: sllm = llm.as_structured_llm(UpstreamCheckResult) @@ -223,13 +223,13 @@ def execute(llm: LLM) -> UpstreamCheckResult: def _trace_upstream( self, - traced: TracedFlaw, + traced: TracedProblem, current_node: str, - flaw_description: str, + problem_description: str, evidence: str, depth: int, ) -> None: - """Recursively trace a flaw through upstream nodes.""" + """Recursively trace a problem through upstream nodes.""" if depth >= self.max_depth: traced.trace_complete = False self._log(f" Max depth {self.max_depth} reached at {current_node}") @@ -241,12 +241,12 @@ def _trace_upstream( found_upstream = False for upstream_name, upstream_path in upstream_files: - # Dedup key uses flaw_description so different flaws get independent + # Dedup key uses problem_description so different problems get independent # upstream checks. If the LLM returns duplicate descriptions, they # share check results. 
- dedup_key = (upstream_name, flaw_description) + dedup_key = (upstream_name, problem_description) if dedup_key in self._checked: - self._log(f" Skipping {upstream_name} (already checked for this flaw)") + self._log(f" Skipping {upstream_name} (already checked for this problem)") continue self._checked.add(dedup_key) @@ -255,7 +255,7 @@ def _trace_upstream( self._events.log("upstream_check", node=upstream_name, file=upstream_path.name, depth=depth) - result = self._check_upstream(flaw_description, evidence, upstream_path.name, upstream_content) + result = self._check_upstream(problem_description, evidence, upstream_path.name, upstream_content) if result.found: self._log(f" -> FOUND in {upstream_name}") @@ -272,7 +272,7 @@ def _trace_upstream( # Recurse deeper self._trace_upstream( - traced, upstream_name, flaw_description, + traced, upstream_name, problem_description, result.evidence or evidence, depth + 1, ) # First-match-wins: once an origin is found in one upstream @@ -281,7 +281,7 @@ def _trace_upstream( return if not found_upstream: - # Current node is the origin — flaw exists here but not in any upstream + # Current node is the origin — problem exists here but not in any upstream traced.origin_node = current_node traced.depth = len(traced.trace) self._events.log("origin_found", node=current_node, depth=traced.depth) @@ -290,7 +290,7 @@ def _trace_upstream( if entry.node == current_node: entry.is_origin = True - def _analyze_source_code(self, traced: TracedFlaw, node_name: str, flaw_description: str, evidence: str) -> None: + def _analyze_source_code(self, traced: TracedProblem, node_name: str, problem_description: str, evidence: str) -> None: """Phase 3: Analyze source code at the origin node.""" source_paths = get_source_code_paths(node_name) if not source_paths: @@ -307,7 +307,7 @@ def _analyze_source_code(self, traced: TracedFlaw, node_name: str, flaw_descript return self._log(f" Phase 3: Analyzing source code for {node_name}") - messages = 
build_source_code_analysis_messages(flaw_description, evidence, source_contents) + messages = build_source_code_analysis_messages(problem_description, evidence, source_contents) def execute(llm: LLM) -> SourceCodeAnalysisResult: sllm = llm.as_structured_llm(SourceCodeAnalysisResult) From 313ac04befe106f43e57d3ce80cfc5edaf016e2b Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 16:01:37 +0200 Subject: [PATCH 36/37] docs: update RCA spec and plan to current terminology Remove 700-line inline DAG registry (now auto-generated), replace all "flaw" terminology with "problem" in code samples, JSON keys, CLI args, and prose. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/superpowers/plans/2026-04-05-rca.md | 1475 +++++------------ .../specs/2026-04-05-rca-design.md | 130 +- .../worker_plan_internal/rca/README.md | 2 +- 3 files changed, 446 insertions(+), 1161 deletions(-) diff --git a/docs/superpowers/plans/2026-04-05-rca.md b/docs/superpowers/plans/2026-04-05-rca.md index fed9dae2b..8b9770497 100644 --- a/docs/superpowers/plans/2026-04-05-rca.md +++ b/docs/superpowers/plans/2026-04-05-rca.md @@ -1,7 +1,7 @@ # Root Cause Analysis (RCA) Implementation Plan > **Historical note:** This plan was written under the name "flaw tracer". The module -> has been renamed to `rca` (root cause analysis) — all paths referencing `flaw_tracer` +> has been renamed to `rca` (root cause analysis) — all paths referencing `rca` > in this document now correspond to `rca`. > The static DAG registry described here has since been replaced by `extract_dag.py` > which introspects the Luigi task graph at import time. 
@@ -19,11 +19,11 @@ ## File Structure ``` -worker_plan/worker_plan_internal/flaw_tracer/ - __init__.py — Package init, exports FlawTracer - registry.py — Static DAG mapping: 48 stages with deps, files, source code +worker_plan/worker_plan_internal/rca/ + __init__.py — Package init, exports RootCauseAnalyzer + registry.py — DAG mapping (auto-generated from Luigi task introspection) prompts.py — 3 Pydantic models + 3 prompt builders - tracer.py — Recursive tracing algorithm (FlawTracer class) + tracer.py — Recursive tracing algorithm (RootCauseAnalyzer class) output.py — JSON + markdown report generation __main__.py — CLI entry point (argparse) tests/ @@ -39,75 +39,75 @@ worker_plan/worker_plan_internal/flaw_tracer/ ### Task 1: Registry — DAG Mapping **Files:** -- Create: `worker_plan/worker_plan_internal/flaw_tracer/__init__.py` -- Create: `worker_plan/worker_plan_internal/flaw_tracer/registry.py` -- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py` -- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py` +- Create: `worker_plan/worker_plan_internal/rca/__init__.py` +- Create: `worker_plan/worker_plan_internal/rca/registry.py` +- Create: `worker_plan/worker_plan_internal/rca/tests/__init__.py` +- Create: `worker_plan/worker_plan_internal/rca/tests/test_registry.py` - [ ] **Step 1: Write the failing tests** ```python -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py +# worker_plan/worker_plan_internal/rca/tests/test_registry.py import unittest from pathlib import Path from tempfile import TemporaryDirectory -from worker_plan_internal.flaw_tracer.registry import ( - StageInfo, - STAGES, - find_stage_by_filename, +from worker_plan_internal.rca.registry import ( + NodeInfo, + NODES, + find_node_by_filename, get_upstream_files, get_source_code_paths, ) -class TestStageInfo(unittest.TestCase): +class TestNodeInfo(unittest.TestCase): def test_stages_is_nonempty(self): - self.assertGreater(len(STAGES), 40) 
+ self.assertGreater(len(NODES), 40) def test_all_stages_have_required_fields(self): - for stage in STAGES: - self.assertIsInstance(stage.name, str, f"{stage.name} name") - self.assertIsInstance(stage.output_files, list, f"{stage.name} output_files") - self.assertTrue(len(stage.output_files) > 0, f"{stage.name} has no output_files") - self.assertIsInstance(stage.upstream_stages, list, f"{stage.name} upstream_stages") - self.assertIsInstance(stage.source_code_files, list, f"{stage.name} source_code_files") - self.assertIsInstance(stage.primary_output, str, f"{stage.name} primary_output") - self.assertIn(stage.primary_output, stage.output_files, f"{stage.name} primary_output not in output_files") - - def test_no_duplicate_stage_names(self): - names = [s.name for s in STAGES] + for node in NODES: + self.assertIsInstance(node.name, str, f"{node.name} name") + self.assertIsInstance(node.output_files, list, f"{node.name} output_files") + self.assertTrue(len(node.output_files) > 0, f"{node.name} has no output_files") + self.assertIsInstance(node.inputs, list, f"{node.name} inputs") + self.assertIsInstance(node.source_code_files, list, f"{node.name} source_code_files") + self.assertIsInstance(node.primary_output, str, f"{node.name} primary_output") + self.assertIn(node.primary_output, node.output_files, f"{node.name} primary_output not in output_files") + + def test_no_duplicate_node_names(self): + names = [s.name for s in NODES] self.assertEqual(len(names), len(set(names))) def test_upstream_references_are_valid(self): - valid_names = {s.name for s in STAGES} - for stage in STAGES: - for upstream in stage.upstream_stages: - self.assertIn(upstream, valid_names, f"{stage.name} references unknown upstream '{upstream}'") + valid_names = {s.name for s in NODES} + for node in NODES: + for upstream in node.inputs: + self.assertIn(upstream, valid_names, f"{node.name} references unknown upstream '{upstream}'") class TestFindStageByFilename(unittest.TestCase): def 
test_find_report(self): - stage = find_stage_by_filename("030-report.html") + stage = find_node_by_filename("030-report.html") self.assertIsNotNone(stage) - self.assertEqual(stage.name, "report") + self.assertEqual(stage.name, "report") def test_find_potential_levers_clean(self): - stage = find_stage_by_filename("002-10-potential_levers.json") + stage = find_node_by_filename("002-10-potential_levers.json") self.assertIsNotNone(stage) - self.assertEqual(stage.name, "potential_levers") + self.assertEqual(stage.name, "potential_levers") def test_find_potential_levers_raw(self): - stage = find_stage_by_filename("002-9-potential_levers_raw.json") + stage = find_node_by_filename("002-9-potential_levers_raw.json") self.assertIsNotNone(stage) - self.assertEqual(stage.name, "potential_levers") + self.assertEqual(stage.name, "potential_levers") def test_find_executive_summary(self): - stage = find_stage_by_filename("025-2-executive_summary.md") + stage = find_node_by_filename("025-2-executive_summary.md") self.assertIsNotNone(stage) - self.assertEqual(stage.name, "executive_summary") + self.assertEqual(stage.name, "executive_summary") def test_unknown_filename_returns_none(self): - stage = find_stage_by_filename("zzz-unknown.txt") + stage = find_node_by_filename("zzz-unknown.txt") self.assertIsNone(stage) @@ -127,11 +127,11 @@ class TestGetUpstreamFiles(unittest.TestCase): (output_dir / "002-0-extract_constraints.md").write_text("constraints", encoding="utf-8") result = get_upstream_files("potential_levers", output_dir) - stage_names = [name for name, _ in result] - self.assertIn("setup", stage_names) - self.assertIn("identify_purpose", stage_names) - self.assertIn("plan_type", stage_names) - self.assertIn("extract_constraints", stage_names) + node_names = [name for name, _ in result] + self.assertIn("setup", node_names) + self.assertIn("identify_purpose", node_names) + self.assertIn("plan_type", node_names) + self.assertIn("extract_constraints", node_names) def 
test_missing_files_are_skipped(self): with TemporaryDirectory() as d: @@ -140,10 +140,10 @@ class TestGetUpstreamFiles(unittest.TestCase): (output_dir / "001-2-plan.txt").write_text("plan", encoding="utf-8") result = get_upstream_files("potential_levers", output_dir) - stage_names = [name for name, _ in result] - self.assertIn("setup", stage_names) + node_names = [name for name, _ in result] + self.assertIn("setup", node_names) # The others should be skipped because their files don't exist - self.assertNotIn("identify_purpose", stage_names) + self.assertNotIn("identify_purpose", node_names) class TestGetSourceCodePaths(unittest.TestCase): @@ -161,23 +161,23 @@ class TestGetSourceCodePaths(unittest.TestCase): - [ ] **Step 2: Create package init files** ```python -# worker_plan/worker_plan_internal/flaw_tracer/__init__.py -"""Flaw Tracer — Root-cause analysis for PlanExe reports.""" +# worker_plan/worker_plan_internal/rca/__init__.py +"""RCA — Root-cause analysis for PlanExe reports.""" ``` ```python -# worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py +# worker_plan/worker_plan_internal/rca/tests/__init__.py ``` - [ ] **Step 3: Run tests to verify they fail** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_registry.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_registry.py -v` Expected: FAIL with `ModuleNotFoundError` or `ImportError` - [ ] **Step 4: Implement registry.py** ```python -# worker_plan/worker_plan_internal/flaw_tracer/registry.py +# worker_plan/worker_plan_internal/rca/registry.py """Static DAG mapping for the PlanExe pipeline. 
Maps every pipeline stage to its output files, upstream dependencies, @@ -193,783 +193,68 @@ _SOURCE_BASE = Path(__file__).resolve().parent.parent.parent # worker_plan/ @dataclass(frozen=True) -class StageInfo: +class NodeInfo: """One pipeline stage.""" name: str output_files: list[str] - primary_output: str # preferred file to read when checking for flaws - upstream_stages: list[str] = field(default_factory=list) + primary_output: str # preferred file to read when checking for problems + inputs: list[str] = field(default_factory=list) source_code_files: list[str] = field(default_factory=list) -# ── Complete pipeline registry ────────────────────────────────────────── - -STAGES: list[StageInfo] = [ - # Phase 1: Initialization - StageInfo( - name="start_time", - output_files=["001-1-start_time.json"], - primary_output="001-1-start_time.json", - upstream_stages=[], - source_code_files=["worker_plan_internal/plan/stages/start_time.py"], - ), - StageInfo( - name="setup", - output_files=["001-2-plan.txt"], - primary_output="001-2-plan.txt", - upstream_stages=[], - source_code_files=["worker_plan_internal/plan/stages/setup.py"], - ), - # Phase 2: Input Validation & Strategy - StageInfo( - name="screen_planning_prompt", - output_files=["002-0-screen_planning_prompt.json", "002-0-screen_planning_prompt.md"], - primary_output="002-0-screen_planning_prompt.md", - upstream_stages=["setup"], - source_code_files=[ - "worker_plan_internal/plan/stages/screen_planning_prompt.py", - "worker_plan_internal/diagnostics/screen_planning_prompt.py", - ], - ), - StageInfo( - name="extract_constraints", - output_files=["002-0-extract_constraints_raw.json", "002-0-extract_constraints.md"], - primary_output="002-0-extract_constraints.md", - upstream_stages=["setup"], - source_code_files=[ - "worker_plan_internal/plan/stages/extract_constraints.py", - "worker_plan_internal/diagnostics/extract_constraints.py", - ], - ), - StageInfo( - name="redline_gate", - 
output_files=["002-1-redline_gate.json", "002-2-redline_gate.md"], - primary_output="002-2-redline_gate.md", - upstream_stages=["setup"], - source_code_files=[ - "worker_plan_internal/plan/stages/redline_gate.py", - "worker_plan_internal/diagnostics/redline_gate.py", - ], - ), - StageInfo( - name="premise_attack", - output_files=["002-3-premise_attack.json", "002-4-premise_attack.md"], - primary_output="002-4-premise_attack.md", - upstream_stages=["setup"], - source_code_files=[ - "worker_plan_internal/plan/stages/premise_attack.py", - "worker_plan_internal/diagnostics/premise_attack.py", - ], - ), - StageInfo( - name="identify_purpose", - output_files=["002-5-identify_purpose_raw.json", "002-6-identify_purpose.md"], - primary_output="002-6-identify_purpose.md", - upstream_stages=["setup"], - source_code_files=[ - "worker_plan_internal/plan/stages/identify_purpose.py", - "worker_plan_internal/assume/identify_purpose.py", - ], - ), - StageInfo( - name="plan_type", - output_files=["002-7-plan_type_raw.json", "002-8-plan_type.md"], - primary_output="002-8-plan_type.md", - upstream_stages=["setup", "identify_purpose"], - source_code_files=[ - "worker_plan_internal/plan/stages/plan_type.py", - "worker_plan_internal/assume/identify_plan_type.py", - ], - ), - StageInfo( - name="potential_levers", - output_files=["002-9-potential_levers_raw.json", "002-10-potential_levers.json"], - primary_output="002-10-potential_levers.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "extract_constraints"], - source_code_files=[ - "worker_plan_internal/plan/stages/potential_levers.py", - "worker_plan_internal/lever/identify_potential_levers.py", - ], - ), - StageInfo( - name="deduplicate_levers", - output_files=["002-11-deduplicated_levers_raw.json"], - primary_output="002-11-deduplicated_levers_raw.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "potential_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/deduplicate_levers.py", - 
"worker_plan_internal/lever/deduplicate_levers.py", - ], - ), - StageInfo( - name="enrich_levers", - output_files=["002-12-enriched_levers_raw.json"], - primary_output="002-12-enriched_levers_raw.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "deduplicate_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/enrich_levers.py", - "worker_plan_internal/lever/enrich_potential_levers.py", - ], - ), - StageInfo( - name="focus_on_vital_few_levers", - output_files=["002-13-vital_few_levers_raw.json"], - primary_output="002-13-vital_few_levers_raw.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "enrich_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/focus_on_vital_few_levers.py", - "worker_plan_internal/lever/focus_on_vital_few_levers.py", - ], - ), - StageInfo( - name="strategic_decisions_markdown", - output_files=["002-14-strategic_decisions.md"], - primary_output="002-14-strategic_decisions.md", - upstream_stages=["enrich_levers", "focus_on_vital_few_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/strategic_decisions_markdown.py", - "worker_plan_internal/lever/strategic_decisions_markdown.py", - ], - ), - StageInfo( - name="candidate_scenarios", - output_files=["002-15-candidate_scenarios_raw.json", "002-16-candidate_scenarios.json"], - primary_output="002-16-candidate_scenarios.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/candidate_scenarios.py", - "worker_plan_internal/lever/candidate_scenarios.py", - ], - ), - StageInfo( - name="select_scenario", - output_files=["002-17-selected_scenario_raw.json", "002-18-selected_scenario.json"], - primary_output="002-18-selected_scenario.json", - upstream_stages=["setup", "identify_purpose", "plan_type", "focus_on_vital_few_levers", "candidate_scenarios"], - source_code_files=[ - 
"worker_plan_internal/plan/stages/select_scenario.py", - "worker_plan_internal/lever/select_scenario.py", - ], - ), - StageInfo( - name="scenarios_markdown", - output_files=["002-19-scenarios.md"], - primary_output="002-19-scenarios.md", - upstream_stages=["candidate_scenarios", "select_scenario"], - source_code_files=[ - "worker_plan_internal/plan/stages/scenarios_markdown.py", - "worker_plan_internal/lever/scenarios_markdown.py", - ], - ), - # Constraint checkers - StageInfo( - name="potential_levers_constraint", - output_files=["002-10-potential_levers_constraint.json"], - primary_output="002-10-potential_levers_constraint.json", - upstream_stages=["extract_constraints", "potential_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ], - ), - StageInfo( - name="deduplicated_levers_constraint", - output_files=["002-11-deduplicated_levers_constraint.json"], - primary_output="002-11-deduplicated_levers_constraint.json", - upstream_stages=["extract_constraints", "deduplicate_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ], - ), - StageInfo( - name="enriched_levers_constraint", - output_files=["002-12-enriched_levers_constraint.json"], - primary_output="002-12-enriched_levers_constraint.json", - upstream_stages=["extract_constraints", "enrich_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ], - ), - StageInfo( - name="vital_few_levers_constraint", - output_files=["002-13-vital_few_levers_constraint.json"], - primary_output="002-13-vital_few_levers_constraint.json", - upstream_stages=["extract_constraints", "focus_on_vital_few_levers"], - source_code_files=[ - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - 
"worker_plan_internal/diagnostics/constraint_checker.py", - ], - ), - StageInfo( - name="candidate_scenarios_constraint", - output_files=["002-16-candidate_scenarios_constraint.json"], - primary_output="002-16-candidate_scenarios_constraint.json", - upstream_stages=["extract_constraints", "candidate_scenarios"], - source_code_files=[ - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ], - ), - StageInfo( - name="selected_scenario_constraint", - output_files=["002-18-selected_scenario_constraint.json"], - primary_output="002-18-selected_scenario_constraint.json", - upstream_stages=["extract_constraints", "select_scenario"], - source_code_files=[ - "worker_plan_internal/plan/stages/constraint_checker_stages.py", - "worker_plan_internal/diagnostics/constraint_checker.py", - ], - ), - # Phase 3: Context & Assumptions - StageInfo( - name="physical_locations", - output_files=["002-20-physical_locations_raw.json", "002-21-physical_locations.md"], - primary_output="002-21-physical_locations.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown"], - source_code_files=[ - "worker_plan_internal/plan/stages/physical_locations.py", - "worker_plan_internal/assume/physical_locations.py", - ], - ), - StageInfo( - name="currency_strategy", - output_files=["002-22-currency_strategy_raw.json", "002-23-currency_strategy.md"], - primary_output="002-23-currency_strategy.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "physical_locations", "strategic_decisions_markdown", "scenarios_markdown"], - source_code_files=[ - "worker_plan_internal/plan/stages/currency_strategy.py", - "worker_plan_internal/assume/currency_strategy.py", - ], - ), - StageInfo( - name="identify_risks", - output_files=["003-1-identify_risks_raw.json", "003-2-identify_risks.md"], - primary_output="003-2-identify_risks.md", - upstream_stages=["setup", 
"identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy"], - source_code_files=[ - "worker_plan_internal/plan/stages/identify_risks.py", - "worker_plan_internal/assume/identify_risks.py", - ], - ), - StageInfo( - name="make_assumptions", - output_files=["003-3-make_assumptions_raw.json", "003-4-make_assumptions.json", "003-5-make_assumptions.md"], - primary_output="003-5-make_assumptions.md", - upstream_stages=["setup", "identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks"], - source_code_files=[ - "worker_plan_internal/plan/stages/make_assumptions.py", - "worker_plan_internal/assume/make_assumptions.py", - ], - ), - StageInfo( - name="distill_assumptions", - output_files=["003-6-distill_assumptions_raw.json", "003-7-distill_assumptions.md"], - primary_output="003-7-distill_assumptions.md", - upstream_stages=["setup", "identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "make_assumptions"], - source_code_files=[ - "worker_plan_internal/plan/stages/distill_assumptions.py", - "worker_plan_internal/assume/distill_assumptions.py", - ], - ), - StageInfo( - name="review_assumptions", - output_files=["003-8-review_assumptions_raw.json", "003-9-review_assumptions.md"], - primary_output="003-9-review_assumptions.md", - upstream_stages=["identify_purpose", "plan_type", "strategic_decisions_markdown", "scenarios_markdown", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions"], - source_code_files=[ - "worker_plan_internal/plan/stages/review_assumptions.py", - "worker_plan_internal/assume/review_assumptions.py", - ], - ), - StageInfo( - name="consolidate_assumptions_markdown", - output_files=["003-10-consolidate_assumptions_full.md", "003-11-consolidate_assumptions_short.md"], - primary_output="003-10-consolidate_assumptions_full.md", - 
upstream_stages=["identify_purpose", "plan_type", "physical_locations", "currency_strategy", "identify_risks", "make_assumptions", "distill_assumptions", "review_assumptions"], - source_code_files=[ - "worker_plan_internal/plan/stages/consolidate_assumptions_markdown.py", - "worker_plan_internal/assume/shorten_markdown.py", - ], - ), - # Phase 4: Pre-Project Assessment & Project Plan - StageInfo( - name="pre_project_assessment", - output_files=["004-1-pre_project_assessment_raw.json", "004-2-pre_project_assessment.json"], - primary_output="004-2-pre_project_assessment.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown"], - source_code_files=[ - "worker_plan_internal/plan/stages/pre_project_assessment.py", - "worker_plan_internal/expert/pre_project_assessment.py", - ], - ), - StageInfo( - name="project_plan", - output_files=["005-1-project_plan_raw.json", "005-2-project_plan.md"], - primary_output="005-2-project_plan.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment"], - source_code_files=[ - "worker_plan_internal/plan/stages/project_plan.py", - "worker_plan_internal/plan/project_plan.py", - ], - ), - # Phase 5: Governance - StageInfo( - name="governance_phase1_audit", - output_files=["006-1-governance_phase1_audit_raw.json", "006-2-governance_phase1_audit.md"], - primary_output="006-2-governance_phase1_audit.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], - source_code_files=[ - "worker_plan_internal/plan/stages/governance_phase1_audit.py", - "worker_plan_internal/governance/governance_phase1_audit.py", - ], - ), - StageInfo( - name="governance_phase2_bodies", - output_files=["006-3-governance_phase2_bodies_raw.json", "006-4-governance_phase2_bodies.md"], - primary_output="006-4-governance_phase2_bodies.md", 
- upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit"], - source_code_files=[ - "worker_plan_internal/plan/stages/governance_phase2_bodies.py", - "worker_plan_internal/governance/governance_phase2_bodies.py", - ], - ), - StageInfo( - name="governance_phase3_impl_plan", - output_files=["006-5-governance_phase3_impl_plan_raw.json", "006-6-governance_phase3_impl_plan.md"], - primary_output="006-6-governance_phase3_impl_plan.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies"], - source_code_files=[ - "worker_plan_internal/plan/stages/governance_phase3_impl_plan.py", - "worker_plan_internal/governance/governance_phase3_impl_plan.py", - ], - ), - StageInfo( - name="governance_phase4_decision_escalation_matrix", - output_files=["006-7-governance_phase4_decision_escalation_matrix_raw.json", "006-8-governance_phase4_decision_escalation_matrix.md"], - primary_output="006-8-governance_phase4_decision_escalation_matrix.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan"], - source_code_files=[ - "worker_plan_internal/plan/stages/governance_phase4_decision_escalation_matrix.py", - "worker_plan_internal/governance/governance_phase4_decision_escalation_matrix.py", - ], - ), - StageInfo( - name="governance_phase5_monitoring_progress", - output_files=["006-9-governance_phase5_monitoring_progress_raw.json", "006-10-governance_phase5_monitoring_progress.md"], - primary_output="006-10-governance_phase5_monitoring_progress.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase2_bodies", "governance_phase3_impl_plan", 
"governance_phase4_decision_escalation_matrix"], - source_code_files=[ - "worker_plan_internal/plan/stages/governance_phase5_monitoring_progress.py", - "worker_plan_internal/governance/governance_phase5_monitoring_progress.py", - ], - ), - StageInfo( - name="governance_phase6_extra", - output_files=["006-11-governance_phase6_extra_raw.json", "006-12-governance_phase6_extra.md"], - primary_output="006-12-governance_phase6_extra.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress"], - source_code_files=[ - "worker_plan_internal/plan/stages/governance_phase6_extra.py", - "worker_plan_internal/governance/governance_phase6_extra.py", - ], - ), - StageInfo( - name="consolidate_governance", - output_files=["006-13-consolidate_governance.md"], - primary_output="006-13-consolidate_governance.md", - upstream_stages=["governance_phase1_audit", "governance_phase2_bodies", "governance_phase3_impl_plan", "governance_phase4_decision_escalation_matrix", "governance_phase5_monitoring_progress", "governance_phase6_extra"], - source_code_files=["worker_plan_internal/plan/stages/consolidate_governance.py"], - ), - # Phase 6: Resources & Team - StageInfo( - name="related_resources", - output_files=["007-1-related_resources_raw.json", "007-8-related_resources.md"], - primary_output="007-8-related_resources.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan"], - source_code_files=[ - "worker_plan_internal/plan/stages/related_resources.py", - "worker_plan_internal/plan/related_resources.py", - ], - ), - StageInfo( - name="find_team_members", - output_files=["008-1-find_team_members_raw.json", "008-2-find_team_members.json"], - 
primary_output="008-2-find_team_members.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/find_team_members.py", - "worker_plan_internal/team/find_team_members.py", - ], - ), - StageInfo( - name="enrich_team_contract_type", - output_files=["009-1-enrich_team_members_contract_type_raw.json", "009-2-enrich_team_members_contract_type.json"], - primary_output="009-2-enrich_team_members_contract_type.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "find_team_members", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/enrich_team_contract_type.py", - "worker_plan_internal/team/enrich_team_members_with_contract_type.py", - ], - ), - StageInfo( - name="enrich_team_background_story", - output_files=["010-1-enrich_team_members_background_story_raw.json", "010-2-enrich_team_members_background_story.json"], - primary_output="010-2-enrich_team_members_background_story.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_contract_type", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/enrich_team_background_story.py", - "worker_plan_internal/team/enrich_team_members_with_background_story.py", - ], - ), - StageInfo( - name="enrich_team_environment_info", - output_files=["011-1-enrich_team_members_environment_info_raw.json", "011-2-enrich_team_members_environment_info.json"], - primary_output="011-2-enrich_team_members_environment_info.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", 
"project_plan", "enrich_team_background_story", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/enrich_team_environment_info.py", - "worker_plan_internal/team/enrich_team_members_with_environment_info.py", - ], - ), - StageInfo( - name="review_team", - output_files=["012-review_team_raw.json"], - primary_output="012-review_team_raw.json", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "enrich_team_environment_info", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/review_team.py", - "worker_plan_internal/team/review_team.py", - ], - ), - StageInfo( - name="team_markdown", - output_files=["013-team.md"], - primary_output="013-team.md", - upstream_stages=["enrich_team_environment_info", "review_team"], - source_code_files=[ - "worker_plan_internal/plan/stages/team_markdown.py", - "worker_plan_internal/team/team_markdown_document.py", - ], - ), - # Phase 7: Analysis & Experts - StageInfo( - name="swot_analysis", - output_files=["014-1-swot_analysis_raw.json", "014-2-swot_analysis.md"], - primary_output="014-2-swot_analysis.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "identify_purpose", "consolidate_assumptions_markdown", "pre_project_assessment", "project_plan", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/swot_analysis.py", - "worker_plan_internal/swot/swot_analysis.py", - ], - ), - StageInfo( - name="expert_review", - output_files=["015-1-experts_raw.json", "015-2-experts.json", "016-2-expert_criticism.md"], - primary_output="016-2-expert_criticism.md", - upstream_stages=["setup", "strategic_decisions_markdown", "scenarios_markdown", "pre_project_assessment", "project_plan", "swot_analysis"], - source_code_files=[ - "worker_plan_internal/plan/stages/expert_review.py", - "worker_plan_internal/expert/expert_finder.py", 
- "worker_plan_internal/expert/expert_criticism.py", - ], - ), - # Phase 8: Data & Documents - StageInfo( - name="data_collection", - output_files=["017-1-data_collection_raw.json", "017-2-data_collection.md"], - primary_output="017-2-data_collection.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], - source_code_files=[ - "worker_plan_internal/plan/stages/data_collection.py", - "worker_plan_internal/plan/data_collection.py", - ], - ), - StageInfo( - name="identify_documents", - output_files=["017-3-identified_documents_raw.json", "017-4-identified_documents.md", "017-5-identified_documents_to_find.json", "017-6-identified_documents_to_create.json"], - primary_output="017-4-identified_documents.md", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "related_resources", "swot_analysis", "team_markdown", "expert_review"], - source_code_files=[ - "worker_plan_internal/plan/stages/identify_documents.py", - "worker_plan_internal/document/identify_documents.py", - ], - ), - StageInfo( - name="filter_documents_to_find", - output_files=["017-7-filter_documents_to_find_raw.json", "017-8-filter_documents_to_find_clean.json"], - primary_output="017-8-filter_documents_to_find_clean.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], - source_code_files=[ - "worker_plan_internal/plan/stages/filter_documents_to_find.py", - "worker_plan_internal/document/filter_documents_to_find.py", - ], - ), - StageInfo( - name="filter_documents_to_create", - output_files=["017-9-filter_documents_to_create_raw.json", "017-10-filter_documents_to_create_clean.json"], - primary_output="017-10-filter_documents_to_create_clean.json", - 
upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "identify_documents"], - source_code_files=[ - "worker_plan_internal/plan/stages/filter_documents_to_create.py", - "worker_plan_internal/document/filter_documents_to_create.py", - ], - ), - StageInfo( - name="draft_documents_to_find", - output_files=["017-12-draft_documents_to_find.json"], - primary_output="017-12-draft_documents_to_find.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_find"], - source_code_files=[ - "worker_plan_internal/plan/stages/draft_documents_to_find.py", - "worker_plan_internal/document/draft_document_to_find.py", - ], - ), - StageInfo( - name="draft_documents_to_create", - output_files=["017-14-draft_documents_to_create.json"], - primary_output="017-14-draft_documents_to_create.json", - upstream_stages=["identify_purpose", "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "filter_documents_to_create"], - source_code_files=[ - "worker_plan_internal/plan/stages/draft_documents_to_create.py", - "worker_plan_internal/document/draft_document_to_create.py", - ], - ), - StageInfo( - name="markdown_documents", - output_files=["017-15-documents_to_create_and_find.md"], - primary_output="017-15-documents_to_create_and_find.md", - upstream_stages=["draft_documents_to_create", "draft_documents_to_find"], - source_code_files=[ - "worker_plan_internal/plan/stages/markdown_documents.py", - "worker_plan_internal/document/markdown_with_document.py", - ], - ), - # Phase 9: WBS - StageInfo( - name="create_wbs_level1", - output_files=["018-1-wbs_level1_raw.json", "018-2-wbs_level1.json", "018-3-wbs_level1_project_title.json"], - primary_output="018-2-wbs_level1.json", - upstream_stages=["project_plan"], - source_code_files=[ - 
"worker_plan_internal/plan/stages/create_wbs_level1.py", - "worker_plan_internal/plan/create_wbs_level1.py", - ], - ), - StageInfo( - name="create_wbs_level2", - output_files=["018-4-wbs_level2_raw.json", "018-5-wbs_level2.json"], - primary_output="018-5-wbs_level2.json", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level1", "data_collection"], - source_code_files=[ - "worker_plan_internal/plan/stages/create_wbs_level2.py", - "worker_plan_internal/plan/create_wbs_level2.py", - ], - ), - StageInfo( - name="wbs_project_level1_and_level2", - output_files=["019-wbs_project_level1_and_level2.json"], - primary_output="019-wbs_project_level1_and_level2.json", - upstream_stages=["create_wbs_level1", "create_wbs_level2"], - source_code_files=[ - "worker_plan_internal/plan/stages/wbs_project_level1_and_level2.py", - "worker_plan_internal/wbs/wbs_populate.py", - ], - ), - # Phase 10: Pitch & Dependencies - StageInfo( - name="create_pitch", - output_files=["020-1-pitch_raw.json"], - primary_output="020-1-pitch_raw.json", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "wbs_project_level1_and_level2", "related_resources"], - source_code_files=[ - "worker_plan_internal/plan/stages/create_pitch.py", - "worker_plan_internal/pitch/create_pitch.py", - ], - ), - StageInfo( - name="convert_pitch_to_markdown", - output_files=["020-2-pitch_to_markdown_raw.json", "020-3-pitch.md"], - primary_output="020-3-pitch.md", - upstream_stages=["create_pitch"], - source_code_files=[ - "worker_plan_internal/plan/stages/convert_pitch_to_markdown.py", - "worker_plan_internal/pitch/convert_pitch_to_markdown.py", - ], - ), - StageInfo( - name="identify_task_dependencies", - output_files=["021-task_dependencies_raw.json"], - primary_output="021-task_dependencies_raw.json", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "project_plan", "create_wbs_level2", "data_collection"], - 
source_code_files=[ - "worker_plan_internal/plan/stages/identify_task_dependencies.py", - "worker_plan_internal/plan/identify_wbs_task_dependencies.py", - ], - ), - StageInfo( - name="estimate_task_durations", - output_files=["022-2-task_durations.json"], - primary_output="022-2-task_durations.json", - upstream_stages=["project_plan", "wbs_project_level1_and_level2"], - source_code_files=[ - "worker_plan_internal/plan/stages/estimate_task_durations.py", - "worker_plan_internal/plan/estimate_wbs_task_durations.py", - ], - ), - # Phase 11: WBS Level 3 - StageInfo( - name="create_wbs_level3", - output_files=["023-2-wbs_level3.json"], - primary_output="023-2-wbs_level3.json", - upstream_stages=["project_plan", "wbs_project_level1_and_level2", "estimate_task_durations", "data_collection"], - source_code_files=[ - "worker_plan_internal/plan/stages/create_wbs_level3.py", - "worker_plan_internal/plan/create_wbs_level3.py", - ], - ), - StageInfo( - name="wbs_project_level1_level2_level3", - output_files=["023-3-wbs_project_level1_and_level2_and_level3.json", "023-4-wbs_project_level1_and_level2_and_level3.csv"], - primary_output="023-3-wbs_project_level1_and_level2_and_level3.json", - upstream_stages=["wbs_project_level1_and_level2", "create_wbs_level3"], - source_code_files=[ - "worker_plan_internal/plan/stages/wbs_project_level1_level2_level3.py", - "worker_plan_internal/wbs/wbs_populate.py", - ], - ), - # Phase 12: Schedule & Reviews - StageInfo( - name="create_schedule", - output_files=["026-2-schedule_gantt_dhtmlx.html", "026-3-schedule_gantt_machai.csv"], - primary_output="026-2-schedule_gantt_dhtmlx.html", - upstream_stages=["start_time", "create_wbs_level1", "identify_task_dependencies", "estimate_task_durations", "wbs_project_level1_level2_level3"], - source_code_files=[ - "worker_plan_internal/plan/stages/create_schedule.py", - "worker_plan_internal/schedule/project_schedule_populator.py", - ], - ), - StageInfo( - name="review_plan", - 
output_files=["024-1-review_plan_raw.json", "024-2-review_plan.md"], - primary_output="024-2-review_plan.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3"], - source_code_files=[ - "worker_plan_internal/plan/stages/review_plan.py", - "worker_plan_internal/plan/review_plan.py", - ], - ), - StageInfo( - name="executive_summary", - output_files=["025-1-executive_summary_raw.json", "025-2-executive_summary.md"], - primary_output="025-2-executive_summary.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "project_plan", "data_collection", "related_resources", "swot_analysis", "team_markdown", "convert_pitch_to_markdown", "expert_review", "wbs_project_level1_level2_level3", "review_plan"], - source_code_files=[ - "worker_plan_internal/plan/stages/executive_summary.py", - "worker_plan_internal/plan/executive_summary.py", - ], - ), - StageInfo( - name="questions_and_answers", - output_files=["027-1-questions_and_answers_raw.json", "027-2-questions_and_answers.md", "027-3-questions_and_answers.html"], - primary_output="027-2-questions_and_answers.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan"], - source_code_files=[ - "worker_plan_internal/plan/stages/questions_and_answers.py", - "worker_plan_internal/questions_answers/questions_answers.py", - ], - ), - StageInfo( - name="premortem", - output_files=["028-1-premortem_raw.json", "028-2-premortem.md"], - 
primary_output="028-2-premortem.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers"], - source_code_files=[ - "worker_plan_internal/plan/stages/premortem.py", - "worker_plan_internal/diagnostics/premortem.py", - ], - ), - StageInfo( - name="self_audit", - output_files=["029-1-self_audit_raw.json", "029-2-self_audit.md"], - primary_output="029-2-self_audit.md", - upstream_stages=["strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", "convert_pitch_to_markdown", "data_collection", "markdown_documents", "wbs_project_level1_level2_level3", "expert_review", "project_plan", "review_plan", "questions_and_answers", "premortem"], - source_code_files=[ - "worker_plan_internal/plan/stages/self_audit.py", - "worker_plan_internal/self_audit/self_audit.py", - ], - ), - # Phase 13: Final Report - StageInfo( - name="report", - output_files=["030-report.html"], - primary_output="030-report.html", - upstream_stages=[ - "setup", "screen_planning_prompt", "redline_gate", "premise_attack", - "strategic_decisions_markdown", "scenarios_markdown", "consolidate_assumptions_markdown", - "team_markdown", "related_resources", "consolidate_governance", "swot_analysis", - "convert_pitch_to_markdown", "data_collection", "markdown_documents", - "create_wbs_level1", "wbs_project_level1_level2_level3", "expert_review", - "project_plan", "review_plan", "executive_summary", "create_schedule", - "questions_and_answers", "premortem", "self_audit", - ], - source_code_files=[ - "worker_plan_internal/plan/stages/report.py", - "worker_plan_internal/report/report_generator.py", 
- ], - ), -] +# ── Pipeline registry (auto-generated at import time) ───────────────── +# The registry is built by extract_dag.py which introspects the Luigi +# task graph. See extract_dag.py for the full ~70-node DAG. +# Example entry: +NODES: tuple[NodeInfo, ...] = _build_registry() # see registry.py -# ── Lookup indexes (built once at import time) ────────────────────────── +# Example NodeInfo: +# NodeInfo( +# name="potential_levers", +# output_files=("002-9-potential_levers_raw.json", "002-10-potential_levers.json"), +# inputs=(NodeInput(from_node="setup", artifact_path="001-2-plan.txt"), ...), +# source_code_files=("worker_plan_internal/plan/nodes/potential_levers.py", ...), +# ) -_STAGE_BY_NAME: dict[str, StageInfo] = {s.name: s for s in STAGES} -_STAGE_BY_FILENAME: dict[str, StageInfo] = {} -for _stage in STAGES: - for _fname in _stage.output_files: - _STAGE_BY_FILENAME[_fname] = _stage +_NODE_BY_NAME: dict[str, NodeInfo] = {n.name: n for n in NODES} +_NODE_BY_FILENAME: dict[str, NodeInfo] = {} +for _node in NODES: + for _fname in _node.output_files: + _NODE_BY_FILENAME[_fname] = _node -def find_stage_by_filename(filename: str) -> Optional[StageInfo]: - """Given an output filename, return the stage that produced it.""" - return _STAGE_BY_FILENAME.get(filename) +def find_node_by_filename(filename: str) -> NodeInfo | None: + return _NODE_BY_FILENAME.get(filename) -def get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Path]]: - """Return (stage_name, file_path) pairs for upstream stages whose primary output exists on disk.""" - stage = _STAGE_BY_NAME.get(stage_name) - if stage is None: +def get_upstream_files(node_name: str, output_dir: Path) -> list[tuple[str, Path]]: + node = _NODE_BY_NAME.get(node_name) + if node is None: return [] - result = [] - for upstream_name in stage.upstream_stages: - upstream_stage = _STAGE_BY_NAME.get(upstream_name) - if upstream_stage is None: - continue - primary_path = output_dir / 
upstream_stage.primary_output - if primary_path.exists(): - result.append((upstream_name, primary_path)) + for inp in node.inputs: + artifact_path = output_dir / inp.artifact_path + if artifact_path.exists(): + result.append((inp.from_node, artifact_path)) return result -def get_source_code_paths(stage_name: str) -> list[Path]: - """Return absolute paths to source code files for a stage.""" - stage = _STAGE_BY_NAME.get(stage_name) - if stage is None: +def get_source_code_paths(node_name: str) -> list[Path]: + node = _NODE_BY_NAME.get(node_name) + if node is None: return [] - return [_SOURCE_BASE / f for f in stage.source_code_files] + return [_SOURCE_BASE / f for f in node.source_code_files] ``` - - [ ] **Step 5: Run tests to verify they pass** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_registry.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_registry.py -v` Expected: All tests PASS - [ ] **Step 6: Commit** ```bash -git add worker_plan/worker_plan_internal/flaw_tracer/__init__.py worker_plan/worker_plan_internal/flaw_tracer/registry.py worker_plan/worker_plan_internal/flaw_tracer/tests/__init__.py worker_plan/worker_plan_internal/flaw_tracer/tests/test_registry.py -git commit -m "feat: add flaw_tracer registry with full pipeline DAG mapping" +git add worker_plan/worker_plan_internal/rca/__init__.py worker_plan/worker_plan_internal/rca/registry.py worker_plan/worker_plan_internal/rca/tests/__init__.py worker_plan/worker_plan_internal/rca/tests/test_registry.py +git commit -m "feat: add rca registry with full pipeline DAG mapping" ``` --- @@ -977,48 +262,48 @@ git commit -m "feat: add flaw_tracer registry with full pipeline DAG mapping" ### Task 2: Prompts — Pydantic Models and Prompt Builders **Files:** -- Create: `worker_plan/worker_plan_internal/flaw_tracer/prompts.py` -- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py` +- Create: 
`worker_plan/worker_plan_internal/rca/prompts.py` +- Create: `worker_plan/worker_plan_internal/rca/tests/test_prompts.py` - [ ] **Step 1: Write the failing tests** ```python -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py +# worker_plan/worker_plan_internal/rca/tests/test_prompts.py import unittest from llama_index.core.llms import ChatMessage, MessageRole -from worker_plan_internal.flaw_tracer.prompts import ( - IdentifiedFlaw, - FlawIdentificationResult, +from worker_plan_internal.rca.prompts import ( + IdentifiedProblem, + ProblemIdentificationResult, UpstreamCheckResult, SourceCodeAnalysisResult, - build_flaw_identification_messages, + build_problem_identification_messages, build_upstream_check_messages, build_source_code_analysis_messages, ) class TestPydanticModels(unittest.TestCase): - def test_identified_flaw_valid(self): - flaw = IdentifiedFlaw( + def test_identified_problem_valid(self): + problem = IdentifiedProblem( description="Budget figure is fabricated", evidence="The budget is CZK 500,000", severity="HIGH", ) - self.assertEqual(flaw.severity, "HIGH") + self.assertEqual(problem.severity, "HIGH") - def test_identified_flaw_rejects_invalid_severity(self): + def test_identified_problem_rejects_invalid_severity(self): with self.assertRaises(Exception): - IdentifiedFlaw( + IdentifiedProblem( description="test", evidence="test", severity="CRITICAL", ) - def test_flaw_identification_result(self): - result = FlawIdentificationResult(flaws=[ - IdentifiedFlaw(description="test", evidence="quote", severity="LOW"), + def test_problem_identification_result(self): + result = ProblemIdentificationResult(problems=[ + IdentifiedProblem(description="test", evidence="quote", severity="LOW"), ]) - self.assertEqual(len(result.flaws), 1) + self.assertEqual(len(result.problems), 1) def test_upstream_check_result_found(self): result = UpstreamCheckResult(found=True, evidence="quote", explanation="precursor") @@ -1038,12 +323,12 @@ class 
TestPydanticModels(unittest.TestCase): self.assertIsInstance(result.likely_cause, str) -class TestBuildFlawIdentificationMessages(unittest.TestCase): +class TestBuildProblemIdentificationMessages(unittest.TestCase): def test_returns_chat_messages(self): - messages = build_flaw_identification_messages( + messages = build_problem_identification_messages( filename="030-report.html", file_content="report content", - user_flaw_description="budget is wrong", + user_problem_description="budget is wrong", ) self.assertIsInstance(messages, list) self.assertEqual(len(messages), 2) @@ -1051,10 +336,10 @@ class TestBuildFlawIdentificationMessages(unittest.TestCase): self.assertEqual(messages[1].role, MessageRole.USER) def test_user_message_contains_inputs(self): - messages = build_flaw_identification_messages( + messages = build_problem_identification_messages( filename="025-2-executive_summary.md", file_content="# Summary\nBudget: 500k", - user_flaw_description="fabricated budget", + user_problem_description="fabricated budget", ) user_content = messages[1].content self.assertIn("025-2-executive_summary.md", user_content) @@ -1065,7 +350,7 @@ class TestBuildFlawIdentificationMessages(unittest.TestCase): class TestBuildUpstreamCheckMessages(unittest.TestCase): def test_returns_chat_messages(self): messages = build_upstream_check_messages( - flaw_description="Budget is fabricated", + problem_description="Budget is fabricated", evidence_quote="CZK 500,000", upstream_filename="005-2-project_plan.md", upstream_file_content="# Project Plan\nBudget: 500k", @@ -1073,9 +358,9 @@ class TestBuildUpstreamCheckMessages(unittest.TestCase): self.assertIsInstance(messages, list) self.assertEqual(len(messages), 2) - def test_user_message_contains_flaw_and_upstream(self): + def test_user_message_contains_problem_and_upstream(self): messages = build_upstream_check_messages( - flaw_description="Missing market sizing", + problem_description="Missing market sizing", evidence_quote="growing Czech 
market", upstream_filename="003-5-make_assumptions.md", upstream_file_content="# Assumptions\nMarket is growing", @@ -1089,7 +374,7 @@ class TestBuildUpstreamCheckMessages(unittest.TestCase): class TestBuildSourceCodeAnalysisMessages(unittest.TestCase): def test_returns_chat_messages(self): messages = build_source_code_analysis_messages( - flaw_description="Budget fabricated", + problem_description="Budget fabricated", evidence_quote="CZK 500,000", source_code_contents=[ ("stages/make_assumptions.py", "class MakeAssumptionsTask: ..."), @@ -1101,7 +386,7 @@ class TestBuildSourceCodeAnalysisMessages(unittest.TestCase): def test_user_message_contains_source_code(self): messages = build_source_code_analysis_messages( - flaw_description="Missing analysis", + problem_description="Missing analysis", evidence_quote="no data", source_code_contents=[ ("my_stage.py", "SYSTEM_PROMPT = 'Generate assumptions'"), @@ -1114,14 +399,14 @@ class TestBuildSourceCodeAnalysisMessages(unittest.TestCase): - [ ] **Step 2: Run tests to verify they fail** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_prompts.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_prompts.py -v` Expected: FAIL with `ImportError` - [ ] **Step 3: Implement prompts.py** ```python -# worker_plan/worker_plan_internal/flaw_tracer/prompts.py -"""Pydantic models and prompt builders for the flaw tracer.""" +# worker_plan/worker_plan_internal/rca/prompts.py +"""Pydantic models and prompt builders for RCA.""" from typing import Literal from pydantic import BaseModel, Field from llama_index.core.llms import ChatMessage, MessageRole @@ -1129,55 +414,55 @@ from llama_index.core.llms import ChatMessage, MessageRole # ── Pydantic models for structured LLM output ────────────────────────── -class IdentifiedFlaw(BaseModel): - """A discrete flaw found in a pipeline output file.""" - description: str = Field(description="One-sentence description of the flaw") - 
evidence: str = Field(description="Direct quote from the file demonstrating the flaw") +class IdentifiedProblem(BaseModel): + """A discrete problem found in a pipeline output file.""" + description: str = Field(description="One-sentence description of the problem") + evidence: str = Field(description="Direct quote from the file demonstrating the problem") severity: Literal["HIGH", "MEDIUM", "LOW"] = Field( description="HIGH: fabricated data or missing critical analysis. MEDIUM: weak reasoning or vague claims. LOW: minor gaps." ) -class FlawIdentificationResult(BaseModel): - """Result of analyzing a file for flaws.""" - flaws: list[IdentifiedFlaw] = Field(description="List of discrete flaws found in the file") +class ProblemIdentificationResult(BaseModel): + """Result of analyzing a file for problems.""" + problems: list[IdentifiedProblem] = Field(description="List of discrete problems found in the file") class UpstreamCheckResult(BaseModel): - """Result of checking an upstream file for a flaw precursor.""" - found: bool = Field(description="True if this file contains the flaw or a precursor to it") + """Result of checking an upstream file for a problem precursor.""" + found: bool = Field(description="True if this file contains the problem or a precursor to it") evidence: str | None = Field(description="Direct quote from the file if found, null otherwise") - explanation: str = Field(description="How this connects to the downstream flaw, or why this file is clean") + explanation: str = Field(description="How this connects to the downstream problem, or why this file is clean") class SourceCodeAnalysisResult(BaseModel): - """Result of analyzing source code at a flaw's origin stage.""" - likely_cause: str = Field(description="What in the prompt or logic likely caused the flaw") + """Result of analyzing source code at a problem's origin stage.""" + likely_cause: str = Field(description="What in the prompt or logic likely caused the problem") relevant_code_section: str = 
Field(description="The specific code or prompt text responsible") - suggestion: str = Field(description="How to fix or prevent this flaw") + suggestion: str = Field(description="How to fix or prevent this problem") # ── Prompt builders ───────────────────────────────────────────────────── -def build_flaw_identification_messages( +def build_problem_identification_messages( filename: str, file_content: str, - user_flaw_description: str, + user_problem_description: str, ) -> list[ChatMessage]: - """Build messages for Phase 1: identifying discrete flaws in a file.""" + """Build messages for Phase 1: identifying discrete problems in a file.""" system = ( "You are analyzing an intermediary file from a project planning pipeline.\n" - "The user has identified problems in this output. Identify each discrete flaw.\n" - "For each flaw, provide a short description (one sentence), a direct quote " + "The user has identified problems in this output. Identify each discrete problem.\n" + "For each problem, provide a short description (one sentence), a direct quote " "from the file as evidence, and a severity level.\n" - "Only identify real flaws — do not flag stylistic preferences or minor formatting issues.\n" + "Only identify real problems — do not flag stylistic preferences or minor formatting issues.\n" "Severity levels:\n" "- HIGH: fabricated data, invented statistics, or missing critical analysis\n" "- MEDIUM: weak reasoning, vague unsupported claims, or shallow treatment\n" "- LOW: minor gaps that don't significantly impact the plan" ) user = ( - f"User's observation:\n{user_flaw_description}\n\n" + f"User's observation:\n{user_problem_description}\n\n" f"Filename: {filename}\n" f"File content:\n{file_content}" ) @@ -1188,22 +473,22 @@ def build_flaw_identification_messages( def build_upstream_check_messages( - flaw_description: str, + problem_description: str, evidence_quote: str, upstream_filename: str, upstream_file_content: str, ) -> list[ChatMessage]: - """Build 
messages for Phase 2: checking if a flaw exists in an upstream file.""" + """Build messages for Phase 2: checking if a problem exists in an upstream file.""" system = ( - "You are tracing a flaw through a project planning pipeline to find where it originated.\n" - "A downstream file contains a flaw. You are examining an upstream file that was an input " - "to the stage that produced the flawed output.\n" + "You are tracing a problem through a project planning pipeline to find where it originated.\n" + "A downstream file contains a problem. You are examining an upstream file that was an input " + "to the stage that produced the problematic output.\n" "Determine if this upstream file contains the same problem or a precursor to it.\n" - "If YES: quote the relevant passage and explain how it connects to the downstream flaw.\n" - "If NO: explain why this file is clean regarding this specific flaw." + "If YES: quote the relevant passage and explain how it connects to the downstream problem.\n" + "If NO: explain why this file is clean regarding this specific problem." ) user = ( - f"Flaw: {flaw_description}\n" + f"Problem: {problem_description}\n" f"Evidence from downstream: {evidence_quote}\n\n" f"Upstream filename: {upstream_filename}\n" f"Upstream file content:\n{upstream_file_content}" @@ -1215,20 +500,20 @@ def build_upstream_check_messages( def build_source_code_analysis_messages( - flaw_description: str, + problem_description: str, evidence_quote: str, source_code_contents: list[tuple[str, str]], ) -> list[ChatMessage]: - """Build messages for Phase 3: analyzing source code at flaw origin. + """Build messages for Phase 3: analyzing source code at problem origin. Args: source_code_contents: list of (filename, content) tuples """ system = ( - "A flaw was introduced at this pipeline stage. The flaw exists in its output " + "A problem was introduced at this pipeline stage. 
The problem exists in its output " "but NOT in any of its inputs, so this stage created it.\n" "Examine the source code to identify what in the prompt text, logic, or processing " - "likely caused this flaw. Be specific — point to lines or prompt phrases.\n" + "likely caused this problem. Be specific — point to lines or prompt phrases.\n" "Focus on the system prompt text and the data transformation logic." ) source_sections = [] @@ -1237,7 +522,7 @@ def build_source_code_analysis_messages( source_text = "\n\n".join(source_sections) user = ( - f"Flaw: {flaw_description}\n" + f"Problem: {problem_description}\n" f"Evidence from output: {evidence_quote}\n\n" f"Source code files:\n{source_text}" ) @@ -1249,14 +534,14 @@ def build_source_code_analysis_messages( - [ ] **Step 4: Run tests to verify they pass** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_prompts.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_prompts.py -v` Expected: All tests PASS - [ ] **Step 5: Commit** ```bash -git add worker_plan/worker_plan_internal/flaw_tracer/prompts.py worker_plan/worker_plan_internal/flaw_tracer/tests/test_prompts.py -git commit -m "feat: add flaw_tracer Pydantic models and prompt builders" +git add worker_plan/worker_plan_internal/rca/prompts.py worker_plan/worker_plan_internal/rca/tests/test_prompts.py +git commit -m "feat: add rca Pydantic models and prompt builders" ``` --- @@ -1264,19 +549,19 @@ git commit -m "feat: add flaw_tracer Pydantic models and prompt builders" ### Task 3: Tracer — Recursive Algorithm **Files:** -- Create: `worker_plan/worker_plan_internal/flaw_tracer/tracer.py` -- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py` +- Create: `worker_plan/worker_plan_internal/rca/tracer.py` +- Create: `worker_plan/worker_plan_internal/rca/tests/test_tracer.py` - [ ] **Step 1: Write the failing tests** ```python -# 
worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py +# worker_plan/worker_plan_internal/rca/tests/test_tracer.py import json import unittest from pathlib import Path from tempfile import TemporaryDirectory -from worker_plan_internal.flaw_tracer.tracer import FlawTracer, FlawTraceResult, TracedFlaw, TraceEntry +from worker_plan_internal.rca.tracer import RootCauseAnalyzer, RCAResult, TracedProblem, TraceEntry from worker_plan_internal.llm_util.response_mockllm import ResponseMockLLM from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelWithInstance @@ -1288,24 +573,24 @@ def _make_executor(responses: list[str]) -> LLMExecutor: return LLMExecutor(llm_models=llm_models) -class TestFlawTraceResult(unittest.TestCase): +class TestRCAResult(unittest.TestCase): def test_dataclass_creation(self): - result = FlawTraceResult( + result = RCAResult( starting_file="030-report.html", - flaw_description="test", + problem_description="test", output_dir="/tmp/test", - flaws=[], + problems=[], llm_calls_made=0, ) self.assertEqual(result.starting_file, "030-report.html") - self.assertEqual(len(result.flaws), 0) + self.assertEqual(len(result.problems), 0) -class TestFlawTracerPhase1(unittest.TestCase): - """Test flaw identification (Phase 1) using mock LLM.""" +class TestRootCauseAnalyzerPhase1(unittest.TestCase): + """Test problem identification (Phase 1) using mock LLM.""" - def test_identify_flaws_returns_flaws(self): - """The tracer should parse LLM output into IdentifiedFlaw objects.""" + def test_identify_problems(self): + """The tracer should parse LLM output into IdentifiedProblem objects.""" with TemporaryDirectory() as d: output_dir = Path(d) # Create a minimal output file @@ -1314,9 +599,9 @@ class TestFlawTracerPhase1(unittest.TestCase): # Create upstream file so trace can proceed (output_dir / "005-2-project_plan.md").write_text("# Plan", encoding="utf-8") - # Mock LLM response for flaw identification (Phase 1) - flaw_response = 
json.dumps({ - "flaws": [ + # Mock LLM response for problem identification (Phase 1) + problem_response = json.dumps({ + "problems": [ { "description": "Budget is unvalidated", "evidence": "CZK 500,000", @@ -1337,9 +622,9 @@ class TestFlawTracerPhase1(unittest.TestCase): "suggestion": "Add validation step", }) - executor = _make_executor([flaw_response, upstream_response, source_response]) + executor = _make_executor([problem_response, upstream_response, source_response]) source_base = Path(__file__).resolve().parent.parent.parent.parent # worker_plan/ - tracer = FlawTracer( + tracer = RootCauseAnalyzer( output_dir=output_dir, llm_executor=executor, source_code_base=source_base, @@ -1348,17 +633,17 @@ class TestFlawTracerPhase1(unittest.TestCase): ) result = tracer.trace("025-2-executive_summary.md", "budget is unvalidated") - self.assertIsInstance(result, FlawTraceResult) - self.assertGreaterEqual(len(result.flaws), 1) - flaw = result.flaws[0] - self.assertEqual(flaw.description, "Budget is unvalidated") - self.assertEqual(flaw.severity, "HIGH") + self.assertIsInstance(result, RCAResult) + self.assertGreaterEqual(len(result.problems), 1) + problem = result.problems[0] + self.assertEqual(problem.description, "Budget is unvalidated") + self.assertEqual(problem.severity, "HIGH") -class TestFlawTracerUpstreamTrace(unittest.TestCase): +class TestRootCauseAnalyzerUpstreamTrace(unittest.TestCase): """Test upstream tracing (Phase 2) with a simple two-level chain.""" - def test_traces_flaw_upstream(self): + def test_traces_problem_upstream(self): with TemporaryDirectory() as d: output_dir = Path(d) # Create files for a simple chain: executive_summary -> project_plan -> setup @@ -1371,8 +656,8 @@ class TestFlawTracerUpstreamTrace(unittest.TestCase): (output_dir / "003-10-consolidate_assumptions_full.md").write_text("assumptions", encoding="utf-8") responses = [ - # Phase 1: identify flaws in executive_summary - json.dumps({"flaws": [{"description": "Budget fabricated", 
"evidence": "CZK 500,000", "severity": "HIGH"}]}), + # Phase 1: identify problems in executive_summary + json.dumps({"problems": [{"description": "Budget fabricated", "evidence": "CZK 500,000", "severity": "HIGH"}]}), # Phase 2: check each upstream of executive_summary # strategic_decisions_markdown json.dumps({"found": False, "evidence": None, "explanation": "clean"}), @@ -1380,7 +665,7 @@ class TestFlawTracerUpstreamTrace(unittest.TestCase): json.dumps({"found": False, "evidence": None, "explanation": "clean"}), # consolidate_assumptions_markdown json.dumps({"found": False, "evidence": None, "explanation": "clean"}), - # project_plan — flaw found here + # project_plan — problem found here json.dumps({"found": True, "evidence": "Budget: CZK 500,000", "explanation": "Budget originates here"}), # Now trace project_plan's upstreams # setup @@ -1395,7 +680,7 @@ class TestFlawTracerUpstreamTrace(unittest.TestCase): executor = _make_executor(responses) source_base = Path(__file__).resolve().parent.parent.parent.parent - tracer = FlawTracer( + tracer = RootCauseAnalyzer( output_dir=output_dir, llm_executor=executor, source_code_base=source_base, @@ -1404,28 +689,28 @@ class TestFlawTracerUpstreamTrace(unittest.TestCase): ) result = tracer.trace("025-2-executive_summary.md", "budget is fabricated") - self.assertEqual(len(result.flaws), 1) - flaw = result.flaws[0] + self.assertEqual(len(result.problems), 1) + problem = result.problems[0] # The trace should include at least executive_summary and project_plan - trace_stages = [entry.stage for entry in flaw.trace] + trace_stages = [entry.stage for entry in problem.trace] self.assertIn("executive_summary", trace_stages) self.assertIn("project_plan", trace_stages) - # Origin should be project_plan (flaw found there but not in its upstream) - self.assertEqual(flaw.origin_stage, "project_plan") + # Origin should be project_plan (problem found there but not in its upstream) + self.assertEqual(problem.origin_stage, "project_plan") 
-class TestFlawTracerMaxDepth(unittest.TestCase): +class TestRootCauseAnalyzerMaxDepth(unittest.TestCase): def test_respects_max_depth(self): with TemporaryDirectory() as d: output_dir = Path(d) (output_dir / "025-2-executive_summary.md").write_text("Budget: 500k", encoding="utf-8") responses = [ - json.dumps({"flaws": [{"description": "test flaw", "evidence": "500k", "severity": "LOW"}]}), + json.dumps({"problems": [{"description": "test problem", "evidence": "500k", "severity": "LOW"}]}), ] executor = _make_executor(responses) source_base = Path(__file__).resolve().parent.parent.parent.parent - tracer = FlawTracer( + tracer = RootCauseAnalyzer( output_dir=output_dir, llm_executor=executor, source_code_base=source_base, @@ -1434,21 +719,21 @@ class TestFlawTracerMaxDepth(unittest.TestCase): ) result = tracer.trace("025-2-executive_summary.md", "test") - self.assertEqual(len(result.flaws), 1) + self.assertEqual(len(result.problems), 1) # With max_depth=0, no upstream tracing happens - self.assertEqual(len(result.flaws[0].trace), 1) # only the starting file + self.assertEqual(len(result.problems[0].trace), 1) # only the starting file ``` - [ ] **Step 2: Run tests to verify they fail** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_tracer.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_tracer.py -v` Expected: FAIL with `ImportError` - [ ] **Step 3: Implement tracer.py** ```python -# worker_plan/worker_plan_internal/flaw_tracer/tracer.py -"""Recursive depth-first flaw tracer for PlanExe pipeline outputs.""" +# worker_plan/worker_plan_internal/rca/tracer.py +"""Recursive depth-first root cause analyzer for PlanExe pipeline outputs.""" import json import logging import sys @@ -1458,16 +743,16 @@ from typing import Optional from llama_index.core.llms.llm import LLM -from worker_plan_internal.flaw_tracer.registry import ( - find_stage_by_filename, +from worker_plan_internal.rca.registry import ( + 
find_node_by_filename, get_upstream_files, get_source_code_paths, ) -from worker_plan_internal.flaw_tracer.prompts import ( - FlawIdentificationResult, +from worker_plan_internal.rca.prompts import ( + ProblemIdentificationResult, UpstreamCheckResult, SourceCodeAnalysisResult, - build_flaw_identification_messages, + build_problem_identification_messages, build_upstream_check_messages, build_source_code_analysis_messages, ) @@ -1478,7 +763,7 @@ logger = logging.getLogger(__name__) @dataclass class TraceEntry: - """One hop in a flaw's upstream trace.""" + """One hop in a problem's upstream trace.""" stage: str file: str evidence: str @@ -1487,7 +772,7 @@ class TraceEntry: @dataclass class OriginInfo: - """Source code analysis at a flaw's origin stage.""" + """Source code analysis at a problem's origin stage.""" stage: str file: str source_code_files: list[str] @@ -1496,8 +781,8 @@ class OriginInfo: @dataclass -class TracedFlaw: - """A fully traced flaw with its upstream chain.""" +class TracedProblem: + """A fully traced problem with its upstream chain.""" id: str description: str severity: str @@ -1510,17 +795,17 @@ class TracedFlaw: @dataclass -class FlawTraceResult: - """Complete result of a flaw trace run.""" +class RCAResult: + """Complete result of a root cause analysis run.""" starting_file: str - flaw_description: str + problem_description: str output_dir: str - flaws: list[TracedFlaw] + problems: list[TracedProblem] llm_calls_made: int = 0 -class FlawTracer: - """Traces flaws upstream through the PlanExe pipeline DAG.""" +class RootCauseAnalyzer: + """Traces problems upstream through the PlanExe pipeline DAG.""" def __init__( self, @@ -1536,10 +821,10 @@ class FlawTracer: self.max_depth = max_depth self.verbose = verbose self._llm_calls = 0 - self._checked: set[tuple[str, str]] = set() # (stage_name, flaw_description) dedup + self._checked: set[tuple[str, str]] = set() # (node_name, problem_description) dedup - def trace(self, starting_file: str, 
flaw_description: str) -> FlawTraceResult: - """Main entry point. Identify flaws and trace each upstream.""" + def trace(self, starting_file: str, problem_description: str) -> RCAResult: + """Main entry point. Identify problems and trace each upstream.""" self._llm_calls = 0 self._checked.clear() @@ -1548,36 +833,36 @@ class FlawTracer: raise FileNotFoundError(f"Starting file not found: {file_path}") file_content = file_path.read_text(encoding="utf-8") - stage = find_stage_by_filename(starting_file) - stage_name = stage.name if stage else "unknown" + stage = find_node_by_filename(starting_file) + node_name = stage.name if stage else "unknown" - # Phase 1: Identify flaws - self._log(f"Phase 1: Identifying flaws in {starting_file}") - identified = self._identify_flaws(starting_file, file_content, flaw_description) - self._log(f" Found {len(identified.flaws)} flaw(s)") + # Phase 1: Identify problems + self._log(f"Phase 1: Identifying problems in {starting_file}") + identified = self._identify_problems(starting_file, file_content, problem_description) + self._log(f" Found {len(identified.problems)} problem(s)") - traced_flaws: list[TracedFlaw] = [] - for i, flaw in enumerate(identified.flaws): - flaw_id = f"flaw_{i + 1:03d}" - self._log(f"\nTracing {flaw_id}: {flaw.description}") + traced_problems: list[TracedProblem] = [] + for i, problem in enumerate(identified.problems): + problem_id = f"problem_{i + 1:03d}" + self._log(f"\nTracing {problem_id}: {problem.description}") starting_entry = TraceEntry( - stage=stage_name, + stage=node_name, file=starting_file, - evidence=flaw.evidence, + evidence=problem.evidence, is_origin=False, ) - traced = TracedFlaw( - id=flaw_id, - description=flaw.description, - severity=flaw.severity, - starting_evidence=flaw.evidence, + traced = TracedProblem( + id=problem_id, + description=problem.description, + severity=problem.severity, + starting_evidence=problem.evidence, trace=[starting_entry], ) if stage and self.max_depth > 0: -
self._trace_upstream(traced, stage_name, flaw.description, flaw.evidence, depth=0) + self._trace_upstream(traced, node_name, problem.description, problem.evidence, depth=0) # Mark the last trace entry as origin if no deeper origin was found if traced.origin_stage is None and traced.trace: @@ -1587,36 +872,36 @@ class FlawTracer: traced.depth = len(traced.trace) - 1 # Phase 3: Source code analysis at origin - self._analyze_source_code(traced, last.stage, flaw.description, last.evidence) + self._analyze_source_code(traced, last.stage, problem.description, last.evidence) - traced_flaws.append(traced) + traced_problems.append(traced) # Sort by depth (deepest origin first) - traced_flaws.sort(key=lambda f: f.depth, reverse=True) + traced_problems.sort(key=lambda f: f.depth, reverse=True) - return FlawTraceResult( + return RCAResult( starting_file=starting_file, - flaw_description=flaw_description, + problem_description=problem_description, output_dir=str(self.output_dir), - flaws=traced_flaws, + problems=traced_problems, llm_calls_made=self._llm_calls, ) - def _identify_flaws(self, filename: str, file_content: str, user_description: str) -> FlawIdentificationResult: - """Phase 1: Ask LLM to identify discrete flaws in the starting file.""" - messages = build_flaw_identification_messages(filename, file_content, user_description) + def _identify_problems(self, filename: str, file_content: str, user_description: str) -> ProblemIdentificationResult: + """Phase 1: Ask LLM to identify discrete problems in the starting file.""" + messages = build_problem_identification_messages(filename, file_content, user_description) - def execute(llm: LLM) -> FlawIdentificationResult: - sllm = llm.as_structured_llm(FlawIdentificationResult) + def execute(llm: LLM) -> ProblemIdentificationResult: + sllm = llm.as_structured_llm(ProblemIdentificationResult) response = sllm.chat(messages) return response.raw self._llm_calls += 1 return self.llm_executor.run(execute) - def _check_upstream(self, 
flaw_description: str, evidence: str, upstream_filename: str, upstream_content: str) -> UpstreamCheckResult: - """Phase 2: Ask LLM if a flaw exists in an upstream file.""" - messages = build_upstream_check_messages(flaw_description, evidence, upstream_filename, upstream_content) + def _check_upstream(self, problem_description: str, evidence: str, upstream_filename: str, upstream_content: str) -> UpstreamCheckResult: + """Phase 2: Ask LLM if a problem exists in an upstream file.""" + messages = build_upstream_check_messages(problem_description, evidence, upstream_filename, upstream_content) def execute(llm: LLM) -> UpstreamCheckResult: sllm = llm.as_structured_llm(UpstreamCheckResult) @@ -1628,13 +913,13 @@ class FlawTracer: def _trace_upstream( self, - traced: TracedFlaw, + traced: TracedProblem, current_stage: str, - flaw_description: str, + problem_description: str, evidence: str, depth: int, ) -> None: - """Recursively trace a flaw through upstream stages.""" + """Recursively trace a problem through upstream stages.""" if depth >= self.max_depth: traced.trace_complete = False self._log(f" Max depth {self.max_depth} reached at {current_stage}") @@ -1646,16 +931,16 @@ class FlawTracer: found_upstream = False for upstream_name, upstream_path in upstream_files: - dedup_key = (upstream_name, flaw_description) + dedup_key = (upstream_name, problem_description) if dedup_key in self._checked: - self._log(f" Skipping {upstream_name} (already checked for this flaw)") + self._log(f" Skipping {upstream_name} (already checked for this problem)") continue self._checked.add(dedup_key) upstream_content = upstream_path.read_text(encoding="utf-8") self._log(f" Checking upstream: {upstream_name} ({upstream_path.name})") - result = self._check_upstream(flaw_description, evidence, upstream_path.name, upstream_content) + result = self._check_upstream(problem_description, evidence, upstream_path.name, upstream_content) if result.found: self._log(f" -> FOUND in {upstream_name}") @@ 
-1670,7 +955,7 @@ class FlawTracer: # Recurse deeper self._trace_upstream( - traced, upstream_name, flaw_description, + traced, upstream_name, problem_description, result.evidence or evidence, depth + 1, ) # After recursion, if origin was found deeper, stop tracing other branches @@ -1678,7 +963,7 @@ class FlawTracer: return if not found_upstream: - # Current stage is the origin — flaw exists here but not in any upstream + # Current stage is the origin — problem exists here but not in any upstream traced.origin_stage = current_stage traced.depth = len(traced.trace) # Mark the current stage entry as origin @@ -1686,9 +971,9 @@ class FlawTracer: if entry.stage == current_stage: entry.is_origin = True - def _analyze_source_code(self, traced: TracedFlaw, stage_name: str, flaw_description: str, evidence: str) -> None: + def _analyze_source_code(self, traced: TracedProblem, node_name: str, problem_description: str, evidence: str) -> None: """Phase 3: Analyze source code at the origin stage.""" - source_paths = get_source_code_paths(stage_name) + source_paths = get_source_code_paths(node_name) if not source_paths: return @@ -1701,8 +986,8 @@ class FlawTracer: if not source_contents: return - self._log(f" Phase 3: Analyzing source code for {stage_name}") - messages = build_source_code_analysis_messages(flaw_description, evidence, source_contents) + self._log(f" Phase 3: Analyzing source code for {node_name}") + messages = build_source_code_analysis_messages(problem_description, evidence, source_contents) def execute(llm: LLM) -> SourceCodeAnalysisResult: sllm = llm.as_structured_llm(SourceCodeAnalysisResult) @@ -1714,14 +999,14 @@ class FlawTracer: analysis = self.llm_executor.run(execute) source_file_names = [name for name, _ in source_contents] traced.origin = OriginInfo( - stage=stage_name, + stage=node_name, file=traced.trace[-1].file if traced.trace else "", source_code_files=source_file_names, likely_cause=analysis.likely_cause, suggestion=analysis.suggestion, ) 
except Exception as e: - logger.warning(f"Source code analysis failed for {stage_name}: {e}") + logger.warning(f"Source code analysis failed for {node_name}: {e}") def _log(self, message: str) -> None: """Print to stderr if verbose mode is enabled.""" @@ -1731,14 +1016,14 @@ class FlawTracer: - [ ] **Step 4: Run tests to verify they pass** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_tracer.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_tracer.py -v` Expected: All tests PASS - [ ] **Step 5: Commit** ```bash -git add worker_plan/worker_plan_internal/flaw_tracer/tracer.py worker_plan/worker_plan_internal/flaw_tracer/tests/test_tracer.py -git commit -m "feat: add flaw_tracer recursive tracing algorithm" +git add worker_plan/worker_plan_internal/rca/tracer.py worker_plan/worker_plan_internal/rca/tests/test_tracer.py +git commit -m "feat: add rca recursive tracing algorithm" ``` --- @@ -1746,36 +1031,36 @@ git commit -m "feat: add flaw_tracer recursive tracing algorithm" ### Task 4: Output — JSON and Markdown Reports **Files:** -- Create: `worker_plan/worker_plan_internal/flaw_tracer/output.py` -- Create: `worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py` +- Create: `worker_plan/worker_plan_internal/rca/output.py` +- Create: `worker_plan/worker_plan_internal/rca/tests/test_output.py` - [ ] **Step 1: Write the failing tests** ```python -# worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py +# worker_plan/worker_plan_internal/rca/tests/test_output.py import json import unittest from pathlib import Path from tempfile import TemporaryDirectory -from worker_plan_internal.flaw_tracer.tracer import ( - FlawTraceResult, - TracedFlaw, +from worker_plan_internal.rca.tracer import ( + RCAResult, + TracedProblem, TraceEntry, OriginInfo, ) -from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report +from worker_plan_internal.rca.output 
import write_json_report, write_markdown_report -def _make_sample_result() -> FlawTraceResult: - """Create a sample FlawTraceResult for testing.""" - return FlawTraceResult( +def _make_sample_result() -> RCAResult: + """Create a sample RCAResult for testing.""" + return RCAResult( starting_file="025-2-executive_summary.md", - flaw_description="Budget is unvalidated", + problem_description="Budget is unvalidated", output_dir="/tmp/test_output", - flaws=[ - TracedFlaw( - id="flaw_001", + problems=[ + TracedProblem( + id="problem_001", description="Budget of CZK 500,000 is unvalidated", severity="HIGH", starting_evidence="CZK 500,000", @@ -1794,8 +1079,8 @@ def _make_sample_result() -> FlawTraceResult: ), depth=3, ), - TracedFlaw( - id="flaw_002", + TracedProblem( + id="problem_002", description="Missing market sizing", severity="MEDIUM", starting_evidence="growing Czech market", @@ -1813,54 +1098,54 @@ def _make_sample_result() -> FlawTraceResult: class TestWriteJsonReport(unittest.TestCase): def test_writes_valid_json(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.json" + output_path = Path(d) / "root_cause_analysis.json" result = _make_sample_result() write_json_report(result, output_path) self.assertTrue(output_path.exists()) data = json.loads(output_path.read_text(encoding="utf-8")) self.assertIn("input", data) - self.assertIn("flaws", data) + self.assertIn("problems", data) self.assertIn("summary", data) def test_json_contains_correct_summary(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.json" + output_path = Path(d) / "root_cause_analysis.json" result = _make_sample_result() write_json_report(result, output_path) data = json.loads(output_path.read_text(encoding="utf-8")) summary = data["summary"] - self.assertEqual(summary["total_flaws"], 2) - self.assertEqual(summary["deepest_origin_stage"], "make_assumptions") + self.assertEqual(summary["total_problems"], 2) + 
self.assertEqual(summary["deepest_origin_node"], "make_assumptions") self.assertEqual(summary["deepest_origin_depth"], 3) self.assertEqual(summary["llm_calls_made"], 8) - def test_json_flaws_sorted_by_depth(self): + def test_json_problems_sorted_by_depth(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.json" + output_path = Path(d) / "root_cause_analysis.json" result = _make_sample_result() write_json_report(result, output_path) data = json.loads(output_path.read_text(encoding="utf-8")) - depths = [f["depth"] for f in data["flaws"]] + depths = [f["depth"] for f in data["problems"]] self.assertEqual(depths, sorted(depths, reverse=True)) class TestWriteMarkdownReport(unittest.TestCase): def test_writes_markdown_file(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() write_markdown_report(result, output_path) self.assertTrue(output_path.exists()) content = output_path.read_text(encoding="utf-8") - self.assertIn("# Flaw Trace Report", content) + self.assertIn("# Root Cause Analysis Report", content) - def test_markdown_contains_flaw_details(self): + def test_markdown_contains_problem_details(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() write_markdown_report(result, output_path) @@ -1871,60 +1156,60 @@ class TestWriteMarkdownReport(unittest.TestCase): def test_markdown_contains_trace_table(self): with TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" + output_path = Path(d) / "root_cause_analysis.md" result = _make_sample_result() write_markdown_report(result, output_path) content = output_path.read_text(encoding="utf-8") - self.assertIn("| Stage |", content) + self.assertIn("| Node |", content) self.assertIn("| File |", content) def test_empty_result_produces_valid_markdown(self): with 
TemporaryDirectory() as d: - output_path = Path(d) / "flaw_trace.md" - result = FlawTraceResult( + output_path = Path(d) / "root_cause_analysis.md" + result = RCAResult( starting_file="030-report.html", - flaw_description="test", + problem_description="test", output_dir="/tmp", - flaws=[], + problems=[], llm_calls_made=1, ) write_markdown_report(result, output_path) content = output_path.read_text(encoding="utf-8") - self.assertIn("Flaws found:** 0", content) + self.assertIn("Problems found:** 0", content) ``` - [ ] **Step 2: Run tests to verify they fail** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_output.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_output.py -v` Expected: FAIL with `ImportError` - [ ] **Step 3: Implement output.py** ```python -# worker_plan/worker_plan_internal/flaw_tracer/output.py -"""JSON and markdown report generation for flaw trace results.""" +# worker_plan/worker_plan_internal/rca/output.py +"""JSON and markdown report generation for root cause analysis results.""" import json from datetime import datetime, UTC from pathlib import Path -from worker_plan_internal.flaw_tracer.tracer import FlawTraceResult +from worker_plan_internal.rca.tracer import RCAResult -def write_json_report(result: FlawTraceResult, output_path: Path) -> None: - """Write the flaw trace result as a JSON file.""" +def write_json_report(result: RCAResult, output_path: Path) -> None: + """Write the RCA result as a JSON file.""" data = { "input": { "starting_file": result.starting_file, - "flaw_description": result.flaw_description, + "problem_description": result.problem_description, "output_dir": result.output_dir, "timestamp": datetime.now(UTC).isoformat(), }, - "flaws": [], + "problems": [], "summary": { - "total_flaws": len(result.flaws), - "deepest_origin_stage": None, + "total_problems": len(result.problems), + "deepest_origin_node": None, "deepest_origin_depth": 0, "llm_calls_made": 
result.llm_calls_made, }, @@ -1933,12 +1218,12 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: max_depth = 0 deepest_stage = None - for flaw in result.flaws: - flaw_data = { - "id": flaw.id, - "description": flaw.description, - "severity": flaw.severity, - "starting_evidence": flaw.starting_evidence, + for problem in result.problems: + problem_data = { + "id": problem.id, + "description": problem.description, + "severity": problem.severity, + "starting_evidence": problem.starting_evidence, "trace": [ { "stage": entry.stage, @@ -1946,85 +1231,85 @@ def write_json_report(result: FlawTraceResult, output_path: Path) -> None: "evidence": entry.evidence, "is_origin": entry.is_origin, } - for entry in flaw.trace + for entry in problem.trace ], "origin": None, - "depth": flaw.depth, - "trace_complete": flaw.trace_complete, + "depth": problem.depth, + "trace_complete": problem.trace_complete, } - if flaw.origin: - flaw_data["origin"] = { - "stage": flaw.origin.stage, - "file": flaw.origin.file, - "source_code_files": flaw.origin.source_code_files, - "likely_cause": flaw.origin.likely_cause, - "suggestion": flaw.origin.suggestion, + if problem.origin: + problem_data["origin"] = { + "stage": problem.origin.stage, + "file": problem.origin.file, + "source_code_files": problem.origin.source_code_files, + "likely_cause": problem.origin.likely_cause, + "suggestion": problem.origin.suggestion, } - if flaw.depth > max_depth: - max_depth = flaw.depth - deepest_stage = flaw.origin_stage + if problem.depth > max_depth: + max_depth = problem.depth + deepest_stage = problem.origin_stage - data["flaws"].append(flaw_data) + data["problems"].append(problem_data) - data["summary"]["deepest_origin_stage"] = deepest_stage + data["summary"]["deepest_origin_node"] = deepest_stage data["summary"]["deepest_origin_depth"] = max_depth output_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") -def write_markdown_report(result: 
FlawTraceResult, output_path: Path) -> None: - """Write the flaw trace result as a markdown report.""" +def write_markdown_report(result: RCAResult, output_path: Path) -> None: + """Write the RCA result as a markdown report.""" lines: list[str] = [] - lines.append("# Flaw Trace Report") + lines.append("# Root Cause Analysis Report") lines.append("") lines.append(f"**Input:** {result.starting_file}") - lines.append(f"**Flaws found:** {len(result.flaws)}") + lines.append(f"**Problems found:** {len(result.problems)}") - if result.flaws: - deepest = max(result.flaws, key=lambda f: f.depth) + if result.problems: + deepest = max(result.problems, key=lambda f: f.depth) lines.append(f"**Deepest origin:** {deepest.origin_stage} (depth {deepest.depth})") lines.append(f"**LLM calls:** {result.llm_calls_made}") lines.append("") - for flaw in result.flaws: + for problem in result.problems: lines.append("---") lines.append("") - lines.append(f"## {flaw.id.replace('_', ' ').title()} ({flaw.severity}): {flaw.description}") + lines.append(f"## {problem.id.replace('_', ' ').title()} ({problem.severity}): {problem.description}") lines.append("") # Trace chain summary - stage_names = [entry.stage for entry in flaw.trace] + node_names = [entry.stage for entry in problem.trace] chain_parts = [] - for name in stage_names: - if name == flaw.origin_stage: + for name in node_names: + if name == problem.origin_stage: chain_parts.append(f"**{name}** (origin)") else: chain_parts.append(name) lines.append(f"**Trace:** {' -> '.join(chain_parts)}") lines.append("") - if not flaw.trace_complete: + if not problem.trace_complete: lines.append("*Note: trace incomplete — max depth reached.*") lines.append("") # Trace table - lines.append("| Stage | File | Evidence |") + lines.append("| Node | File | Evidence |") lines.append("|-------|------|----------|") - for entry in flaw.trace: - stage_cell = f"**{entry.stage}**" if entry.is_origin else entry.stage + for entry in problem.trace: + node_cell = 
f"**{entry.stage}**" if entry.is_origin else entry.stage evidence_cell = _escape_table_cell(entry.evidence) - lines.append(f"| {stage_cell} | {entry.file} | {evidence_cell} |") + lines.append(f"| {node_cell} | {entry.file} | {evidence_cell} |") lines.append("") # Origin analysis - if flaw.origin: - lines.append(f"**Root cause:** {flaw.origin.likely_cause}") + if problem.origin: + lines.append(f"**Root cause:** {problem.origin.likely_cause}") lines.append("") - lines.append(f"**Source files:** {', '.join(flaw.origin.source_code_files)}") + lines.append(f"**Source files:** {', '.join(problem.origin.source_code_files)}") lines.append("") - lines.append(f"**Suggestion:** {flaw.origin.suggestion}") + lines.append(f"**Suggestion:** {problem.origin.suggestion}") lines.append("") output_path.write_text("\n".join(lines), encoding="utf-8") @@ -2037,14 +1322,14 @@ def _escape_table_cell(text: str) -> str: - [ ] **Step 4: Run tests to verify they pass** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/test_output.py -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/test_output.py -v` Expected: All tests PASS - [ ] **Step 5: Commit** ```bash -git add worker_plan/worker_plan_internal/flaw_tracer/output.py worker_plan/worker_plan_internal/flaw_tracer/tests/test_output.py -git commit -m "feat: add flaw_tracer JSON and markdown report generation" +git add worker_plan/worker_plan_internal/rca/output.py worker_plan/worker_plan_internal/rca/tests/test_output.py +git commit -m "feat: add rca JSON and markdown report generation" ``` --- @@ -2052,19 +1337,19 @@ git commit -m "feat: add flaw_tracer JSON and markdown report generation" ### Task 5: CLI Entry Point **Files:** -- Create: `worker_plan/worker_plan_internal/flaw_tracer/__main__.py` +- Create: `worker_plan/worker_plan_internal/rca/__main__.py` - [ ] **Step 1: Implement __main__.py** ```python -# worker_plan/worker_plan_internal/flaw_tracer/__main__.py -"""CLI entry point 
for the flaw tracer. +# worker_plan/worker_plan_internal/rca/__main__.py +"""CLI entry point for RCA. Usage: - python -m worker_plan_internal.flaw_tracer \ + python -m worker_plan_internal.rca \ --dir /path/to/output \ --file 030-report.html \ - --flaw "The budget appears unvalidated..." \ + --problem "The budget appears unvalidated..." \ --output-dir /path/to/output \ --max-depth 15 \ --verbose @@ -2073,15 +1358,15 @@ import argparse import sys from pathlib import Path -from worker_plan_internal.flaw_tracer.tracer import FlawTracer -from worker_plan_internal.flaw_tracer.output import write_json_report, write_markdown_report +from worker_plan_internal.rca.tracer import RootCauseAnalyzer +from worker_plan_internal.rca.output import write_json_report, write_markdown_report from worker_plan_internal.llm_util.llm_executor import LLMExecutor, LLMModelFromName, RetryConfig from worker_plan_internal.llm_factory import get_llm_names_by_priority def main() -> None: parser = argparse.ArgumentParser( - description="Trace flaws in PlanExe reports upstream to their root cause.", + description="Trace problems in PlanExe reports upstream to their root cause.", ) parser.add_argument( "--dir", required=True, type=Path, @@ -2092,16 +1377,16 @@ def main() -> None: help="Starting file to analyze (relative to --dir)", ) parser.add_argument( - "--flaw", required=True, - help="Text description of the observed flaw(s)", + "--problem", required=True, + help="Text description of the observed problem(s)", ) parser.add_argument( "--output-dir", type=Path, default=None, - help="Where to write flaw_trace.json and flaw_trace.md (defaults to --dir)", + help="Where to write root_cause_analysis.json and root_cause_analysis.md (defaults to --dir)", ) parser.add_argument( "--max-depth", type=int, default=15, - help="Maximum upstream hops per flaw (default: 15)", + help="Maximum upstream hops per problem (default: 15)", ) parser.add_argument( "--verbose", action="store_true", @@ -2138,7 +1423,7 @@ def 
main() -> None: # Source code base is the worker_plan/ directory source_code_base = Path(__file__).resolve().parent.parent.parent - tracer = FlawTracer( + tracer = RootCauseAnalyzer( output_dir=output_dir, llm_executor=executor, source_code_base=source_code_base, @@ -2146,19 +1431,19 @@ def main() -> None: verbose=args.verbose, ) - print(f"Tracing flaws in {starting_file}...", file=sys.stderr) - result = tracer.trace(starting_file, args.flaw) + print(f"Tracing problems in {starting_file}...", file=sys.stderr) + result = tracer.trace(starting_file, args.problem) # Write reports - json_path = report_dir / "flaw_trace.json" - md_path = report_dir / "flaw_trace.md" + json_path = report_dir / "root_cause_analysis.json" + md_path = report_dir / "root_cause_analysis.md" write_json_report(result, json_path) write_markdown_report(result, md_path) # Print summary - print(f"\nFlaws found: {len(result.flaws)}", file=sys.stderr) - if result.flaws: - deepest = max(result.flaws, key=lambda f: f.depth) + print(f"\nProblems found: {len(result.problems)}", file=sys.stderr) + if result.problems: + deepest = max(result.problems, key=lambda f: f.depth) print(f"Deepest origin: {deepest.origin_stage} (depth {deepest.depth})", file=sys.stderr) print(f"LLM calls made: {result.llm_calls_made}", file=sys.stderr) print(f"\nReports written:", file=sys.stderr) @@ -2172,19 +1457,19 @@ if __name__ == "__main__": - [ ] **Step 2: Verify the module is importable** -Run: `cd worker_plan && python -c "from worker_plan_internal.flaw_tracer.__main__ import main; print('OK')"` +Run: `cd worker_plan && python -c "from worker_plan_internal.rca.__main__ import main; print('OK')"` Expected: `OK` - [ ] **Step 3: Verify --help works** -Run: `cd worker_plan && python -m worker_plan_internal.flaw_tracer --help` -Expected: Help text showing `--dir`, `--file`, `--flaw`, `--output-dir`, `--max-depth`, `--verbose` +Run: `cd worker_plan && python -m worker_plan_internal.rca --help` +Expected: Help text showing 
`--dir`, `--file`, `--problem`, `--output-dir`, `--max-depth`, `--verbose` - [ ] **Step 4: Commit** ```bash -git add worker_plan/worker_plan_internal/flaw_tracer/__main__.py -git commit -m "feat: add flaw_tracer CLI entry point" +git add worker_plan/worker_plan_internal/rca/__main__.py +git commit -m "feat: add rca CLI entry point" ``` --- @@ -2194,9 +1479,9 @@ git commit -m "feat: add flaw_tracer CLI entry point" **Files:** - No new files -- [ ] **Step 1: Run the full test suite for the flaw_tracer package** +- [ ] **Step 1: Run the full test suite for the rca package** -Run: `cd worker_plan && python -m pytest worker_plan_internal/flaw_tracer/tests/ -v` +Run: `cd worker_plan && python -m pytest worker_plan_internal/rca/tests/ -v` Expected: All tests PASS - [ ] **Step 2: Run the broader worker_plan test suite to check for regressions** @@ -2209,5 +1494,5 @@ Expected: No new failures If tests required fixes, commit them: ```bash git add -u -git commit -m "fix: address test issues in flaw_tracer" +git commit -m "fix: address test issues in rca" ``` diff --git a/docs/superpowers/specs/2026-04-05-rca-design.md b/docs/superpowers/specs/2026-04-05-rca-design.md index e8dd344b3..49d4a0deb 100644 --- a/docs/superpowers/specs/2026-04-05-rca-design.md +++ b/docs/superpowers/specs/2026-04-05-rca-design.md @@ -7,18 +7,18 @@ ## Goal -A CLI tool that takes a PlanExe output directory, a starting file, and a flaw description, then recursively traces the flaw upstream through the DAG of intermediary files to find where it originated. Produces both JSON and markdown output. Built on PlanExe's existing LLM infrastructure so it can eventually become a pipeline stage. +A CLI tool that takes a PlanExe output directory, a starting file, and a problem description, then recursively traces the problem upstream through the DAG of intermediary files to find where it originated. Produces both JSON and markdown output. 
Built on PlanExe's existing LLM infrastructure so it can eventually become a pipeline stage. ## Architecture -The tool performs a recursive depth-first search through the pipeline DAG. Starting from a downstream file where a flaw is observed, it walks upstream one hop at a time — reading input files, asking an LLM whether the flaw or a precursor exists there, and continuing until it reaches a stage where the flaw exists in the output but not in any inputs. At that origin point, it reads the stage's source code to identify the likely cause. +The tool performs a recursive depth-first search through the pipeline DAG. Starting from a downstream file where a problem is observed, it walks upstream one hop at a time — reading input files, asking an LLM whether the problem or a precursor exists there, and continuing until it reaches a node where the problem exists in the output but not in any inputs. At that origin point, it reads the node's source code to identify the likely cause. -Three LLM prompts drive the analysis: flaw identification (once at the start), upstream checking (at each hop), and source code analysis (at each origin). All use Pydantic models for structured output and LLMExecutor for fallback resilience. +Three LLM prompts drive the analysis: problem identification (once at the start), upstream checking (at each hop), and source code analysis (at each origin). All use Pydantic models for structured output and LLMExecutor for fallback resilience. ## Components ``` -worker_plan/worker_plan_internal/flaw_tracer/ +worker_plan/worker_plan_internal/rca/ __init__.py __main__.py — CLI entry point (argparse, LLM setup, orchestration) registry.py — Static DAG mapping: stages, output files, dependencies, source code paths @@ -33,16 +33,16 @@ A static Python data structure mapping the full pipeline topology. 
Each entry de ```python @dataclass -class StageInfo: +class NodeInfo: name: str # e.g., "potential_levers" output_files: list[str] # e.g., ["002-9-potential_levers_raw.json", "002-10-potential_levers.json"] - upstream_stages: list[str] # e.g., ["setup", "identify_purpose", "plan_type", "extract_constraints"] + inputs: list[str] # e.g., ["setup", "identify_purpose", "plan_type", "extract_constraints"] source_code_files: list[str] # Relative to worker_plan/, e.g., ["worker_plan_internal/plan/stages/potential_levers.py", "worker_plan_internal/lever/identify_potential_levers.py"] ``` The registry covers all ~48 pipeline stages. Key functions: -- `find_stage_by_filename(filename: str) -> StageInfo | None` — Given an output filename, return the stage that produced it. +- `find_node_by_filename(filename: str) -> NodeInfo | None` — Given an output filename, return the node that produced it. - `get_upstream_files(stage_name: str, output_dir: Path) -> list[tuple[str, Path]]` — Return `(stage_name, file_path)` pairs for all upstream stages, resolved against the output directory. Skip files that don't exist on disk. When a stage has multiple output files (e.g., both `_raw.json` and `.json`), prefer the clean/processed file since that's what downstream stages consume. If only the raw file exists, use that. - `get_source_code_paths(stage_name: str) -> list[Path]` — Return absolute paths to source code files for a stage.
@@ -53,103 +53,103 @@ The mapping is derived from the Luigi task classes (`requires()` and `output()` Three Pydantic models for structured LLM output: ```python -class IdentifiedFlaw(BaseModel): - description: str = Field(description="One-sentence description of the flaw") - evidence: str = Field(description="Direct quote from the file demonstrating the flaw") +class IdentifiedProblem(BaseModel): + description: str = Field(description="One-sentence description of the problem") + evidence: str = Field(description="Direct quote from the file demonstrating the problem") severity: Literal["HIGH", "MEDIUM", "LOW"] = Field( description="HIGH: fabricated data or missing critical analysis. MEDIUM: weak reasoning or vague claims. LOW: minor gaps." ) -class FlawIdentificationResult(BaseModel): - flaws: list[IdentifiedFlaw] = Field(description="List of discrete flaws found in the file") +class ProblemIdentificationResult(BaseModel): + problems: list[IdentifiedProblem] = Field(description="List of discrete problems found in the file") class UpstreamCheckResult(BaseModel): - found: bool = Field(description="True if this file contains the flaw or a precursor to it") + found: bool = Field(description="True if this file contains the problem or a precursor to it") evidence: str | None = Field(description="Direct quote from the file if found, null otherwise") - explanation: str = Field(description="How this connects to the downstream flaw, or why this file is clean") + explanation: str = Field(description="How this connects to the downstream problem, or why this file is clean") class SourceCodeAnalysisResult(BaseModel): - likely_cause: str = Field(description="What in the prompt or logic likely caused the flaw") + likely_cause: str = Field(description="What in the prompt or logic likely caused the problem") relevant_code_section: str = Field(description="The specific code or prompt text responsible") - suggestion: str = Field(description="How to fix or prevent this flaw") + 
suggestion: str = Field(description="How to fix or prevent this problem") ``` Three prompt-building functions, each returning a `list[ChatMessage]`: -**`build_flaw_identification_prompt(filename, file_content, user_flaw_description)`** +**`build_problem_identification_messages(filename, file_content, user_problem_description)`** System message: ``` You are analyzing an intermediary file from a project planning pipeline. -The user has identified problems in this output. Identify each discrete flaw. -For each flaw, provide a short description, a direct quote as evidence, and a severity level. -Only identify real flaws — do not flag stylistic preferences or minor formatting issues. +The user has identified problems in this output. Identify each discrete problem. +For each problem, provide a short description, a direct quote as evidence, and a severity level. +Only identify real problems — do not flag stylistic preferences or minor formatting issues. ``` -User message contains the filename, file content, and the user's flaw description. +User message contains the filename, file content, and the user's problem description. -**`build_upstream_check_prompt(flaw_description, evidence_quote, upstream_filename, upstream_file_content)`** +**`build_upstream_check_messages(problem_description, evidence_quote, upstream_filename, upstream_file_content)`** System message: ``` -You are tracing a flaw through a project planning pipeline to find where it originated. -A downstream file contains a flaw. You are examining an upstream file that was an input -to the stage that produced the flawed output. Determine if this upstream file contains +You are tracing a problem through a project planning pipeline to find where it originated. +A downstream file contains a problem. You are examining an upstream file that was an input +to the stage that produced the problematic output. Determine if this upstream file contains the same problem or a precursor to it. 
``` -User message contains the flaw details and the upstream file content. +User message contains the problem details and the upstream file content. -**`build_source_code_analysis_prompt(flaw_description, evidence_quote, source_code_contents)`** +**`build_source_code_analysis_messages(problem_description, evidence_quote, source_code_contents)`** System message: ``` -A flaw was introduced at this pipeline stage. The flaw exists in its output but NOT +A problem was introduced at this pipeline stage. The problem exists in its output but NOT in any of its inputs. Examine the source code to identify what in the prompt text, -logic, or processing likely caused this flaw. Be specific — point to lines or prompt phrases. +logic, or processing likely caused this problem. Be specific — point to lines or prompt phrases. ``` -User message contains the flaw details and the concatenated source code. +User message contains the problem details and the concatenated source code. ### `tracer.py` — Recursive Tracing Algorithm ```python -class FlawTracer: +class RootCauseAnalyzer: def __init__(self, output_dir: Path, llm_executor: LLMExecutor, source_code_base: Path, max_depth: int = 15, verbose: bool = False): ... - def trace(self, starting_file: str, flaw_description: str) -> FlawTraceResult: + def trace(self, starting_file: str, problem_description: str) -> RCAResult: """Main entry point. Returns the complete trace result.""" ... ``` The `trace` method implements three phases: -**Phase 1 — Identify flaws.** -Read the starting file. Build the flaw identification prompt with the file content and user's description. Call the LLM via `LLMExecutor.run()` using `llm.as_structured_llm(FlawIdentificationResult)`. Returns a list of `IdentifiedFlaw` objects. +**Phase 1 — Identify problems.** +Read the starting file. Build the problem identification prompt with the file content and user's description. 
Call the LLM via `LLMExecutor.run()` using `llm.as_structured_llm(ProblemIdentificationResult)`. Returns a list of `IdentifiedProblem` objects. **Phase 2 — Recursive upstream trace.** -For each identified flaw, call `_trace_flaw_upstream(flaw, stage_name, current_file, depth)`: +For each identified problem, call `_trace_upstream(problem, node_name, current_file, depth)`: -1. Look up the current stage's upstream stages via the registry. -2. For each upstream stage, resolve its output files on disk. +1. Look up the current node's upstream nodes via the registry. +2. For each upstream node, resolve its output files on disk. 3. Read each upstream file. Build the upstream check prompt. Call the LLM. -4. If `found=True`: append to the trace chain and recurse into that stage's upstream dependencies. +4. If `found=True`: append to the trace chain and recurse into that node's upstream dependencies. 5. If `found=False`: this branch is clean, stop. 6. If depth reaches `max_depth`: stop and mark trace as incomplete. -**Deduplication:** Track which `(stage_name, flaw_description)` pairs have already been analyzed. If two flaws converge on the same upstream file, reuse the earlier result. +**Deduplication:** Track which `(node_name, problem_description)` pairs have already been analyzed. If two problems converge on the same upstream file, reuse the earlier result. -**Multiple upstream branches:** When a stage has multiple upstream inputs and the flaw is found in more than one, follow all branches. The trace can fork — the JSON output represents this as a list of trace entries per flaw (each entry has a stage and file), ordered from downstream to upstream. +**Multiple upstream branches:** When a node has multiple upstream inputs and the problem is found in more than one, follow all branches. The trace can fork — the JSON output represents this as a list of trace entries per problem (each entry has a node and file), ordered from downstream to upstream. 
**Phase 3 — Source code analysis at origin.** -When a flaw is found in a stage's output but not in any of its inputs, that stage is the origin. Read the source code files for that stage (via registry). Build the source code analysis prompt. Call the LLM. Attach the result to the flaw's origin data. +When a problem is found in a node's output but not in any of its inputs, that node is the origin. Read the source code files for that node (via registry). Build the source code analysis prompt. Call the LLM. Attach the result to the problem's origin data. ### `output.py` — Report Generation Two functions: -**`write_json_report(result: FlawTraceResult, output_path: Path)`** +**`write_json_report(result: RCAResult, output_path: Path)`** Writes the full trace as JSON: @@ -157,32 +157,32 @@ Writes the full trace as JSON: { "input": { "starting_file": "030-report.html", - "flaw_description": "...", + "problem_description": "...", "output_dir": "/path/to/output", "timestamp": "2026-04-05T14:30:00Z" }, - "flaws": [ + "problems": [ { - "id": "flaw_001", + "id": "problem_001", "description": "Budget of CZK 500,000 is unvalidated", "severity": "HIGH", "starting_evidence": "quote from starting file...", "trace": [ { - "stage": "executive_summary", + "node": "executive_summary", "file": "025-2-executive_summary.md", "evidence": "...", "is_origin": false }, { - "stage": "make_assumptions", + "node": "make_assumptions", "file": "003-5-make_assumptions.md", "evidence": "...", "is_origin": true } ], "origin": { - "stage": "make_assumptions", + "node": "make_assumptions", "file": "003-5-make_assumptions.md", "source_code_files": ["stages/make_assumptions.py", "assumption/make_assumptions.py"], "likely_cause": "The prompt asks the LLM to...", @@ -192,33 +192,33 @@ Writes the full trace as JSON: } ], "summary": { - "total_flaws": 3, - "deepest_origin_stage": "make_assumptions", + "total_problems": 3, + "deepest_origin_node": "make_assumptions", "deepest_origin_depth": 3, "llm_calls_made": 
12 } } ``` -**`write_markdown_report(result: FlawTraceResult, output_path: Path)`** +**`write_markdown_report(result: RCAResult, output_path: Path)`** Writes a human-readable report: ```markdown -# Flaw Trace Report +# Root Cause Analysis Report **Input:** 030-report.html -**Flaws found:** 3 +**Problems found:** 3 **Deepest origin:** make_assumptions (depth 3) --- -## Flaw 1 (HIGH): Budget of CZK 500,000 is unvalidated +## Problem 1 (HIGH): Budget of CZK 500,000 is unvalidated **Trace:** executive_summary -> project_plan -> **make_assumptions** (origin) -| Stage | File | Evidence | -|-------|------|----------| +| Node | File | Evidence | +|------|------|----------| | executive_summary | 025-2-executive_summary.md | "The budget is CZK 500,000..." | | project_plan | 005-2-project_plan.md | "Estimated budget: CZK 500,000..." | | **make_assumptions** | 003-5-make_assumptions.md | "Assume total budget..." | @@ -229,15 +229,15 @@ without requiring external data sources... **Suggestion:** Add a validation step that... ``` -Flaws are sorted by depth (deepest origin first) so the most upstream root cause appears at the top. +Problems are sorted by depth (deepest origin first) so the most upstream root cause appears at the top. ### `__main__.py` — CLI Entry Point ``` -python -m worker_plan_internal.flaw_tracer \ +python -m worker_plan_internal.rca \ --dir /path/to/output \ --file 030-report.html \ - --flaw "The budget is CZK 500,000 but this number appears unvalidated..." \ + --problem "The budget is CZK 500,000 but this number appears unvalidated..." \ --output-dir /path/to/output \ --max-depth 15 \ --verbose @@ -246,16 +246,16 @@ python -m worker_plan_internal.flaw_tracer \ Arguments: - `--dir` (required): Path to the output directory containing intermediary files. - `--file` (required): Starting file to analyze, relative to `--dir`. -- `--flaw` (required): Text description of the observed flaw(s). 
-- `--output-dir` (optional): Where to write `flaw_trace.json` and `flaw_trace.md`. Defaults to `--dir`. -- `--max-depth` (optional): Maximum upstream hops per flaw. Default 15. +- `--problem` (required): Text description of the observed problem(s). +- `--output-dir` (optional): Where to write `root_cause_analysis.json` and `root_cause_analysis.md`. Defaults to `--dir`. +- `--max-depth` (optional): Maximum upstream hops per problem. Default 15. - `--verbose` (optional): Print each LLM call and result to stderr as the trace runs. Orchestration: 1. Parse arguments. 2. Load model profile via `PlanExeLLMConfig.load()` and create `LLMExecutor` with priority-ordered models from the profile. -3. Create `FlawTracer` instance. -4. Call `tracer.trace(starting_file, flaw_description)`. +3. Create `RootCauseAnalyzer` instance. +4. Call `analyzer.trace(starting_file, problem_description)`. 5. Write JSON and markdown reports via `output.py`. 6. Print summary to stdout. @@ -270,7 +270,7 @@ Orchestration: ## Scope Boundaries **In scope:** -- CLI tool with `--dir`, `--file`, `--flaw`, `--output-dir`, `--max-depth`, `--verbose`. +- CLI tool with `--dir`, `--file`, `--problem`, `--output-dir`, `--max-depth`, `--verbose`. - Static registry of all ~48 pipeline stages with dependencies and source code paths. - Recursive depth-first upstream tracing with three LLM prompt types. - JSON + markdown output sorted by trace depth. diff --git a/worker_plan/worker_plan_internal/rca/README.md b/worker_plan/worker_plan_internal/rca/README.md index 4f303b763..df83c481d 100644 --- a/worker_plan/worker_plan_internal/rca/README.md +++ b/worker_plan/worker_plan_internal/rca/README.md @@ -1,6 +1,6 @@ # Root Cause Analysis (RCA) for PlanExe -Given a problem observed in a pipeline output, this tool traces upstream through the DAG of intermediary artifacts to find where the problem originated and classify its root cause. 
+Given a problem observed in a pipeline output, this tool traces upstream through the DAG of intermediary artifacts to find where the problem originated and classify its root cause. The classification (`prompt_fixable`, `domain_complexity`, `missing_input`) tells the self-improve loop whether a problem can be fixed automatically by editing a prompt, or whether it requires human input or is an inherent domain limitation. ## How it works From 557cffcacaf490529bff3e2364ece8c142311fcb Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Wed, 8 Apr 2026 16:09:35 +0200 Subject: [PATCH 37/37] fix: consistent depth calculation in RCA tracer Both the fallback path and the upstream-origin path now use len(trace) - 1, fixing an off-by-one that inflated depth when the origin was found after checking upstream inputs. Co-Authored-By: Claude Opus 4.6 (1M context) --- worker_plan/worker_plan_internal/rca/tracer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker_plan/worker_plan_internal/rca/tracer.py b/worker_plan/worker_plan_internal/rca/tracer.py index bc17bf1fa..c3d3ca20e 100644 --- a/worker_plan/worker_plan_internal/rca/tracer.py +++ b/worker_plan/worker_plan_internal/rca/tracer.py @@ -283,7 +283,7 @@ def _trace_upstream( if not found_upstream: # Current node is the origin — problem exists here but not in any upstream traced.origin_node = current_node - traced.depth = len(traced.trace) + traced.depth = len(traced.trace) - 1 self._events.log("origin_found", node=current_node, depth=traced.depth) # Mark the current node entry as origin for entry in traced.trace: