From 34358cb0cb983fe098f18b769cf84c0a6d3f8350 Mon Sep 17 00:00:00 2001 From: bgagent Date: Thu, 4 Jun 2026 13:39:08 -0400 Subject: [PATCH 1/3] feat(agent): add in-pipeline pre-PR self-review phase (#262) Adds an optional self-review phase between agent execution and post-hooks where the LLM critiques its own cumulative diff before the PR is created. This improves first-pass PR quality by catching bugs, style issues, and test gaps before human review. - New self_review.py orchestration module with run_self_review() - New prompts/self_review.py with focused review prompt template - TaskConfig extended with self_review_enabled and self_review_max_turns - Fields threaded through build_config, get_config, server, pipeline - Fail-open: self-review errors never block PR creation - Uses remaining turns/budget from original allocation (capped) - Feature is opt-in (disabled by default) --- agent/src/config.py | 7 + agent/src/models.py | 3 + agent/src/pipeline.py | 21 ++ agent/src/prompts/__init__.py | 1 + agent/src/prompts/self_review.py | 40 ++++ agent/src/self_review.py | 198 +++++++++++++++++++ agent/src/server.py | 9 + agent/tests/test_self_review.py | 316 +++++++++++++++++++++++++++++++ 8 files changed, 595 insertions(+) create mode 100644 agent/src/prompts/self_review.py create mode 100644 agent/src/self_review.py create mode 100644 agent/tests/test_self_review.py diff --git a/agent/src/config.py b/agent/src/config.py index bf4ad57c..da18e90b 100644 --- a/agent/src/config.py +++ b/agent/src/config.py @@ -471,6 +471,8 @@ def build_config( initial_approval_gate_count: int = 0, approval_gate_cap: int | None = None, attachments: list[dict] | None = None, + self_review_enabled: bool = False, + self_review_max_turns: int = 5, ) -> TaskConfig: """Build and validate configuration from explicit parameters. @@ -583,6 +585,8 @@ def build_config( initial_approval_gate_count=initial_approval_gate_count, approval_gate_cap=approval_gate_cap, attachments=validated_attachments, + self_review_enabled=self_review_enabled, + self_review_max_turns=self_review_max_turns, ) @@ -607,6 +611,9 @@ def get_config() -> TaskConfig: # an unreachable ``traces//`` key. trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"), user_id=os.environ.get("USER_ID", ""), + self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower() + in ("1", "true", "yes"), + self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")), ) except ValueError as e: print(f"ERROR: {e}", file=sys.stderr) diff --git a/agent/src/models.py b/agent/src/models.py index 0befafda..6b4132a9 100644 --- a/agent/src/models.py +++ b/agent/src/models.py @@ -240,6 +240,9 @@ class TaskConfig(BaseModel): # Attachments from the orchestrator payload (Phase 3). Validated as # AttachmentConfig models. Empty list for tasks without attachments. attachments: list[AttachmentConfig] = Field(default_factory=list) + # Self-review: optional LLM diff critique before PR creation. + self_review_enabled: bool = False + self_review_max_turns: int = 5 # Cap on turns allocated to self-review @model_validator(mode="after") def _validate_trace_requires_user_id(self) -> Self: diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index 7718cb48..c09ea93e 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -36,6 +36,7 @@ ) from progress_writer import _ProgressWriter from prompt_builder import build_system_prompt, discover_project_config +from self_review import run_self_review from shell import log, log_error_cw from system_prompt import SYSTEM_PROMPT from telemetry import ( @@ -589,6 +590,8 @@ def run_task( trace: bool = False, user_id: str = "", attachments: list[dict] | None = None, + self_review_enabled: bool = False, + self_review_max_turns: int = 5, ) -> dict: """Run the full agent pipeline and return a serialized result dict. @@ -628,6 +631,8 @@ def run_task( initial_approval_gate_count=initial_approval_gate_count, approval_gate_cap=approval_gate_cap, attachments=attachments, + self_review_enabled=self_review_enabled, + self_review_max_turns=self_review_max_turns, ) # Inject Cedar policies into config for the PolicyEngine in runner.py @@ -1006,6 +1011,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: ) ensure_pr_strategy = "create" + # Self-review phase: LLM critiques its own diff before PR creation. + # Runs between cancel-check and post-hooks. Fail-open: errors here + # never block PR creation. + with task_span("task.self_review"): + review_result = run_self_review( + config, setup, agent_result, trajectory, progress + ) + if review_result is not None: + # Accumulate turns and cost from the review phase + agent_result.turns += review_result.turns + agent_result.num_turns += review_result.num_turns or review_result.turns + if review_result.cost_usd is not None: + agent_result.cost_usd = ( + (agent_result.cost_usd or 0.0) + review_result.cost_usd + ) + # Post-hooks (agent_result is guaranteed set by the try/except above) with task_span("task.post_hooks") as post_span: # Safety net: commit any uncommitted tracked changes (skip for read-only tasks) diff --git a/agent/src/prompts/__init__.py b/agent/src/prompts/__init__.py index 60c5b2c0..8334c10b 100644 --- a/agent/src/prompts/__init__.py +++ b/agent/src/prompts/__init__.py @@ -13,6 +13,7 @@ from .new_task import NEW_TASK_WORKFLOW from .pr_iteration import PR_ITERATION_WORKFLOW from .pr_review import PR_REVIEW_WORKFLOW +from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT from .web_research import WEB_RESEARCH_PROMPT DEFAULT_WORKFLOW_ID = "coding/new-task-v1" diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py new file mode 100644 index 00000000..9a6d109a --- /dev/null +++ b/agent/src/prompts/self_review.py @@ -0,0 +1,40 @@ +"""Self-review prompt template for pre-PR diff critique.""" + +SELF_REVIEW_PROMPT = """\ +You are reviewing your own work before it becomes a pull request. Below is the \ +cumulative diff of all changes on this branch compared to the base branch. + + +{diff} + + +## Task context + +{task_description} + +## Review checklist + +Examine the diff carefully for: + +1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \ +incorrect assumptions about data shapes or API contracts. +2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \ +race conditions. +3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \ +insecure defaults, OWASP Top 10 issues. +4. **Style & consistency** — Naming conventions, code style violations relative to \ +the surrounding codebase, unnecessary complexity. +5. **Test gaps** — Important behaviour that is untested, assertions that don't \ +verify the right thing, missing edge-case coverage. + +## Instructions + +- If you find issues, fix them directly: edit the files, run the build/tests to \ +verify your fixes, and commit the changes. +- If no issues are found, stop immediately — do not make changes for the sake of \ +making changes. +- Do NOT refactor code that was not part of the original diff unless it has a \ +concrete bug or security issue. +- Keep fixes minimal and focused. Each fix should be a separate commit with a \ +clear message. +""" diff --git a/agent/src/self_review.py b/agent/src/self_review.py new file mode 100644 index 00000000..a73932db --- /dev/null +++ b/agent/src/self_review.py @@ -0,0 +1,198 @@ +"""Self-review orchestration: LLM critiques its own diff before PR creation.""" + +from __future__ import annotations + +import asyncio +import subprocess +from typing import TYPE_CHECKING + +from prompts.self_review import SELF_REVIEW_PROMPT +from shell import log + +if TYPE_CHECKING: + from models import AgentResult, RepoSetup, TaskConfig + from progress_writer import _ProgressWriter + from telemetry import _TrajectoryWriter + +# Diff truncation limit (characters). Large diffs are cut at hunk boundaries. +_MAX_DIFF_CHARS = 60_000 + +# Minimal system prompt for the self-review agent invocation. +_REVIEW_SYSTEM_PROMPT = """\ +You are a code reviewer working inside the repository {repo_url} on branch {branch_name}. +Your working directory is {repo_dir}. + +You have full access to the filesystem and can run commands. Fix any issues you \ +find directly — edit files, run the build, and commit fixes. Keep changes minimal \ +and focused. + +Do NOT open a pull request or push. Just fix issues and commit locally. +""" + + +def _get_diff(repo_dir: str, default_branch: str) -> str: + """Generate the cumulative diff of the branch vs origin/{default_branch}.""" + try: + result = subprocess.run( + ["git", "diff", f"origin/{default_branch}...HEAD"], + cwd=repo_dir, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode != 0: + log("WARN", f"self_review: git diff failed (exit {result.returncode})") + return "" + return result.stdout + except (subprocess.TimeoutExpired, OSError) as e: + log("WARN", f"self_review: git diff error: {type(e).__name__}: {e}") + return "" + + +def _truncate_diff(diff: str, max_chars: int = _MAX_DIFF_CHARS) -> str: + """Truncate diff at a hunk boundary if it exceeds max_chars. + + Cuts at the last complete hunk (line starting with '@@') that fits + within the limit, appending a truncation notice. + """ + if len(diff) <= max_chars: + return diff + + # Find the last hunk header that starts before max_chars + truncated = diff[:max_chars] + last_hunk = truncated.rfind("\n@@") + if last_hunk > 0: + # Cut just before this hunk header + truncated = truncated[:last_hunk] + else: + # No hunk boundary found — hard-cut at max_chars + last_newline = truncated.rfind("\n") + if last_newline > 0: + truncated = truncated[:last_newline] + + total_lines = diff.count("\n") + kept_lines = truncated.count("\n") + truncated += ( + f"\n\n... [diff truncated: showing ~{kept_lines} of ~{total_lines} lines; " + f"{len(diff) - len(truncated)} chars omitted] ..." + ) + return truncated + + +def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str: + """Build a minimal system prompt for the self-review agent.""" + return _REVIEW_SYSTEM_PROMPT.format( + repo_url=config.repo_url, + branch_name=setup.branch, + repo_dir=setup.repo_dir, + ) + + +def run_self_review( + config: TaskConfig, + setup: RepoSetup, + agent_result: AgentResult, + trajectory: _TrajectoryWriter, + progress: _ProgressWriter, +) -> AgentResult | None: + """Run the self-review phase: LLM critiques its own diff and fixes issues. + + Returns the AgentResult from the review phase, or None if skipped. + Fail-open: errors are logged but never block the pipeline. + """ + # Skip condition: feature disabled + if not config.self_review_enabled: + log("TASK", "self_review: disabled (self_review_enabled=False)") + return None + + # Skip condition: read-only workflows produce no diff to review + if config.read_only: + log("TASK", "self_review: skipped for read-only workflow") + return None + + # Compute remaining turns + used_turns = agent_result.turns or 0 + remaining_turns = config.max_turns - used_turns + review_turns = min(remaining_turns, config.self_review_max_turns) + if review_turns <= 0: + log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})") + return None + + # Compute remaining budget + review_budget: float | None = None + if config.max_budget_usd is not None: + used_cost = agent_result.cost_usd or 0.0 + remaining_budget = config.max_budget_usd - used_cost + if remaining_budget <= 0: + log( + "TASK", + f"self_review: no remaining budget " + f"(used=${used_cost:.2f}, max=${config.max_budget_usd:.2f})", + ) + return None + review_budget = remaining_budget + + # Get the diff + diff = _get_diff(setup.repo_dir, setup.default_branch) + if not diff.strip(): + log("TASK", "self_review: no diff found — skipping") + return None + + # Truncate if needed + diff = _truncate_diff(diff) + + # Build the review prompt + task_desc = config.task_description or f"Issue #{config.issue_number}" + user_prompt = SELF_REVIEW_PROMPT.format(diff=diff, task_description=task_desc) + system_prompt = _build_review_system_prompt(config, setup) + + # Build a modified config for the review run + review_config = config.model_copy( + update={ + "max_turns": review_turns, + "max_budget_usd": review_budget, + } + ) + + log( + "TASK", + f"self_review: starting (turns={review_turns}, " + f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, " + f"diff_chars={len(diff)})", + ) + progress.write_agent_milestone( + "self_review_started", + f"turns={review_turns} diff_chars={len(diff)}", + ) + + try: + from runner import run_agent + + review_result = asyncio.run( + run_agent( + user_prompt, + system_prompt, + review_config, + cwd=setup.repo_dir, + trajectory=trajectory, + ) + ) + except Exception as e: + # Fail-open: self-review errors never block the pipeline + log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}") + progress.write_agent_milestone( + "self_review_complete", + f"status=error error={type(e).__name__}: {e}", + ) + return None + + log( + "TASK", + f"self_review: complete (status={review_result.status}, " + f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})", + ) + progress.write_agent_milestone( + "self_review_complete", + f"status={review_result.status} turns={review_result.turns}", + ) + return review_result diff --git a/agent/src/server.py b/agent/src/server.py index e1114374..ea615ce9 100644 --- a/agent/src/server.py +++ b/agent/src/server.py @@ -405,6 +405,8 @@ def _run_task_background( user_id: str = "", workload_access_token: str = "", attachments: list[dict] | None = None, + self_review_enabled: bool = False, + self_review_max_turns: int = 5, ) -> None: """Run the agent task in a background thread.""" global _background_pipeline_failed @@ -488,6 +490,8 @@ def _run_task_background( trace=trace, user_id=user_id, attachments=attachments, + self_review_enabled=self_review_enabled, + self_review_max_turns=self_review_max_turns, ) _background_pipeline_failed = False except Exception as e: @@ -576,6 +580,9 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict: channel_source = inp.get("channel_source", "") or "" channel_metadata = inp.get("channel_metadata") or {} attachments = inp.get("attachments") or [] + # Self-review (opt-in): LLM critiques its own diff before PR creation. + self_review_enabled = inp.get("self_review_enabled") is True + self_review_max_turns = int(inp.get("self_review_max_turns", 5)) # ``trace`` is strictly opt-in (design §10.1). Accept only real # booleans from the orchestrator — a string "false" would otherwise # flip the flag on. @@ -649,6 +656,8 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict: "user_id": user_id, "workload_access_token": workload_access_token, "attachments": attachments, + "self_review_enabled": self_review_enabled, + "self_review_max_turns": self_review_max_turns, } diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py new file mode 100644 index 00000000..bd39741a --- /dev/null +++ b/agent/tests/test_self_review.py @@ -0,0 +1,316 @@ +"""Unit tests for self_review.py — self-review orchestration module.""" + +from unittest.mock import MagicMock, patch + +from models import AgentResult, RepoSetup, TaskConfig +from self_review import _MAX_DIFF_CHARS, _get_diff, _truncate_diff, run_self_review + + +def _make_config(**overrides) -> TaskConfig: + """Create a minimal TaskConfig for testing.""" + defaults = { + "repo_url": "owner/repo", + "github_token": "ghp_test123", + "aws_region": "us-east-1", + "task_description": "Fix the bug", + "max_turns": 10, + "self_review_enabled": True, + "self_review_max_turns": 5, + "task_type": "new_task", + } + defaults.update(overrides) + return TaskConfig(**defaults) + + +def _make_setup(**overrides) -> RepoSetup: + """Create a minimal RepoSetup for testing.""" + defaults = { + "repo_dir": "/workspace/repo", + "branch": "feat/123-fix", + "default_branch": "main", + } + defaults.update(overrides) + return RepoSetup(**defaults) + + +def _make_agent_result(**overrides) -> AgentResult: + """Create a minimal AgentResult for testing.""" + defaults = { + "status": "success", + "turns": 5, + "num_turns": 5, + "cost_usd": 0.50, + } + defaults.update(overrides) + return AgentResult(**defaults) + + +class TestSkipConditions: + """Test all conditions that cause self-review to be skipped.""" + + def test_skip_when_disabled(self): + config = _make_config(self_review_enabled=False) + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_for_pr_review_task_type(self): + config = _make_config( + task_type="pr_review", + pr_number="42", + task_description="", + issue_number="", + ) + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_when_no_remaining_turns(self): + config = _make_config(max_turns=5, self_review_max_turns=5) + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_when_no_remaining_budget(self): + config = _make_config(max_budget_usd=1.0) + setup = _make_setup() + agent_result = _make_agent_result(cost_usd=1.0) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + @patch("self_review._get_diff", return_value="") + def test_skip_when_empty_diff(self, mock_diff): + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + @patch("self_review._get_diff", return_value=" \n \n ") + def test_skip_when_whitespace_only_diff(self, mock_diff): + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + +class TestGetDiff: + """Test _get_diff helper.""" + + @patch("self_review.subprocess.run") + def test_returns_diff_output(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="diff --git a/f\n+line\n") + result = _get_diff("/repo", "main") + assert result == "diff --git a/f\n+line\n" + mock_run.assert_called_once_with( + ["git", "diff", "origin/main...HEAD"], + cwd="/repo", + capture_output=True, + text=True, + timeout=60, + ) + + @patch("self_review.subprocess.run") + def test_returns_empty_on_failure(self, mock_run): + mock_run.return_value = MagicMock(returncode=1, stdout="") + result = _get_diff("/repo", "main") + assert result == "" + + @patch("self_review.subprocess.run") + def test_returns_empty_on_timeout(self, mock_run): + import subprocess + + mock_run.side_effect = subprocess.TimeoutExpired(cmd="git", timeout=60) + result = _get_diff("/repo", "main") + assert result == "" + + @patch("self_review.subprocess.run") + def test_uses_custom_default_branch(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="some diff") + _get_diff("/repo", "develop") + mock_run.assert_called_once_with( + ["git", "diff", "origin/develop...HEAD"], + cwd="/repo", + capture_output=True, + text=True, + timeout=60, + ) + + +class TestTruncateDiff: + """Test _truncate_diff helper.""" + + def test_no_truncation_needed(self): + short_diff = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n+new line\n" + result = _truncate_diff(short_diff) + assert result == short_diff + + def test_truncates_at_hunk_boundary(self): + # Build a diff that exceeds the limit with multiple hunks + hunk1 = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n" + "+line\n" * 100 + hunk2 = "\n@@ -100,3 +101,4 @@\n" + "+more\n" * 100 + big_diff = hunk1 + hunk2 + result = _truncate_diff(big_diff, max_chars=len(hunk1) + 10) + # Should truncate before hunk2's @@ marker + assert "@@ -100,3" not in result + assert "truncated" in result + + def test_hard_cut_when_no_hunk_boundary(self): + # Single hunk that exceeds max + big_diff = "+x\n" * 30000 + result = _truncate_diff(big_diff, max_chars=100) + assert len(result) < len(big_diff) + assert "truncated" in result + + def test_exact_limit_no_truncation(self): + diff = "a" * _MAX_DIFF_CHARS + result = _truncate_diff(diff) + assert result == diff + + def test_one_over_limit_truncates(self): + diff = "a" * (_MAX_DIFF_CHARS + 1) + result = _truncate_diff(diff) + assert "truncated" in result + + +class TestBudgetAndTurnComputation: + """Test that budget and turn limits are computed correctly.""" + + @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") + @patch("self_review.asyncio.run") + def test_review_turns_capped_at_self_review_max_turns(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult(status="success", turns=2, num_turns=2) + config = _make_config(max_turns=100, self_review_max_turns=3) + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # The review config passed to run_agent should have max_turns=3 + call_args = mock_asyncio_run.call_args + coro = call_args[0][0] + # Close the coroutine to avoid warnings + coro.close() + + @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") + @patch("self_review.asyncio.run") + def test_review_turns_uses_remaining_when_less_than_cap(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult(status="success", turns=1, num_turns=1) + config = _make_config(max_turns=8, self_review_max_turns=5) + setup = _make_setup() + agent_result = _make_agent_result(turns=6) # Only 2 remaining + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # Should use min(2, 5) = 2 turns + call_args = mock_asyncio_run.call_args + coro = call_args[0][0] + coro.close() + + +class TestHappyPath: + """Test the full happy path with mocked run_agent.""" + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_returns_review_result(self, mock_asyncio_run, mock_diff): + review_agent_result = AgentResult( + status="success", turns=2, num_turns=2, cost_usd=0.10 + ) + mock_asyncio_run.return_value = review_agent_result + + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + + assert result is not None + assert result.status == "success" + assert result.turns == 2 + assert result.cost_usd == 0.10 + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_writes_progress_milestones(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult( + status="success", turns=1, num_turns=1, cost_usd=0.05 + ) + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # Should write both started and complete milestones + milestone_calls = progress.write_agent_milestone.call_args_list + assert len(milestone_calls) == 2 + assert milestone_calls[0][0][0] == "self_review_started" + assert milestone_calls[1][0][0] == "self_review_complete" + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_fail_open_on_agent_error(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.side_effect = RuntimeError("SDK crashed") + + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + # Should not raise + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + # Should still write milestones + milestone_calls = progress.write_agent_milestone.call_args_list + assert milestone_calls[0][0][0] == "self_review_started" + assert milestone_calls[1][0][0] == "self_review_complete" + assert "error" in milestone_calls[1][0][1] + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult( + status="success", turns=1, num_turns=1, cost_usd=0.03 + ) + config = _make_config(max_budget_usd=None) + setup = _make_setup() + agent_result = _make_agent_result(cost_usd=None) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is not None + assert result.status == "success" From bf720e8f43a95365abaac730c9e916e7f8bb017b Mon Sep 17 00:00:00 2001 From: bgagent Date: Thu, 4 Jun 2026 15:29:18 -0400 Subject: [PATCH 2/3] feat(agent): post self-review summary as PR comment (#263) After the self-review phase completes, the agent writes a structured summary of findings to `.self-review-summary.md`. The pipeline reads this file, posts it as a PR comment via `gh pr comment`, then deletes it so it never appears in the PR diff. Fail-open: if the file is missing or the comment fails to post, the pipeline continues normally. --- agent/src/pipeline.py | 5 + agent/src/post_hooks.py | 62 +++++++++++ agent/src/prompts/self_review.py | 21 ++++ agent/src/self_review.py | 41 ++++++++ agent/tests/test_self_review.py | 170 ++++++++++++++++++++++++++++++- 5 files changed, 298 insertions(+), 1 deletion(-) diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index c09ea93e..f486aa83 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -31,6 +31,7 @@ _extract_agent_notes, ensure_committed, ensure_pr, + post_self_review_comment, verify_build, verify_lint, ) @@ -1049,6 +1050,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: if pr_url: progress.write_agent_milestone("pr_created", pr_url) + # Post self-review summary as PR comment (if self-review ran and produced findings) + if pr_url and review_result is not None: + post_self_review_comment(setup.repo_dir, pr_url, config) + # Memory write — capture task episode and repo learnings memory_written = False effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "") diff --git a/agent/src/post_hooks.py b/agent/src/post_hooks.py index 058a1e4c..dc840657 100644 --- a/agent/src/post_hooks.py +++ b/agent/src/post_hooks.py @@ -392,6 +392,68 @@ def ensure_pr( return None +def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool: + """Post the self-review summary as a PR comment. + + Reads the summary file written by the self-review agent, formats it as a + comment, and posts it via `gh pr comment`. Fail-open: exceptions are logged + but never propagated. + + Returns True if a comment was posted, False otherwise. + """ + from self_review import read_self_review_summary + + try: + summary = read_self_review_summary(repo_dir) + except Exception as e: + log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}") + return False + + if not summary: + log("POST", "post_self_review_comment: no summary file found — skipping") + return False + + # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123) + match = re.search(r"/pull/(\d+)", pr_url) + if not match: + log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}") + return False + pr_number = match.group(1) + + comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}" + + try: + result = subprocess.run( + [ + "gh", + "pr", + "comment", + pr_number, + "--repo", + config.repo_url, + "--body", + comment_body, + ], + cwd=repo_dir, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode == 0: + log("POST", f"Self-review summary posted as comment on PR #{pr_number}") + return True + stderr = result.stderr.strip()[:200] if result.stderr else "" + log( + "WARN", + f"post_self_review_comment: gh pr comment failed " + f"(rc={result.returncode}): {stderr}", + ) + return False + except (subprocess.TimeoutExpired, OSError) as e: + log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}") + return False + + def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None: """Extract the "## Agent notes" section from the PR body. diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py index 9a6d109a..25cf47a2 100644 --- a/agent/src/prompts/self_review.py +++ b/agent/src/prompts/self_review.py @@ -37,4 +37,25 @@ concrete bug or security issue. - Keep fixes minimal and focused. Each fix should be a separate commit with a \ clear message. + +## Summary output + +After completing your review (whether you made fixes or not), write a file \ +`.self-review-summary.md` in the repository root with your findings in this format: + +```markdown +### Self-Review Summary + +**Findings:** +**Fixes applied:** + +#### Issues found + +- : +``` + +If no issues were found, write the file with: "No issues found — code looks good." + +This file is a pipeline artifact and will be deleted automatically — it will NOT \ +appear in the pull request. """ diff --git a/agent/src/self_review.py b/agent/src/self_review.py index a73932db..69114d68 100644 --- a/agent/src/self_review.py +++ b/agent/src/self_review.py @@ -3,6 +3,8 @@ from __future__ import annotations import asyncio +import contextlib +import os import subprocess from typing import TYPE_CHECKING @@ -196,3 +198,42 @@ def run_self_review( f"status={review_result.status} turns={review_result.turns}", ) return review_result + + +_SUMMARY_FILENAME = ".self-review-summary.md" + + +def read_self_review_summary(repo_dir: str) -> str | None: + """Read and delete the self-review summary file. + + The self-review agent writes `.self-review-summary.md` in the repo root. + This function reads the content, removes the file (so it never appears in + the PR), and returns the content. Returns None if the file doesn't exist. + """ + summary_path = os.path.join(repo_dir, _SUMMARY_FILENAME) + if not os.path.isfile(summary_path): + return None + + try: + with open(summary_path) as f: + content = f.read() + except OSError as e: + log("WARN", f"self_review: failed to read summary file: {type(e).__name__}: {e}") + return None + + # Remove the file so it doesn't end up in the PR + try: + os.remove(summary_path) + except OSError as e: + log("WARN", f"self_review: failed to delete summary file: {type(e).__name__}: {e}") + + # If the file was staged by the agent, unstage it + with contextlib.suppress(subprocess.TimeoutExpired, OSError): + subprocess.run( + ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME], + cwd=repo_dir, + capture_output=True, + timeout=30, + ) + + return content.strip() if content.strip() else None diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py index bd39741a..e2a817f1 100644 --- a/agent/tests/test_self_review.py +++ b/agent/tests/test_self_review.py @@ -1,9 +1,17 @@ """Unit tests for self_review.py — self-review orchestration module.""" +import os from unittest.mock import MagicMock, patch from models import AgentResult, RepoSetup, TaskConfig -from self_review import _MAX_DIFF_CHARS, _get_diff, _truncate_diff, run_self_review +from self_review import ( + _MAX_DIFF_CHARS, + _SUMMARY_FILENAME, + _get_diff, + _truncate_diff, + read_self_review_summary, + run_self_review, +) def _make_config(**overrides) -> TaskConfig: @@ -314,3 +322,163 @@ def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff): result = run_self_review(config, setup, agent_result, trajectory, progress) assert result is not None assert result.status == "success" + + +class TestReadSelfReviewSummary: + """Tests for read_self_review_summary — reads and cleans up the summary file.""" + + def test_returns_content_when_file_exists(self, tmp_path): + summary_content = ( + "### Self-Review Summary\n\n" + "**Findings:** 2\n" + "**Fixes applied:** 1\n\n" + "#### Issues found\n\n" + "- Security: hardcoded token — fixed\n" + "- Style: inconsistent naming — not fixed (cosmetic)\n" + ) + (tmp_path / _SUMMARY_FILENAME).write_text(summary_content) + + result = read_self_review_summary(str(tmp_path)) + + assert result == summary_content.strip() + + def test_returns_none_when_file_missing(self, tmp_path): + result = read_self_review_summary(str(tmp_path)) + assert result is None + + def test_deletes_file_after_reading(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("No issues found — code looks good.") + + read_self_review_summary(str(tmp_path)) + + assert not (tmp_path / _SUMMARY_FILENAME).exists() + + def test_returns_none_for_empty_file(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text(" \n\n ") + + result = read_self_review_summary(str(tmp_path)) + + assert result is None + + def test_returns_none_for_whitespace_only(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("\t\n ") + + result = read_self_review_summary(str(tmp_path)) + + assert result is None + + @patch("self_review.subprocess.run") + def test_runs_git_rm_cached_for_cleanup(self, mock_run, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("Some findings") + mock_run.return_value = MagicMock(returncode=0) + + read_self_review_summary(str(tmp_path)) + + mock_run.assert_called_once_with( + ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME], + cwd=str(tmp_path), + capture_output=True, + timeout=30, + ) + + @patch("self_review.subprocess.run", side_effect=OSError("git not found")) + def test_git_rm_failure_does_not_block(self, mock_run, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("Findings here") + + # Should not raise even if git rm fails + result = read_self_review_summary(str(tmp_path)) + + assert result == "Findings here" + + +class TestPostSelfReviewComment: + """Tests for post_hooks.post_self_review_comment.""" + + def test_posts_comment_on_success(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1\n**Fixes applied:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/42" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is True + call_args = mock_run.call_args[0][0] + assert call_args[0:3] == ["gh", "pr", "comment"] + assert "42" in call_args + assert "owner/repo" in call_args + + def test_returns_false_when_no_summary(self, tmp_path): + from post_hooks import post_self_review_comment + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/42" + + result = post_self_review_comment(str(tmp_path), pr_url, config) + assert result is False + + def test_returns_false_on_invalid_pr_url(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("Some findings") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/issues/42" + + result = post_self_review_comment(str(tmp_path), pr_url, config) + assert result is False + + def test_returns_false_on_gh_failure(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/99" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found") + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is False + + def test_fail_open_on_exception(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/5" + + with patch("post_hooks.subprocess.run", side_effect=OSError("network error")): + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is False + + def test_comment_body_includes_header(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 0\nNo issues.") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/7" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + post_self_review_comment(str(tmp_path), pr_url, config) + + call_args = mock_run.call_args[0][0] + body_idx = call_args.index("--body") + 1 + body = call_args[body_idx] + assert "Self-Review Summary" in body + assert "**Findings:** 0" in body From 32112308a0a34f990923e02d845edda4b1ac108d Mon Sep 17 00:00:00 2001 From: bgagent Date: Mon, 22 Jun 2026 16:08:03 -0400 Subject: [PATCH 3/3] refactor(agent): expose self-review as a configurable workflow step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review feedback: rather than a hardcoded inline pipeline phase gated by a per-task API flag, self-review is now a first-class workflow StepKind, configurable as a step in the coding workflow YAML. - Add `self_review` StepKind to workflow.schema.json (with a `max_turns` step field, 1-20, default 5) + models.Step; excluded from repo-less workflows (schema conditional + validator rule 3) since it diffs/re-runs the agent against the cloned tree. - Register `_handle_self_review` in the runner's STEP_HANDLERS; it wraps run_self_review and accumulates the review loop's turns/cost onto the shared agent_result. Non-side-effecting, so it may use on_failure: continue (advisory / fail-open). - Drive the step from pipeline.run_task via `_execute_self_review_step` (only_kinds={"self_review"}), mirroring _execute_agent_step. Runs after the cancel short-circuit and before post-hooks; posts the summary PR comment after ensure_pr when the step ran. - Declare the step in coding/new-task-v1.yaml (opt-in by presence; on_failure: continue). Remove it to disable; tune via max_turns. - Drop the now-redundant self_review_enabled / self_review_max_turns fields from TaskConfig, build_config, get_config, server params, and run_task — configurability lives in the workflow YAML. - Add golden validator-corpus fixtures (valid self_review step; repo-less + self_review rejected) to lock parity. Agent quality green: ruff, ty (0 errors), 1129 tests, 79.87% coverage. --- agent/src/config.py | 7 - agent/src/models.py | 3 - agent/src/pipeline.py | 84 +++++-- agent/src/post_hooks.py | 3 +- agent/src/self_review.py | 36 ++- agent/src/server.py | 9 - agent/src/workflow/models.py | 4 + agent/src/workflow/runner.py | 71 ++++++ agent/src/workflow/validator.py | 6 +- agent/tests/test_self_review.py | 208 +++++++++++++----- agent/workflows/coding/new-task-v1.yaml | 6 + agent/workflows/schema/workflow.schema.json | 10 +- .../rule3-repo-less-has-self-review.json | 69 ++++++ .../valid-coding-self-review-step.json | 94 ++++++++ 14 files changed, 496 insertions(+), 114 deletions(-) create mode 100644 contracts/workflow-validation/rule3-repo-less-has-self-review.json create mode 100644 contracts/workflow-validation/valid-coding-self-review-step.json diff --git a/agent/src/config.py b/agent/src/config.py index da18e90b..bf4ad57c 100644 --- a/agent/src/config.py +++ b/agent/src/config.py @@ -471,8 +471,6 @@ def build_config( initial_approval_gate_count: int = 0, approval_gate_cap: int | None = None, attachments: list[dict] | None = None, - self_review_enabled: bool = False, - self_review_max_turns: int = 5, ) -> TaskConfig: """Build and validate configuration from explicit parameters. @@ -585,8 +583,6 @@ def build_config( initial_approval_gate_count=initial_approval_gate_count, approval_gate_cap=approval_gate_cap, attachments=validated_attachments, - self_review_enabled=self_review_enabled, - self_review_max_turns=self_review_max_turns, ) @@ -611,9 +607,6 @@ def get_config() -> TaskConfig: # an unreachable ``traces//`` key. trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"), user_id=os.environ.get("USER_ID", ""), - self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower() - in ("1", "true", "yes"), - self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")), ) except ValueError as e: print(f"ERROR: {e}", file=sys.stderr) diff --git a/agent/src/models.py b/agent/src/models.py index 6b4132a9..0befafda 100644 --- a/agent/src/models.py +++ b/agent/src/models.py @@ -240,9 +240,6 @@ class TaskConfig(BaseModel): # Attachments from the orchestrator payload (Phase 3). Validated as # AttachmentConfig models. Empty list for tasks without attachments. attachments: list[AttachmentConfig] = Field(default_factory=list) - # Self-review: optional LLM diff critique before PR creation. - self_review_enabled: bool = False - self_review_max_turns: int = 5 # Cap on turns allocated to self-review @model_validator(mode="after") def _validate_trace_requires_user_id(self) -> Self: diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index f486aa83..f4c29206 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -37,7 +37,6 @@ ) from progress_writer import _ProgressWriter from prompt_builder import build_system_prompt, discover_project_config -from self_review import run_self_review from shell import log, log_error_cw from system_prompt import SYSTEM_PROMPT from telemetry import ( @@ -219,6 +218,50 @@ def _execute_agent_step( return ctx.agent_result +def _execute_self_review_step( + workflow: Workflow | None, + config, + setup, + agent_result, + hydrated, + trajectory, + progress, +) -> bool: + """Drive the workflow's ``self_review`` step (if declared) through the runner. + + Mirrors ``_execute_agent_step``: only the ``self_review`` step is dispatched + (``only_kinds={"self_review"}``) so clone / build / PR stay on the inline + path. The step's handler accumulates the review loop's turns/cost back onto + ``agent_result`` (a shared mutable model), so the terminal result reflects + implement + review. + + Returns True when the review actually ran (so the caller posts the summary + PR comment after ``ensure_pr``); False when no ``self_review`` step is + declared, the workflow failed to reload, or the review was skipped (read-only + / empty diff / no remaining turns). Fully fail-open — a review failure is + recorded as a step outcome and never propagates to block PR creation. + """ + if workflow is None or not any(s.kind == "self_review" for s in workflow.steps): + return False + + from workflow import StepContext, run_workflow + + ctx = StepContext( + workflow=workflow, + config=config, + hydrated=hydrated, + progress=progress, + trajectory=trajectory, + setup=setup, + # The implement step's result, threaded in so the handler can size the + # review's turn budget and accumulate its turns/cost onto it. + agent_result=agent_result, + ) + with task_span("task.self_review"): + run_workflow(workflow, ctx, only_kinds={"self_review"}) + return bool(ctx.artifacts.get("self_review_ran", False)) + + def _run_repoless_task( *, config, @@ -591,8 +634,6 @@ def run_task( trace: bool = False, user_id: str = "", attachments: list[dict] | None = None, - self_review_enabled: bool = False, - self_review_max_turns: int = 5, ) -> dict: """Run the full agent pipeline and return a serialized result dict. @@ -632,8 +673,6 @@ def run_task( initial_approval_gate_count=initial_approval_gate_count, approval_gate_cap=approval_gate_cap, attachments=attachments, - self_review_enabled=self_review_enabled, - self_review_max_turns=self_review_max_turns, ) # Inject Cedar policies into config for the PolicyEngine in runner.py @@ -1012,21 +1051,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: ) ensure_pr_strategy = "create" - # Self-review phase: LLM critiques its own diff before PR creation. - # Runs between cancel-check and post-hooks. Fail-open: errors here - # never block PR creation. - with task_span("task.self_review"): - review_result = run_self_review( - config, setup, agent_result, trajectory, progress - ) - if review_result is not None: - # Accumulate turns and cost from the review phase - agent_result.turns += review_result.turns - agent_result.num_turns += review_result.num_turns or review_result.turns - if review_result.cost_usd is not None: - agent_result.cost_usd = ( - (agent_result.cost_usd or 0.0) + review_result.cost_usd - ) + # Self-review step: if the resolved workflow declares a ``self_review`` + # step, drive it through the workflow runner (same pattern as + # ``_execute_agent_step``). The step has the LLM critique its own diff + # and fix issues, accumulating its turns/cost onto ``agent_result``. + # Runs AFTER the cancel short-circuit so a cancelled task never starts + # a second agent loop, and BEFORE post-hooks so fixes land in the PR. + # Fail-open: a review failure/skip never blocks PR creation. + self_review_ran = _execute_self_review_step( + _workflow, + config, + setup, + agent_result, + hc, + trajectory, + progress, + ) # Post-hooks (agent_result is guaranteed set by the try/except above) with task_span("task.post_hooks") as post_span: @@ -1050,8 +1090,8 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: if pr_url: progress.write_agent_milestone("pr_created", pr_url) - # Post self-review summary as PR comment (if self-review ran and produced findings) - if pr_url and review_result is not None: + # Post self-review summary as PR comment (if the self_review step ran) + if pr_url and self_review_ran: post_self_review_comment(setup.repo_dir, pr_url, config) # Memory write — capture task episode and repo learnings diff --git a/agent/src/post_hooks.py b/agent/src/post_hooks.py index dc840657..00268847 100644 --- a/agent/src/post_hooks.py +++ b/agent/src/post_hooks.py @@ -445,8 +445,7 @@ def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> stderr = result.stderr.strip()[:200] if result.stderr else "" log( "WARN", - f"post_self_review_comment: gh pr comment failed " - f"(rc={result.returncode}): {stderr}", + f"post_self_review_comment: gh pr comment failed (rc={result.returncode}): {stderr}", ) return False except (subprocess.TimeoutExpired, OSError) as e: diff --git a/agent/src/self_review.py b/agent/src/self_review.py index 69114d68..0c7634cb 100644 --- a/agent/src/self_review.py +++ b/agent/src/self_review.py @@ -19,6 +19,9 @@ # Diff truncation limit (characters). Large diffs are cut at hunk boundaries. _MAX_DIFF_CHARS = 60_000 +# Default cap on review-loop turns when the ``self_review`` step omits max_turns. +_DEFAULT_REVIEW_MAX_TURNS = 5 + # Minimal system prompt for the self-review agent invocation. _REVIEW_SYSTEM_PROMPT = """\ You are a code reviewer working inside the repository {repo_url} on branch {branch_name}. @@ -90,23 +93,31 @@ def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str: ) +def _milestone(progress: _ProgressWriter | None, name: str, detail: str) -> None: + """Emit a progress milestone if a writer is wired up (no-op otherwise).""" + if progress is not None: + progress.write_agent_milestone(name, detail) + + def run_self_review( config: TaskConfig, setup: RepoSetup, agent_result: AgentResult, - trajectory: _TrajectoryWriter, - progress: _ProgressWriter, + trajectory: _TrajectoryWriter | None, + progress: _ProgressWriter | None, + *, + max_turns: int = _DEFAULT_REVIEW_MAX_TURNS, ) -> AgentResult | None: """Run the self-review phase: LLM critiques its own diff and fixes issues. + Invoked by the ``self_review`` workflow step handler. The step's presence in + the resolved workflow is the enablement signal — there is no separate + feature flag; ``max_turns`` comes from the step (``self_review.max_turns``, + default 5) and caps the review loop within the task's remaining allowance. + Returns the AgentResult from the review phase, or None if skipped. Fail-open: errors are logged but never block the pipeline. """ - # Skip condition: feature disabled - if not config.self_review_enabled: - log("TASK", "self_review: disabled (self_review_enabled=False)") - return None - # Skip condition: read-only workflows produce no diff to review if config.read_only: log("TASK", "self_review: skipped for read-only workflow") @@ -115,7 +126,7 @@ def run_self_review( # Compute remaining turns used_turns = agent_result.turns or 0 remaining_turns = config.max_turns - used_turns - review_turns = min(remaining_turns, config.self_review_max_turns) + review_turns = min(remaining_turns, max_turns) if review_turns <= 0: log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})") return None @@ -162,7 +173,8 @@ def run_self_review( f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, " f"diff_chars={len(diff)})", ) - progress.write_agent_milestone( + _milestone( + progress, "self_review_started", f"turns={review_turns} diff_chars={len(diff)}", ) @@ -182,7 +194,8 @@ def run_self_review( except Exception as e: # Fail-open: self-review errors never block the pipeline log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}") - progress.write_agent_milestone( + _milestone( + progress, "self_review_complete", f"status=error error={type(e).__name__}: {e}", ) @@ -193,7 +206,8 @@ def run_self_review( f"self_review: complete (status={review_result.status}, " f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})", ) - progress.write_agent_milestone( + _milestone( + progress, "self_review_complete", f"status={review_result.status} turns={review_result.turns}", ) diff --git a/agent/src/server.py b/agent/src/server.py index ea615ce9..e1114374 100644 --- a/agent/src/server.py +++ b/agent/src/server.py @@ -405,8 +405,6 @@ def _run_task_background( user_id: str = "", workload_access_token: str = "", attachments: list[dict] | None = None, - self_review_enabled: bool = False, - self_review_max_turns: int = 5, ) -> None: """Run the agent task in a background thread.""" global _background_pipeline_failed @@ -490,8 +488,6 @@ def _run_task_background( trace=trace, user_id=user_id, attachments=attachments, - self_review_enabled=self_review_enabled, - self_review_max_turns=self_review_max_turns, ) _background_pipeline_failed = False except Exception as e: @@ -580,9 +576,6 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict: channel_source = inp.get("channel_source", "") or "" channel_metadata = inp.get("channel_metadata") or {} attachments = inp.get("attachments") or [] - # Self-review (opt-in): LLM critiques its own diff before PR creation. - self_review_enabled = inp.get("self_review_enabled") is True - self_review_max_turns = int(inp.get("self_review_max_turns", 5)) # ``trace`` is strictly opt-in (design §10.1). Accept only real # booleans from the orchestrator — a string "false" would otherwise # flip the flag on. @@ -656,8 +649,6 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict: "user_id": user_id, "workload_access_token": workload_access_token, "attachments": attachments, - "self_review_enabled": self_review_enabled, - "self_review_max_turns": self_review_max_turns, } diff --git a/agent/src/workflow/models.py b/agent/src/workflow/models.py index 1411a16e..887d425a 100644 --- a/agent/src/workflow/models.py +++ b/agent/src/workflow/models.py @@ -31,6 +31,7 @@ "run_agent", "verify_build", "verify_lint", + "self_review", "ensure_pr", "post_review", "deliver_artifact", @@ -140,6 +141,9 @@ class Step(BaseModel): strategy: EnsurePrStrategy | None = None gate: VerifyGate | None = None target: DeliverTarget | None = None + # self_review: cap on turns the review loop may use (default applied by the + # handler when omitted). + max_turns: int | None = None class TerminalOutcomes(BaseModel): diff --git a/agent/src/workflow/runner.py b/agent/src/workflow/runner.py index 4f818689..b58008c1 100644 --- a/agent/src/workflow/runner.py +++ b/agent/src/workflow/runner.py @@ -375,6 +375,10 @@ def _milestone(ctx: StepContext, milestone: str) -> None: # pipeline.py) to avoid pulling the SDK / boto3 at module import and to keep the # orchestration core importable in isolation for tests. +# Default turn cap for a self_review step that omits max_turns (kept in sync +# with the schema's self_review max_turns default and self_review.py). +_DEFAULT_SELF_REVIEW_MAX_TURNS = 5 + def _handle_clone_repo(step: Step, ctx: StepContext) -> StepOutcome: """Clone + prepare the repo (replaces the inline ``setup_repo`` call). @@ -560,6 +564,72 @@ def _handle_verify_lint(step: Step, ctx: StepContext) -> StepOutcome: ) +def _handle_self_review(step: Step, ctx: StepContext) -> StepOutcome: + """Have the agent critique its own diff and fix issues before the PR opens. + + Declaring a ``self_review`` step in the workflow *is* the enablement — there + is no separate feature flag. The step runs a second, review-focused + ``run_agent`` loop over the cumulative branch diff (``self_review.run_self_review``), + capped at ``step.max_turns`` turns (default 5) drawn from the task's + remaining turn/budget allowance, and accumulates the review's turns/cost back + onto ``ctx.agent_result`` so the terminal result reflects the full task. + + Authored as an advisory step (``on_failure: continue``): a review that is + skipped (read-only, no diff, no remaining turns) or fails is recorded but + never blocks PR creation — matching the original fail-open design. The + summary the review agent writes is posted as a PR comment by the pipeline + after ``ensure_pr`` (``self_review_ran`` in this outcome's data signals it). + """ + if ctx.setup is None: + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="failed", + error="self_review requires a cloned repo (no clone_repo step ran)", + ) + if ctx.agent_result is None: + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="failed", + error="self_review requires a prior run_agent result to size its turn budget", + ) + + from self_review import run_self_review + + review_result = run_self_review( + ctx.config, + ctx.setup, + ctx.agent_result, + ctx.trajectory, + ctx.progress, + max_turns=step.max_turns or _DEFAULT_SELF_REVIEW_MAX_TURNS, + ) + if review_result is None: + # Skipped (read-only / empty diff / no remaining turns) — not a failure. + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="succeeded", + data={"self_review_ran": False}, + ) + + # Accumulate the review phase's turns/cost onto the running agent result so + # the terminal TaskResult reflects the whole task (implement + review). + ar = ctx.agent_result + ar.turns += review_result.turns + ar.num_turns += review_result.num_turns or review_result.turns + if review_result.cost_usd is not None: + ar.cost_usd = (ar.cost_usd or 0.0) + review_result.cost_usd + + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="succeeded", + data={"self_review_ran": True, "review_status": review_result.status}, + ) + + def _handle_ensure_pr(step: Step, ctx: StepContext) -> StepOutcome: """Create / push+resolve / resolve a PR per the step's ``strategy``. @@ -645,6 +715,7 @@ def _handle_deliver_artifact(step: Step, ctx: StepContext) -> StepOutcome: "run_agent": _handle_run_agent, "verify_build": _handle_verify_build, "verify_lint": _handle_verify_lint, + "self_review": _handle_self_review, "ensure_pr": _handle_ensure_pr, "post_review": _handle_post_review, "deliver_artifact": _handle_deliver_artifact, diff --git a/agent/src/workflow/validator.py b/agent/src/workflow/validator.py index 25f2a6ee..21f27c45 100644 --- a/agent/src/workflow/validator.py +++ b/agent/src/workflow/validator.py @@ -40,8 +40,12 @@ _SIDE_EFFECTING_KINDS = frozenset({"ensure_pr", "post_review", "deliver_artifact"}) # Steps that only make sense when a repo is cloned (rule 3 / rule 7). +# self_review diffs and re-runs the agent against the cloned working tree, so it +# is repo-only too. It is deliberately NOT in _SIDE_EFFECTING_KINDS: it commits +# locally but never pushes or opens a PR, so it may be marked on_failure: +# continue (advisory / fail-open). _REPO_ONLY_KINDS = frozenset( - {"clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint"} + {"clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint", "self_review"} ) # Built-in (Phase 1-3) policy modules / MCP servers. Registry refs (registry://) diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py index e2a817f1..11f62deb 100644 --- a/agent/tests/test_self_review.py +++ b/agent/tests/test_self_review.py @@ -1,6 +1,5 @@ """Unit tests for self_review.py — self-review orchestration module.""" -import os from unittest.mock import MagicMock, patch from models import AgentResult, RepoSetup, TaskConfig @@ -16,63 +15,42 @@ def _make_config(**overrides) -> TaskConfig: """Create a minimal TaskConfig for testing.""" - defaults = { - "repo_url": "owner/repo", - "github_token": "ghp_test123", - "aws_region": "us-east-1", - "task_description": "Fix the bug", - "max_turns": 10, - "self_review_enabled": True, - "self_review_max_turns": 5, - "task_type": "new_task", - } - defaults.update(overrides) - return TaskConfig(**defaults) + base = TaskConfig( + repo_url="owner/repo", + github_token="ghp_test123", + aws_region="us-east-1", + task_description="Fix the bug", + max_turns=10, + ) + return base.model_copy(update=overrides) if overrides else base def _make_setup(**overrides) -> RepoSetup: """Create a minimal RepoSetup for testing.""" - defaults = { - "repo_dir": "/workspace/repo", - "branch": "feat/123-fix", - "default_branch": "main", - } - defaults.update(overrides) - return RepoSetup(**defaults) + base = RepoSetup( + repo_dir="/workspace/repo", + branch="feat/123-fix", + default_branch="main", + ) + return base.model_copy(update=overrides) if overrides else base def _make_agent_result(**overrides) -> AgentResult: """Create a minimal AgentResult for testing.""" - defaults = { - "status": "success", - "turns": 5, - "num_turns": 5, - "cost_usd": 0.50, - } - defaults.update(overrides) - return AgentResult(**defaults) + base = AgentResult( + status="success", + turns=5, + num_turns=5, + cost_usd=0.50, + ) + return base.model_copy(update=overrides) if overrides else base class TestSkipConditions: """Test all conditions that cause self-review to be skipped.""" - def test_skip_when_disabled(self): - config = _make_config(self_review_enabled=False) - setup = _make_setup() - agent_result = _make_agent_result() - trajectory = MagicMock() - progress = MagicMock() - - result = run_self_review(config, setup, agent_result, trajectory, progress) - assert result is None - - def test_skip_for_pr_review_task_type(self): - config = _make_config( - task_type="pr_review", - pr_number="42", - task_description="", - issue_number="", - ) + def test_skip_for_read_only_workflow(self): + config = _make_config(read_only=True) setup = _make_setup() agent_result = _make_agent_result() trajectory = MagicMock() @@ -82,13 +60,13 @@ def test_skip_for_pr_review_task_type(self): assert result is None def test_skip_when_no_remaining_turns(self): - config = _make_config(max_turns=5, self_review_max_turns=5) + config = _make_config(max_turns=5) setup = _make_setup() agent_result = _make_agent_result(turns=5) trajectory = MagicMock() progress = MagicMock() - result = run_self_review(config, setup, agent_result, trajectory, progress) + result = run_self_review(config, setup, agent_result, trajectory, progress, max_turns=5) assert result is None def test_skip_when_no_remaining_budget(self): @@ -208,17 +186,17 @@ class TestBudgetAndTurnComputation: @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") @patch("self_review.asyncio.run") - def test_review_turns_capped_at_self_review_max_turns(self, mock_asyncio_run, mock_diff): + def test_review_turns_capped_at_step_max_turns(self, mock_asyncio_run, mock_diff): mock_asyncio_run.return_value = AgentResult(status="success", turns=2, num_turns=2) - config = _make_config(max_turns=100, self_review_max_turns=3) + config = _make_config(max_turns=100) setup = _make_setup() agent_result = _make_agent_result(turns=5) trajectory = MagicMock() progress = MagicMock() - run_self_review(config, setup, agent_result, trajectory, progress) + # The step's max_turns (3) caps the review even though 95 turns remain. + run_self_review(config, setup, agent_result, trajectory, progress, max_turns=3) - # The review config passed to run_agent should have max_turns=3 call_args = mock_asyncio_run.call_args coro = call_args[0][0] # Close the coroutine to avoid warnings @@ -228,15 +206,15 @@ def test_review_turns_capped_at_self_review_max_turns(self, mock_asyncio_run, mo @patch("self_review.asyncio.run") def test_review_turns_uses_remaining_when_less_than_cap(self, mock_asyncio_run, mock_diff): mock_asyncio_run.return_value = AgentResult(status="success", turns=1, num_turns=1) - config = _make_config(max_turns=8, self_review_max_turns=5) + config = _make_config(max_turns=8) setup = _make_setup() agent_result = _make_agent_result(turns=6) # Only 2 remaining trajectory = MagicMock() progress = MagicMock() - run_self_review(config, setup, agent_result, trajectory, progress) + # Should use min(2 remaining, 5 cap) = 2 turns + run_self_review(config, setup, agent_result, trajectory, progress, max_turns=5) - # Should use min(2, 5) = 2 turns call_args = mock_asyncio_run.call_args coro = call_args[0][0] coro.close() @@ -248,9 +226,7 @@ class TestHappyPath: @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") @patch("self_review.asyncio.run") def test_returns_review_result(self, mock_asyncio_run, mock_diff): - review_agent_result = AgentResult( - status="success", turns=2, num_turns=2, cost_usd=0.10 - ) + review_agent_result = AgentResult(status="success", turns=2, num_turns=2, cost_usd=0.10) mock_asyncio_run.return_value = review_agent_result config = _make_config() @@ -482,3 +458,121 @@ def test_comment_body_includes_header(self, tmp_path): body = call_args[body_idx] assert "Self-Review Summary" in body assert "**Findings:** 0" in body + + +class TestSelfReviewStepHandler: + """Tests for the workflow ``self_review`` step handler (workflow/runner.py). + + The step's presence in a workflow is the enablement signal; the handler + wraps ``run_self_review`` and accumulates the review's turns/cost onto the + shared ``ctx.agent_result``. + """ + + def _ctx(self, *, setup, agent_result): + from workflow import Step, StepContext, Workflow + + wf = Workflow.model_validate( + { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "prompt": {"template": "registry://prompt/x"}, + "hydration": {"sources": ["task_description"]}, + "agent_config": {"tier": "standard", "allowed_tools": ["Bash"]}, + "steps": [ + {"kind": "run_agent"}, + {"kind": "self_review", "name": "review", "max_turns": 4}, + ], + "terminal_outcomes": {"primary": "pr_url"}, + "status": "production", + } + ) + step = next(s for s in wf.steps if s.kind == "self_review") + ctx = StepContext( + workflow=wf, + config=_make_config(), + setup=setup, + agent_result=agent_result, + trajectory=MagicMock(), + progress=MagicMock(), + ) + return step, ctx, Step + + def test_passes_step_max_turns_and_accumulates(self): + from workflow.runner import _handle_self_review + + setup = _make_setup() + agent_result = _make_agent_result(turns=5, num_turns=5, cost_usd=0.50) + step, ctx, _ = self._ctx(setup=setup, agent_result=agent_result) + + review = AgentResult(status="success", turns=2, num_turns=2, cost_usd=0.10) + with patch("self_review.run_self_review", return_value=review) as mock_review: + outcome = _handle_self_review(step, ctx) + + # The step's max_turns (4) is forwarded to run_self_review. + assert mock_review.call_args.kwargs["max_turns"] == 4 + assert outcome.status == "succeeded" + assert outcome.data["self_review_ran"] is True + # Review turns/cost accumulated onto the shared agent_result. + assert ctx.agent_result.turns == 7 + assert ctx.agent_result.num_turns == 7 + assert abs(ctx.agent_result.cost_usd - 0.60) < 1e-9 + + def test_skipped_review_records_not_ran(self): + from workflow.runner import _handle_self_review + + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + step, ctx, _ = self._ctx(setup=setup, agent_result=agent_result) + + with patch("self_review.run_self_review", return_value=None): + outcome = _handle_self_review(step, ctx) + + assert outcome.status == "succeeded" + assert outcome.data["self_review_ran"] is False + # No accumulation when the review was skipped. + assert ctx.agent_result.turns == 5 + + def test_fails_without_clone(self): + from workflow.runner import _handle_self_review + + agent_result = _make_agent_result() + step, ctx, _ = self._ctx(setup=None, agent_result=agent_result) + + outcome = _handle_self_review(step, ctx) + assert outcome.status == "failed" + assert "cloned repo" in (outcome.error or "") + + def test_default_max_turns_when_step_omits_it(self): + from workflow import Step, StepContext, Workflow + from workflow.runner import _DEFAULT_SELF_REVIEW_MAX_TURNS, _handle_self_review + + wf = Workflow.model_validate( + { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "prompt": {"template": "registry://prompt/x"}, + "hydration": {"sources": ["task_description"]}, + "agent_config": {"tier": "standard", "allowed_tools": ["Bash"]}, + "steps": [{"kind": "run_agent"}, {"kind": "self_review"}], + "terminal_outcomes": {"primary": "pr_url"}, + "status": "production", + } + ) + step = next(s for s in wf.steps if s.kind == "self_review") + assert step.max_turns is None + ctx = StepContext( + workflow=wf, + config=_make_config(), + setup=_make_setup(), + agent_result=_make_agent_result(turns=2), + trajectory=MagicMock(), + progress=MagicMock(), + ) + review = AgentResult(status="success", turns=1, num_turns=1) + with patch("self_review.run_self_review", return_value=review) as mock_review: + _handle_self_review(step, ctx) + assert mock_review.call_args.kwargs["max_turns"] == _DEFAULT_SELF_REVIEW_MAX_TURNS + # Step model unused beyond fixture; keep import referenced. + assert Step is not None diff --git a/agent/workflows/coding/new-task-v1.yaml b/agent/workflows/coding/new-task-v1.yaml index 7219fdf3..2454b007 100644 --- a/agent/workflows/coding/new-task-v1.yaml +++ b/agent/workflows/coding/new-task-v1.yaml @@ -42,6 +42,12 @@ steps: - { kind: clone_repo, name: setup } - { kind: hydrate_context, name: context } - { kind: run_agent, name: implement } + # Optional pre-PR self-review: the agent critiques its own cumulative diff and + # fixes issues before the PR is opened, then posts a summary as a PR comment. + # Advisory (on_failure: continue) and fail-open — a review failure/skip never + # blocks PR creation. Remove this step to disable self-review for the workflow; + # tune the turn budget via max_turns (1-20, default 5). + - { kind: self_review, name: review, on_failure: continue, max_turns: 5 } - { kind: verify_build, name: build, gate: regression_only } - { kind: ensure_pr, name: open_pr, strategy: create } terminal_outcomes: diff --git a/agent/workflows/schema/workflow.schema.json b/agent/workflows/schema/workflow.schema.json index b4b753bb..f68d867b 100644 --- a/agent/workflows/schema/workflow.schema.json +++ b/agent/workflows/schema/workflow.schema.json @@ -187,7 +187,7 @@ "kind": { "type": "string", "description": "Dispatches to a registered step handler.", - "enum": ["clone_repo", "hydrate_context", "run_agent", "verify_build", "verify_lint", "ensure_pr", "post_review", "deliver_artifact"] + "enum": ["clone_repo", "hydrate_context", "run_agent", "verify_build", "verify_lint", "self_review", "ensure_pr", "post_review", "deliver_artifact"] }, "name": { "type": "string", @@ -213,6 +213,12 @@ "type": "string", "description": "For deliver_artifact: the name of a registered Python deliverer (workflow/deliverers.py / DELIVERERS) to route the produced output through. Open string, not a closed enum, so a new delivery method is a registered deliverer, not a schema change (ADR-014 addendum 2026-06-08). First-party names: s3, comment, s3_and_comment. Each deliverer declares the terminal outcomes it produces; validator rule 11 consults that registry.", "minLength": 1 + }, + "max_turns": { + "type": "integer", + "description": "For self_review: cap on agent turns the self-review loop may use (drawn from the task's remaining turn/budget allowance). Defaults to 5 when omitted.", + "minimum": 1, + "maximum": 20 } } } @@ -295,7 +301,7 @@ "steps": { "items": { "properties": { - "kind": { "not": { "enum": ["clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint"] } } + "kind": { "not": { "enum": ["clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint", "self_review"] } } } } }, diff --git a/contracts/workflow-validation/rule3-repo-less-has-self-review.json b/contracts/workflow-validation/rule3-repo-less-has-self-review.json new file mode 100644 index 00000000..42378e53 --- /dev/null +++ b/contracts/workflow-validation/rule3-repo-less-has-self-review.json @@ -0,0 +1,69 @@ +{ + "name": "rule3-repo-less-has-self-review", + "description": "Repo-less knowledge workflow illegally declares a self_review step. self_review is repo-only (it diffs and re-runs the agent against a cloned tree), so both the schema's requires_repo:false conditional and cross-field rule 3 reject it.", + "workflow": { + "id": "knowledge/web-research-v1", + "version": "1.0.0", + "domain": "knowledge", + "requires_repo": false, + "read_only": false, + "prompt": { + "template": "registry://prompt/web-research-workflow" + }, + "hydration": { + "sources": [ + "task_description", + "urls" + ] + }, + "agent_config": { + "tier": "elevated", + "allowed_tools": [ + "Read", + "WebFetch" + ], + "cedar_policy_modules": [ + "builtin/hard_deny", + "builtin/soft_deny" + ] + }, + "repo_config": { + "discover": false + }, + "required_inputs": { + "all_of": [ + "task_description" + ] + }, + "steps": [ + { + "kind": "hydrate_context", + "name": "context" + }, + { + "kind": "run_agent", + "name": "research" + }, + { + "kind": "self_review", + "name": "review" + }, + { + "kind": "deliver_artifact", + "name": "deliver", + "target": "s3_and_comment" + } + ], + "terminal_outcomes": { + "primary": "artifact" + }, + "status": "production" + }, + "expected": { + "valid": false, + "violations": [ + "rule-3", + "schema" + ] + } +} diff --git a/contracts/workflow-validation/valid-coding-self-review-step.json b/contracts/workflow-validation/valid-coding-self-review-step.json new file mode 100644 index 00000000..3dccb547 --- /dev/null +++ b/contracts/workflow-validation/valid-coding-self-review-step.json @@ -0,0 +1,94 @@ +{ + "name": "valid-coding-self-review-step", + "description": "Coding workflow with an optional pre-PR self_review step (advisory, on_failure: continue). self_review is repo-only and non-side-effecting, so on_failure: continue is permitted (unlike ensure_pr).", + "workflow": { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "description": "Implement a GitHub issue, self-review, and open a PR.", + "requires_repo": true, + "read_only": false, + "prompt": { + "template": "registry://prompt/coding-new-task-workflow" + }, + "hydration": { + "sources": [ + "issue", + "memory", + "task_description" + ] + }, + "agent_config": { + "tier": "standard", + "allowed_tools": [ + "Bash", + "Read", + "Write", + "Edit", + "Glob", + "Grep", + "WebFetch" + ], + "cedar_policy_modules": [ + "builtin/hard_deny", + "builtin/soft_deny" + ] + }, + "repo_config": { + "provider": "github", + "discover": true + }, + "required_inputs": { + "one_of": [ + "issue_number", + "task_description" + ] + }, + "steps": [ + { + "kind": "clone_repo", + "name": "setup" + }, + { + "kind": "hydrate_context", + "name": "context" + }, + { + "kind": "run_agent", + "name": "implement" + }, + { + "kind": "self_review", + "name": "review", + "on_failure": "continue", + "max_turns": 5 + }, + { + "kind": "verify_build", + "name": "build", + "gate": "regression_only" + }, + { + "kind": "ensure_pr", + "name": "open_pr", + "strategy": "create" + } + ], + "terminal_outcomes": { + "primary": "pr_url" + }, + "limits": { + "max_turns": 100 + }, + "promotion_gate": { + "requires": [ + "tests:agent/new_task" + ] + }, + "status": "production" + }, + "expected": { + "valid": true, + "violations": [] + } +}