diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index 7718cb48..f4c29206 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -31,6 +31,7 @@ _extract_agent_notes, ensure_committed, ensure_pr, + post_self_review_comment, verify_build, verify_lint, ) @@ -217,6 +218,50 @@ def _execute_agent_step( return ctx.agent_result +def _execute_self_review_step( + workflow: Workflow | None, + config, + setup, + agent_result, + hydrated, + trajectory, + progress, +) -> bool: + """Drive the workflow's ``self_review`` step (if declared) through the runner. + + Mirrors ``_execute_agent_step``: only the ``self_review`` step is dispatched + (``only_kinds={"self_review"}``) so clone / build / PR stay on the inline + path. The step's handler accumulates the review loop's turns/cost back onto + ``agent_result`` (a shared mutable model), so the terminal result reflects + implement + review. + + Returns True when the review actually ran (so the caller posts the summary + PR comment after ``ensure_pr``); False when no ``self_review`` step is + declared, the workflow failed to reload, or the review was skipped (read-only + / empty diff / no remaining turns). Fully fail-open — a review failure is + recorded as a step outcome and never propagates to block PR creation. + """ + if workflow is None or not any(s.kind == "self_review" for s in workflow.steps): + return False + + from workflow import StepContext, run_workflow + + ctx = StepContext( + workflow=workflow, + config=config, + hydrated=hydrated, + progress=progress, + trajectory=trajectory, + setup=setup, + # The implement step's result, threaded in so the handler can size the + # review's turn budget and accumulate its turns/cost onto it. + agent_result=agent_result, + ) + with task_span("task.self_review"): + run_workflow(workflow, ctx, only_kinds={"self_review"}) + return bool(ctx.artifacts.get("self_review_ran", False)) + + def _run_repoless_task( *, config, @@ -1006,6 +1051,23 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: ) ensure_pr_strategy = "create" + # Self-review step: if the resolved workflow declares a ``self_review`` + # step, drive it through the workflow runner (same pattern as + # ``_execute_agent_step``). The step has the LLM critique its own diff + # and fix issues, accumulating its turns/cost onto ``agent_result``. + # Runs AFTER the cancel short-circuit so a cancelled task never starts + # a second agent loop, and BEFORE post-hooks so fixes land in the PR. + # Fail-open: a review failure/skip never blocks PR creation. + self_review_ran = _execute_self_review_step( + _workflow, + config, + setup, + agent_result, + hc, + trajectory, + progress, + ) + # Post-hooks (agent_result is guaranteed set by the try/except above) with task_span("task.post_hooks") as post_span: # Safety net: commit any uncommitted tracked changes (skip for read-only tasks) @@ -1028,6 +1090,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: if pr_url: progress.write_agent_milestone("pr_created", pr_url) + # Post self-review summary as PR comment (if the self_review step ran) + if pr_url and self_review_ran: + post_self_review_comment(setup.repo_dir, pr_url, config) + # Memory write — capture task episode and repo learnings memory_written = False effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "") diff --git a/agent/src/post_hooks.py b/agent/src/post_hooks.py index 058a1e4c..00268847 100644 --- a/agent/src/post_hooks.py +++ b/agent/src/post_hooks.py @@ -392,6 +392,67 @@ def ensure_pr( return None +def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool: + """Post the self-review summary as a PR comment. + + Reads the summary file written by the self-review agent, formats it as a + comment, and posts it via `gh pr comment`. Fail-open: exceptions are logged + but never propagated. + + Returns True if a comment was posted, False otherwise. + """ + from self_review import read_self_review_summary + + try: + summary = read_self_review_summary(repo_dir) + except Exception as e: + log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}") + return False + + if not summary: + log("POST", "post_self_review_comment: no summary file found — skipping") + return False + + # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123) + match = re.search(r"/pull/(\d+)", pr_url) + if not match: + log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}") + return False + pr_number = match.group(1) + + comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}" + + try: + result = subprocess.run( + [ + "gh", + "pr", + "comment", + pr_number, + "--repo", + config.repo_url, + "--body", + comment_body, + ], + cwd=repo_dir, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode == 0: + log("POST", f"Self-review summary posted as comment on PR #{pr_number}") + return True + stderr = result.stderr.strip()[:200] if result.stderr else "" + log( + "WARN", + f"post_self_review_comment: gh pr comment failed (rc={result.returncode}): {stderr}", + ) + return False + except (subprocess.TimeoutExpired, OSError) as e: + log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}") + return False + + def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None: """Extract the "## Agent notes" section from the PR body. diff --git a/agent/src/prompts/__init__.py b/agent/src/prompts/__init__.py index 60c5b2c0..8334c10b 100644 --- a/agent/src/prompts/__init__.py +++ b/agent/src/prompts/__init__.py @@ -13,6 +13,7 @@ from .new_task import NEW_TASK_WORKFLOW from .pr_iteration import PR_ITERATION_WORKFLOW from .pr_review import PR_REVIEW_WORKFLOW +from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT from .web_research import WEB_RESEARCH_PROMPT DEFAULT_WORKFLOW_ID = "coding/new-task-v1" diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py new file mode 100644 index 00000000..25cf47a2 --- /dev/null +++ b/agent/src/prompts/self_review.py @@ -0,0 +1,61 @@ +"""Self-review prompt template for pre-PR diff critique.""" + +SELF_REVIEW_PROMPT = """\ +You are reviewing your own work before it becomes a pull request. Below is the \ +cumulative diff of all changes on this branch compared to the base branch. + + +{diff} + + +## Task context + +{task_description} + +## Review checklist + +Examine the diff carefully for: + +1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \ +incorrect assumptions about data shapes or API contracts. +2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \ +race conditions. +3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \ +insecure defaults, OWASP Top 10 issues. +4. **Style & consistency** — Naming conventions, code style violations relative to \ +the surrounding codebase, unnecessary complexity. +5. **Test gaps** — Important behaviour that is untested, assertions that don't \ +verify the right thing, missing edge-case coverage. + +## Instructions + +- If you find issues, fix them directly: edit the files, run the build/tests to \ +verify your fixes, and commit the changes. +- If no issues are found, stop immediately — do not make changes for the sake of \ +making changes. +- Do NOT refactor code that was not part of the original diff unless it has a \ +concrete bug or security issue. +- Keep fixes minimal and focused. Each fix should be a separate commit with a \ +clear message. + +## Summary output + +After completing your review (whether you made fixes or not), write a file \ +`.self-review-summary.md` in the repository root with your findings in this format: + +```markdown +### Self-Review Summary + +**Findings:** +**Fixes applied:** + +#### Issues found + +- : +``` + +If no issues were found, write the file with: "No issues found — code looks good." + +This file is a pipeline artifact and will be deleted automatically — it will NOT \ +appear in the pull request. +""" diff --git a/agent/src/self_review.py b/agent/src/self_review.py new file mode 100644 index 00000000..0c7634cb --- /dev/null +++ b/agent/src/self_review.py @@ -0,0 +1,253 @@ +"""Self-review orchestration: LLM critiques its own diff before PR creation.""" + +from __future__ import annotations + +import asyncio +import contextlib +import os +import subprocess +from typing import TYPE_CHECKING + +from prompts.self_review import SELF_REVIEW_PROMPT +from shell import log + +if TYPE_CHECKING: + from models import AgentResult, RepoSetup, TaskConfig + from progress_writer import _ProgressWriter + from telemetry import _TrajectoryWriter + +# Diff truncation limit (characters). Large diffs are cut at hunk boundaries. +_MAX_DIFF_CHARS = 60_000 + +# Default cap on review-loop turns when the ``self_review`` step omits max_turns. +_DEFAULT_REVIEW_MAX_TURNS = 5 + +# Minimal system prompt for the self-review agent invocation. +_REVIEW_SYSTEM_PROMPT = """\ +You are a code reviewer working inside the repository {repo_url} on branch {branch_name}. +Your working directory is {repo_dir}. + +You have full access to the filesystem and can run commands. Fix any issues you \ +find directly — edit files, run the build, and commit fixes. Keep changes minimal \ +and focused. + +Do NOT open a pull request or push. Just fix issues and commit locally. +""" + + +def _get_diff(repo_dir: str, default_branch: str) -> str: + """Generate the cumulative diff of the branch vs origin/{default_branch}.""" + try: + result = subprocess.run( + ["git", "diff", f"origin/{default_branch}...HEAD"], + cwd=repo_dir, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode != 0: + log("WARN", f"self_review: git diff failed (exit {result.returncode})") + return "" + return result.stdout + except (subprocess.TimeoutExpired, OSError) as e: + log("WARN", f"self_review: git diff error: {type(e).__name__}: {e}") + return "" + + +def _truncate_diff(diff: str, max_chars: int = _MAX_DIFF_CHARS) -> str: + """Truncate diff at a hunk boundary if it exceeds max_chars. + + Cuts at the last complete hunk (line starting with '@@') that fits + within the limit, appending a truncation notice. + """ + if len(diff) <= max_chars: + return diff + + # Find the last hunk header that starts before max_chars + truncated = diff[:max_chars] + last_hunk = truncated.rfind("\n@@") + if last_hunk > 0: + # Cut just before this hunk header + truncated = truncated[:last_hunk] + else: + # No hunk boundary found — hard-cut at max_chars + last_newline = truncated.rfind("\n") + if last_newline > 0: + truncated = truncated[:last_newline] + + total_lines = diff.count("\n") + kept_lines = truncated.count("\n") + truncated += ( + f"\n\n... [diff truncated: showing ~{kept_lines} of ~{total_lines} lines; " + f"{len(diff) - len(truncated)} chars omitted] ..." + ) + return truncated + + +def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str: + """Build a minimal system prompt for the self-review agent.""" + return _REVIEW_SYSTEM_PROMPT.format( + repo_url=config.repo_url, + branch_name=setup.branch, + repo_dir=setup.repo_dir, + ) + + +def _milestone(progress: _ProgressWriter | None, name: str, detail: str) -> None: + """Emit a progress milestone if a writer is wired up (no-op otherwise).""" + if progress is not None: + progress.write_agent_milestone(name, detail) + + +def run_self_review( + config: TaskConfig, + setup: RepoSetup, + agent_result: AgentResult, + trajectory: _TrajectoryWriter | None, + progress: _ProgressWriter | None, + *, + max_turns: int = _DEFAULT_REVIEW_MAX_TURNS, +) -> AgentResult | None: + """Run the self-review phase: LLM critiques its own diff and fixes issues. + + Invoked by the ``self_review`` workflow step handler. The step's presence in + the resolved workflow is the enablement signal — there is no separate + feature flag; ``max_turns`` comes from the step (``self_review.max_turns``, + default 5) and caps the review loop within the task's remaining allowance. + + Returns the AgentResult from the review phase, or None if skipped. + Fail-open: errors are logged but never block the pipeline. + """ + # Skip condition: read-only workflows produce no diff to review + if config.read_only: + log("TASK", "self_review: skipped for read-only workflow") + return None + + # Compute remaining turns + used_turns = agent_result.turns or 0 + remaining_turns = config.max_turns - used_turns + review_turns = min(remaining_turns, max_turns) + if review_turns <= 0: + log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})") + return None + + # Compute remaining budget + review_budget: float | None = None + if config.max_budget_usd is not None: + used_cost = agent_result.cost_usd or 0.0 + remaining_budget = config.max_budget_usd - used_cost + if remaining_budget <= 0: + log( + "TASK", + f"self_review: no remaining budget " + f"(used=${used_cost:.2f}, max=${config.max_budget_usd:.2f})", + ) + return None + review_budget = remaining_budget + + # Get the diff + diff = _get_diff(setup.repo_dir, setup.default_branch) + if not diff.strip(): + log("TASK", "self_review: no diff found — skipping") + return None + + # Truncate if needed + diff = _truncate_diff(diff) + + # Build the review prompt + task_desc = config.task_description or f"Issue #{config.issue_number}" + user_prompt = SELF_REVIEW_PROMPT.format(diff=diff, task_description=task_desc) + system_prompt = _build_review_system_prompt(config, setup) + + # Build a modified config for the review run + review_config = config.model_copy( + update={ + "max_turns": review_turns, + "max_budget_usd": review_budget, + } + ) + + log( + "TASK", + f"self_review: starting (turns={review_turns}, " + f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, " + f"diff_chars={len(diff)})", + ) + _milestone( + progress, + "self_review_started", + f"turns={review_turns} diff_chars={len(diff)}", + ) + + try: + from runner import run_agent + + review_result = asyncio.run( + run_agent( + user_prompt, + system_prompt, + review_config, + cwd=setup.repo_dir, + trajectory=trajectory, + ) + ) + except Exception as e: + # Fail-open: self-review errors never block the pipeline + log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}") + _milestone( + progress, + "self_review_complete", + f"status=error error={type(e).__name__}: {e}", + ) + return None + + log( + "TASK", + f"self_review: complete (status={review_result.status}, " + f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})", + ) + _milestone( + progress, + "self_review_complete", + f"status={review_result.status} turns={review_result.turns}", + ) + return review_result + + +_SUMMARY_FILENAME = ".self-review-summary.md" + + +def read_self_review_summary(repo_dir: str) -> str | None: + """Read and delete the self-review summary file. + + The self-review agent writes `.self-review-summary.md` in the repo root. + This function reads the content, removes the file (so it never appears in + the PR), and returns the content. Returns None if the file doesn't exist. + """ + summary_path = os.path.join(repo_dir, _SUMMARY_FILENAME) + if not os.path.isfile(summary_path): + return None + + try: + with open(summary_path) as f: + content = f.read() + except OSError as e: + log("WARN", f"self_review: failed to read summary file: {type(e).__name__}: {e}") + return None + + # Remove the file so it doesn't end up in the PR + try: + os.remove(summary_path) + except OSError as e: + log("WARN", f"self_review: failed to delete summary file: {type(e).__name__}: {e}") + + # If the file was staged by the agent, unstage it + with contextlib.suppress(subprocess.TimeoutExpired, OSError): + subprocess.run( + ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME], + cwd=repo_dir, + capture_output=True, + timeout=30, + ) + + return content.strip() if content.strip() else None diff --git a/agent/src/workflow/models.py b/agent/src/workflow/models.py index 1411a16e..887d425a 100644 --- a/agent/src/workflow/models.py +++ b/agent/src/workflow/models.py @@ -31,6 +31,7 @@ "run_agent", "verify_build", "verify_lint", + "self_review", "ensure_pr", "post_review", "deliver_artifact", @@ -140,6 +141,9 @@ class Step(BaseModel): strategy: EnsurePrStrategy | None = None gate: VerifyGate | None = None target: DeliverTarget | None = None + # self_review: cap on turns the review loop may use (default applied by the + # handler when omitted). + max_turns: int | None = None class TerminalOutcomes(BaseModel): diff --git a/agent/src/workflow/runner.py b/agent/src/workflow/runner.py index 4f818689..b58008c1 100644 --- a/agent/src/workflow/runner.py +++ b/agent/src/workflow/runner.py @@ -375,6 +375,10 @@ def _milestone(ctx: StepContext, milestone: str) -> None: # pipeline.py) to avoid pulling the SDK / boto3 at module import and to keep the # orchestration core importable in isolation for tests. +# Default turn cap for a self_review step that omits max_turns (kept in sync +# with the schema's self_review max_turns default and self_review.py). +_DEFAULT_SELF_REVIEW_MAX_TURNS = 5 + def _handle_clone_repo(step: Step, ctx: StepContext) -> StepOutcome: """Clone + prepare the repo (replaces the inline ``setup_repo`` call). @@ -560,6 +564,72 @@ def _handle_verify_lint(step: Step, ctx: StepContext) -> StepOutcome: ) +def _handle_self_review(step: Step, ctx: StepContext) -> StepOutcome: + """Have the agent critique its own diff and fix issues before the PR opens. + + Declaring a ``self_review`` step in the workflow *is* the enablement — there + is no separate feature flag. The step runs a second, review-focused + ``run_agent`` loop over the cumulative branch diff (``self_review.run_self_review``), + capped at ``step.max_turns`` turns (default 5) drawn from the task's + remaining turn/budget allowance, and accumulates the review's turns/cost back + onto ``ctx.agent_result`` so the terminal result reflects the full task. + + Authored as an advisory step (``on_failure: continue``): a review that is + skipped (read-only, no diff, no remaining turns) or fails is recorded but + never blocks PR creation — matching the original fail-open design. The + summary the review agent writes is posted as a PR comment by the pipeline + after ``ensure_pr`` (``self_review_ran`` in this outcome's data signals it). + """ + if ctx.setup is None: + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="failed", + error="self_review requires a cloned repo (no clone_repo step ran)", + ) + if ctx.agent_result is None: + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="failed", + error="self_review requires a prior run_agent result to size its turn budget", + ) + + from self_review import run_self_review + + review_result = run_self_review( + ctx.config, + ctx.setup, + ctx.agent_result, + ctx.trajectory, + ctx.progress, + max_turns=step.max_turns or _DEFAULT_SELF_REVIEW_MAX_TURNS, + ) + if review_result is None: + # Skipped (read-only / empty diff / no remaining turns) — not a failure. + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="succeeded", + data={"self_review_ran": False}, + ) + + # Accumulate the review phase's turns/cost onto the running agent result so + # the terminal TaskResult reflects the whole task (implement + review). + ar = ctx.agent_result + ar.turns += review_result.turns + ar.num_turns += review_result.num_turns or review_result.turns + if review_result.cost_usd is not None: + ar.cost_usd = (ar.cost_usd or 0.0) + review_result.cost_usd + + return StepOutcome( + kind=step.kind, + name=_step_key(step), + status="succeeded", + data={"self_review_ran": True, "review_status": review_result.status}, + ) + + def _handle_ensure_pr(step: Step, ctx: StepContext) -> StepOutcome: """Create / push+resolve / resolve a PR per the step's ``strategy``. @@ -645,6 +715,7 @@ def _handle_deliver_artifact(step: Step, ctx: StepContext) -> StepOutcome: "run_agent": _handle_run_agent, "verify_build": _handle_verify_build, "verify_lint": _handle_verify_lint, + "self_review": _handle_self_review, "ensure_pr": _handle_ensure_pr, "post_review": _handle_post_review, "deliver_artifact": _handle_deliver_artifact, diff --git a/agent/src/workflow/validator.py b/agent/src/workflow/validator.py index 25f2a6ee..21f27c45 100644 --- a/agent/src/workflow/validator.py +++ b/agent/src/workflow/validator.py @@ -40,8 +40,12 @@ _SIDE_EFFECTING_KINDS = frozenset({"ensure_pr", "post_review", "deliver_artifact"}) # Steps that only make sense when a repo is cloned (rule 3 / rule 7). +# self_review diffs and re-runs the agent against the cloned working tree, so it +# is repo-only too. It is deliberately NOT in _SIDE_EFFECTING_KINDS: it commits +# locally but never pushes or opens a PR, so it may be marked on_failure: +# continue (advisory / fail-open). _REPO_ONLY_KINDS = frozenset( - {"clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint"} + {"clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint", "self_review"} ) # Built-in (Phase 1-3) policy modules / MCP servers. Registry refs (registry://) diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py new file mode 100644 index 00000000..11f62deb --- /dev/null +++ b/agent/tests/test_self_review.py @@ -0,0 +1,578 @@ +"""Unit tests for self_review.py — self-review orchestration module.""" + +from unittest.mock import MagicMock, patch + +from models import AgentResult, RepoSetup, TaskConfig +from self_review import ( + _MAX_DIFF_CHARS, + _SUMMARY_FILENAME, + _get_diff, + _truncate_diff, + read_self_review_summary, + run_self_review, +) + + +def _make_config(**overrides) -> TaskConfig: + """Create a minimal TaskConfig for testing.""" + base = TaskConfig( + repo_url="owner/repo", + github_token="ghp_test123", + aws_region="us-east-1", + task_description="Fix the bug", + max_turns=10, + ) + return base.model_copy(update=overrides) if overrides else base + + +def _make_setup(**overrides) -> RepoSetup: + """Create a minimal RepoSetup for testing.""" + base = RepoSetup( + repo_dir="/workspace/repo", + branch="feat/123-fix", + default_branch="main", + ) + return base.model_copy(update=overrides) if overrides else base + + +def _make_agent_result(**overrides) -> AgentResult: + """Create a minimal AgentResult for testing.""" + base = AgentResult( + status="success", + turns=5, + num_turns=5, + cost_usd=0.50, + ) + return base.model_copy(update=overrides) if overrides else base + + +class TestSkipConditions: + """Test all conditions that cause self-review to be skipped.""" + + def test_skip_for_read_only_workflow(self): + config = _make_config(read_only=True) + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_when_no_remaining_turns(self): + config = _make_config(max_turns=5) + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress, max_turns=5) + assert result is None + + def test_skip_when_no_remaining_budget(self): + config = _make_config(max_budget_usd=1.0) + setup = _make_setup() + agent_result = _make_agent_result(cost_usd=1.0) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + @patch("self_review._get_diff", return_value="") + def test_skip_when_empty_diff(self, mock_diff): + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + @patch("self_review._get_diff", return_value=" \n \n ") + def test_skip_when_whitespace_only_diff(self, mock_diff): + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + +class TestGetDiff: + """Test _get_diff helper.""" + + @patch("self_review.subprocess.run") + def test_returns_diff_output(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="diff --git a/f\n+line\n") + result = _get_diff("/repo", "main") + assert result == "diff --git a/f\n+line\n" + mock_run.assert_called_once_with( + ["git", "diff", "origin/main...HEAD"], + cwd="/repo", + capture_output=True, + text=True, + timeout=60, + ) + + @patch("self_review.subprocess.run") + def test_returns_empty_on_failure(self, mock_run): + mock_run.return_value = MagicMock(returncode=1, stdout="") + result = _get_diff("/repo", "main") + assert result == "" + + @patch("self_review.subprocess.run") + def test_returns_empty_on_timeout(self, mock_run): + import subprocess + + mock_run.side_effect = subprocess.TimeoutExpired(cmd="git", timeout=60) + result = _get_diff("/repo", "main") + assert result == "" + + @patch("self_review.subprocess.run") + def test_uses_custom_default_branch(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="some diff") + _get_diff("/repo", "develop") + mock_run.assert_called_once_with( + ["git", "diff", "origin/develop...HEAD"], + cwd="/repo", + capture_output=True, + text=True, + timeout=60, + ) + + +class TestTruncateDiff: + """Test _truncate_diff helper.""" + + def test_no_truncation_needed(self): + short_diff = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n+new line\n" + result = _truncate_diff(short_diff) + assert result == short_diff + + def test_truncates_at_hunk_boundary(self): + # Build a diff that exceeds the limit with multiple hunks + hunk1 = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n" + "+line\n" * 100 + hunk2 = "\n@@ -100,3 +101,4 @@\n" + "+more\n" * 100 + big_diff = hunk1 + hunk2 + result = _truncate_diff(big_diff, max_chars=len(hunk1) + 10) + # Should truncate before hunk2's @@ marker + assert "@@ -100,3" not in result + assert "truncated" in result + + def test_hard_cut_when_no_hunk_boundary(self): + # Single hunk that exceeds max + big_diff = "+x\n" * 30000 + result = _truncate_diff(big_diff, max_chars=100) + assert len(result) < len(big_diff) + assert "truncated" in result + + def test_exact_limit_no_truncation(self): + diff = "a" * _MAX_DIFF_CHARS + result = _truncate_diff(diff) + assert result == diff + + def test_one_over_limit_truncates(self): + diff = "a" * (_MAX_DIFF_CHARS + 1) + result = _truncate_diff(diff) + assert "truncated" in result + + +class TestBudgetAndTurnComputation: + """Test that budget and turn limits are computed correctly.""" + + @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") + @patch("self_review.asyncio.run") + def test_review_turns_capped_at_step_max_turns(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult(status="success", turns=2, num_turns=2) + config = _make_config(max_turns=100) + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + # The step's max_turns (3) caps the review even though 95 turns remain. + run_self_review(config, setup, agent_result, trajectory, progress, max_turns=3) + + call_args = mock_asyncio_run.call_args + coro = call_args[0][0] + # Close the coroutine to avoid warnings + coro.close() + + @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") + @patch("self_review.asyncio.run") + def test_review_turns_uses_remaining_when_less_than_cap(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult(status="success", turns=1, num_turns=1) + config = _make_config(max_turns=8) + setup = _make_setup() + agent_result = _make_agent_result(turns=6) # Only 2 remaining + trajectory = MagicMock() + progress = MagicMock() + + # Should use min(2 remaining, 5 cap) = 2 turns + run_self_review(config, setup, agent_result, trajectory, progress, max_turns=5) + + call_args = mock_asyncio_run.call_args + coro = call_args[0][0] + coro.close() + + +class TestHappyPath: + """Test the full happy path with mocked run_agent.""" + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_returns_review_result(self, mock_asyncio_run, mock_diff): + review_agent_result = AgentResult(status="success", turns=2, num_turns=2, cost_usd=0.10) + mock_asyncio_run.return_value = review_agent_result + + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + + assert result is not None + assert result.status == "success" + assert result.turns == 2 + assert result.cost_usd == 0.10 + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_writes_progress_milestones(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult( + status="success", turns=1, num_turns=1, cost_usd=0.05 + ) + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # Should write both started and complete milestones + milestone_calls = progress.write_agent_milestone.call_args_list + assert len(milestone_calls) == 2 + assert milestone_calls[0][0][0] == "self_review_started" + assert milestone_calls[1][0][0] == "self_review_complete" + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_fail_open_on_agent_error(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.side_effect = RuntimeError("SDK crashed") + + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + # Should not raise + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + # Should still write milestones + milestone_calls = progress.write_agent_milestone.call_args_list + assert milestone_calls[0][0][0] == "self_review_started" + assert milestone_calls[1][0][0] == "self_review_complete" + assert "error" in milestone_calls[1][0][1] + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult( + status="success", turns=1, num_turns=1, cost_usd=0.03 + ) + config = _make_config(max_budget_usd=None) + setup = _make_setup() + agent_result = _make_agent_result(cost_usd=None) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is not None + assert result.status == "success" + + +class TestReadSelfReviewSummary: + """Tests for read_self_review_summary — reads and cleans up the summary file.""" + + def test_returns_content_when_file_exists(self, tmp_path): + summary_content = ( + "### Self-Review Summary\n\n" + "**Findings:** 2\n" + "**Fixes applied:** 1\n\n" + "#### Issues found\n\n" + "- Security: hardcoded token — fixed\n" + "- Style: inconsistent naming — not fixed (cosmetic)\n" + ) + (tmp_path / _SUMMARY_FILENAME).write_text(summary_content) + + result = read_self_review_summary(str(tmp_path)) + + assert result == summary_content.strip() + + def test_returns_none_when_file_missing(self, tmp_path): + result = read_self_review_summary(str(tmp_path)) + assert result is None + + def test_deletes_file_after_reading(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("No issues found — code looks good.") + + read_self_review_summary(str(tmp_path)) + + assert not (tmp_path / _SUMMARY_FILENAME).exists() + + def test_returns_none_for_empty_file(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text(" \n\n ") + + result = read_self_review_summary(str(tmp_path)) + + assert result is None + + def test_returns_none_for_whitespace_only(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("\t\n ") + + result = read_self_review_summary(str(tmp_path)) + + assert result is None + + @patch("self_review.subprocess.run") + def test_runs_git_rm_cached_for_cleanup(self, mock_run, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("Some findings") + mock_run.return_value = MagicMock(returncode=0) + + read_self_review_summary(str(tmp_path)) + + mock_run.assert_called_once_with( + ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME], + cwd=str(tmp_path), + capture_output=True, + timeout=30, + ) + + @patch("self_review.subprocess.run", side_effect=OSError("git not found")) + def test_git_rm_failure_does_not_block(self, mock_run, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("Findings here") + + # Should not raise even if git rm fails + result = read_self_review_summary(str(tmp_path)) + + assert result == "Findings here" + + +class TestPostSelfReviewComment: + """Tests for post_hooks.post_self_review_comment.""" + + def test_posts_comment_on_success(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1\n**Fixes applied:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/42" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is True + call_args = mock_run.call_args[0][0] + assert call_args[0:3] == ["gh", "pr", "comment"] + assert "42" in call_args + assert "owner/repo" in call_args + + def test_returns_false_when_no_summary(self, tmp_path): + from post_hooks import post_self_review_comment + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/42" + + result = post_self_review_comment(str(tmp_path), pr_url, config) + assert result is False + + def test_returns_false_on_invalid_pr_url(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("Some findings") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/issues/42" + + result = post_self_review_comment(str(tmp_path), pr_url, config) + assert result is False + + def test_returns_false_on_gh_failure(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/99" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found") + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is False + + def test_fail_open_on_exception(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/5" + + with patch("post_hooks.subprocess.run", side_effect=OSError("network error")): + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is False + + def test_comment_body_includes_header(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 0\nNo issues.") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/7" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + post_self_review_comment(str(tmp_path), pr_url, config) + + call_args = mock_run.call_args[0][0] + body_idx = call_args.index("--body") + 1 + body = call_args[body_idx] + assert "Self-Review Summary" in body + assert "**Findings:** 0" in body + + +class TestSelfReviewStepHandler: + """Tests for the workflow ``self_review`` step handler (workflow/runner.py). + + The step's presence in a workflow is the enablement signal; the handler + wraps ``run_self_review`` and accumulates the review's turns/cost onto the + shared ``ctx.agent_result``. + """ + + def _ctx(self, *, setup, agent_result): + from workflow import Step, StepContext, Workflow + + wf = Workflow.model_validate( + { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "prompt": {"template": "registry://prompt/x"}, + "hydration": {"sources": ["task_description"]}, + "agent_config": {"tier": "standard", "allowed_tools": ["Bash"]}, + "steps": [ + {"kind": "run_agent"}, + {"kind": "self_review", "name": "review", "max_turns": 4}, + ], + "terminal_outcomes": {"primary": "pr_url"}, + "status": "production", + } + ) + step = next(s for s in wf.steps if s.kind == "self_review") + ctx = StepContext( + workflow=wf, + config=_make_config(), + setup=setup, + agent_result=agent_result, + trajectory=MagicMock(), + progress=MagicMock(), + ) + return step, ctx, Step + + def test_passes_step_max_turns_and_accumulates(self): + from workflow.runner import _handle_self_review + + setup = _make_setup() + agent_result = _make_agent_result(turns=5, num_turns=5, cost_usd=0.50) + step, ctx, _ = self._ctx(setup=setup, agent_result=agent_result) + + review = AgentResult(status="success", turns=2, num_turns=2, cost_usd=0.10) + with patch("self_review.run_self_review", return_value=review) as mock_review: + outcome = _handle_self_review(step, ctx) + + # The step's max_turns (4) is forwarded to run_self_review. + assert mock_review.call_args.kwargs["max_turns"] == 4 + assert outcome.status == "succeeded" + assert outcome.data["self_review_ran"] is True + # Review turns/cost accumulated onto the shared agent_result. + assert ctx.agent_result.turns == 7 + assert ctx.agent_result.num_turns == 7 + assert abs(ctx.agent_result.cost_usd - 0.60) < 1e-9 + + def test_skipped_review_records_not_ran(self): + from workflow.runner import _handle_self_review + + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + step, ctx, _ = self._ctx(setup=setup, agent_result=agent_result) + + with patch("self_review.run_self_review", return_value=None): + outcome = _handle_self_review(step, ctx) + + assert outcome.status == "succeeded" + assert outcome.data["self_review_ran"] is False + # No accumulation when the review was skipped. + assert ctx.agent_result.turns == 5 + + def test_fails_without_clone(self): + from workflow.runner import _handle_self_review + + agent_result = _make_agent_result() + step, ctx, _ = self._ctx(setup=None, agent_result=agent_result) + + outcome = _handle_self_review(step, ctx) + assert outcome.status == "failed" + assert "cloned repo" in (outcome.error or "") + + def test_default_max_turns_when_step_omits_it(self): + from workflow import Step, StepContext, Workflow + from workflow.runner import _DEFAULT_SELF_REVIEW_MAX_TURNS, _handle_self_review + + wf = Workflow.model_validate( + { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "prompt": {"template": "registry://prompt/x"}, + "hydration": {"sources": ["task_description"]}, + "agent_config": {"tier": "standard", "allowed_tools": ["Bash"]}, + "steps": [{"kind": "run_agent"}, {"kind": "self_review"}], + "terminal_outcomes": {"primary": "pr_url"}, + "status": "production", + } + ) + step = next(s for s in wf.steps if s.kind == "self_review") + assert step.max_turns is None + ctx = StepContext( + workflow=wf, + config=_make_config(), + setup=_make_setup(), + agent_result=_make_agent_result(turns=2), + trajectory=MagicMock(), + progress=MagicMock(), + ) + review = AgentResult(status="success", turns=1, num_turns=1) + with patch("self_review.run_self_review", return_value=review) as mock_review: + _handle_self_review(step, ctx) + assert mock_review.call_args.kwargs["max_turns"] == _DEFAULT_SELF_REVIEW_MAX_TURNS + # Step model unused beyond fixture; keep import referenced. + assert Step is not None diff --git a/agent/workflows/coding/new-task-v1.yaml b/agent/workflows/coding/new-task-v1.yaml index 7219fdf3..2454b007 100644 --- a/agent/workflows/coding/new-task-v1.yaml +++ b/agent/workflows/coding/new-task-v1.yaml @@ -42,6 +42,12 @@ steps: - { kind: clone_repo, name: setup } - { kind: hydrate_context, name: context } - { kind: run_agent, name: implement } + # Optional pre-PR self-review: the agent critiques its own cumulative diff and + # fixes issues before the PR is opened, then posts a summary as a PR comment. + # Advisory (on_failure: continue) and fail-open — a review failure/skip never + # blocks PR creation. Remove this step to disable self-review for the workflow; + # tune the turn budget via max_turns (1-20, default 5). + - { kind: self_review, name: review, on_failure: continue, max_turns: 5 } - { kind: verify_build, name: build, gate: regression_only } - { kind: ensure_pr, name: open_pr, strategy: create } terminal_outcomes: diff --git a/agent/workflows/schema/workflow.schema.json b/agent/workflows/schema/workflow.schema.json index b4b753bb..f68d867b 100644 --- a/agent/workflows/schema/workflow.schema.json +++ b/agent/workflows/schema/workflow.schema.json @@ -187,7 +187,7 @@ "kind": { "type": "string", "description": "Dispatches to a registered step handler.", - "enum": ["clone_repo", "hydrate_context", "run_agent", "verify_build", "verify_lint", "ensure_pr", "post_review", "deliver_artifact"] + "enum": ["clone_repo", "hydrate_context", "run_agent", "verify_build", "verify_lint", "self_review", "ensure_pr", "post_review", "deliver_artifact"] }, "name": { "type": "string", @@ -213,6 +213,12 @@ "type": "string", "description": "For deliver_artifact: the name of a registered Python deliverer (workflow/deliverers.py / DELIVERERS) to route the produced output through. Open string, not a closed enum, so a new delivery method is a registered deliverer, not a schema change (ADR-014 addendum 2026-06-08). First-party names: s3, comment, s3_and_comment. Each deliverer declares the terminal outcomes it produces; validator rule 11 consults that registry.", "minLength": 1 + }, + "max_turns": { + "type": "integer", + "description": "For self_review: cap on agent turns the self-review loop may use (drawn from the task's remaining turn/budget allowance). Defaults to 5 when omitted.", + "minimum": 1, + "maximum": 20 } } } @@ -295,7 +301,7 @@ "steps": { "items": { "properties": { - "kind": { "not": { "enum": ["clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint"] } } + "kind": { "not": { "enum": ["clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint", "self_review"] } } } } }, diff --git a/contracts/workflow-validation/rule3-repo-less-has-self-review.json b/contracts/workflow-validation/rule3-repo-less-has-self-review.json new file mode 100644 index 00000000..42378e53 --- /dev/null +++ b/contracts/workflow-validation/rule3-repo-less-has-self-review.json @@ -0,0 +1,69 @@ +{ + "name": "rule3-repo-less-has-self-review", + "description": "Repo-less knowledge workflow illegally declares a self_review step. self_review is repo-only (it diffs and re-runs the agent against a cloned tree), so both the schema's requires_repo:false conditional and cross-field rule 3 reject it.", + "workflow": { + "id": "knowledge/web-research-v1", + "version": "1.0.0", + "domain": "knowledge", + "requires_repo": false, + "read_only": false, + "prompt": { + "template": "registry://prompt/web-research-workflow" + }, + "hydration": { + "sources": [ + "task_description", + "urls" + ] + }, + "agent_config": { + "tier": "elevated", + "allowed_tools": [ + "Read", + "WebFetch" + ], + "cedar_policy_modules": [ + "builtin/hard_deny", + "builtin/soft_deny" + ] + }, + "repo_config": { + "discover": false + }, + "required_inputs": { + "all_of": [ + "task_description" + ] + }, + "steps": [ + { + "kind": "hydrate_context", + "name": "context" + }, + { + "kind": "run_agent", + "name": "research" + }, + { + "kind": "self_review", + "name": "review" + }, + { + "kind": "deliver_artifact", + "name": "deliver", + "target": "s3_and_comment" + } + ], + "terminal_outcomes": { + "primary": "artifact" + }, + "status": "production" + }, + "expected": { + "valid": false, + "violations": [ + "rule-3", + "schema" + ] + } +} diff --git a/contracts/workflow-validation/valid-coding-self-review-step.json b/contracts/workflow-validation/valid-coding-self-review-step.json new file mode 100644 index 00000000..3dccb547 --- /dev/null +++ b/contracts/workflow-validation/valid-coding-self-review-step.json @@ -0,0 +1,94 @@ +{ + "name": "valid-coding-self-review-step", + "description": "Coding workflow with an optional pre-PR self_review step (advisory, on_failure: continue). self_review is repo-only and non-side-effecting, so on_failure: continue is permitted (unlike ensure_pr).", + "workflow": { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "description": "Implement a GitHub issue, self-review, and open a PR.", + "requires_repo": true, + "read_only": false, + "prompt": { + "template": "registry://prompt/coding-new-task-workflow" + }, + "hydration": { + "sources": [ + "issue", + "memory", + "task_description" + ] + }, + "agent_config": { + "tier": "standard", + "allowed_tools": [ + "Bash", + "Read", + "Write", + "Edit", + "Glob", + "Grep", + "WebFetch" + ], + "cedar_policy_modules": [ + "builtin/hard_deny", + "builtin/soft_deny" + ] + }, + "repo_config": { + "provider": "github", + "discover": true + }, + "required_inputs": { + "one_of": [ + "issue_number", + "task_description" + ] + }, + "steps": [ + { + "kind": "clone_repo", + "name": "setup" + }, + { + "kind": "hydrate_context", + "name": "context" + }, + { + "kind": "run_agent", + "name": "implement" + }, + { + "kind": "self_review", + "name": "review", + "on_failure": "continue", + "max_turns": 5 + }, + { + "kind": "verify_build", + "name": "build", + "gate": "regression_only" + }, + { + "kind": "ensure_pr", + "name": "open_pr", + "strategy": "create" + } + ], + "terminal_outcomes": { + "primary": "pr_url" + }, + "limits": { + "max_turns": 100 + }, + "promotion_gate": { + "requires": [ + "tests:agent/new_task" + ] + }, + "status": "production" + }, + "expected": { + "valid": true, + "violations": [] + } +}