diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py
index 7718cb48..f4c29206 100644
--- a/agent/src/pipeline.py
+++ b/agent/src/pipeline.py
@@ -31,6 +31,7 @@
_extract_agent_notes,
ensure_committed,
ensure_pr,
+ post_self_review_comment,
verify_build,
verify_lint,
)
@@ -217,6 +218,50 @@ def _execute_agent_step(
return ctx.agent_result
+def _execute_self_review_step(
+ workflow: Workflow | None,
+ config,
+ setup,
+ agent_result,
+ hydrated,
+ trajectory,
+ progress,
+) -> bool:
+ """Drive the workflow's ``self_review`` step (if declared) through the runner.
+
+ Mirrors ``_execute_agent_step``: only the ``self_review`` step is dispatched
+ (``only_kinds={"self_review"}``) so clone / build / PR stay on the inline
+ path. The step's handler accumulates the review loop's turns/cost back onto
+ ``agent_result`` (a shared mutable model), so the terminal result reflects
+ implement + review.
+
+ Returns True when the review actually ran (so the caller posts the summary
+ PR comment after ``ensure_pr``); False when no ``self_review`` step is
+ declared, the workflow failed to reload, or the review was skipped (read-only
+ / empty diff / no remaining turns). Fully fail-open — a review failure is
+ recorded as a step outcome and never propagates to block PR creation.
+ """
+ if workflow is None or not any(s.kind == "self_review" for s in workflow.steps):
+ return False
+
+ from workflow import StepContext, run_workflow
+
+ ctx = StepContext(
+ workflow=workflow,
+ config=config,
+ hydrated=hydrated,
+ progress=progress,
+ trajectory=trajectory,
+ setup=setup,
+ # The implement step's result, threaded in so the handler can size the
+ # review's turn budget and accumulate its turns/cost onto it.
+ agent_result=agent_result,
+ )
+ with task_span("task.self_review"):
+ run_workflow(workflow, ctx, only_kinds={"self_review"})
+ return bool(ctx.artifacts.get("self_review_ran", False))
+
+
def _run_repoless_task(
*,
config,
@@ -1006,6 +1051,23 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
)
ensure_pr_strategy = "create"
+ # Self-review step: if the resolved workflow declares a ``self_review``
+ # step, drive it through the workflow runner (same pattern as
+ # ``_execute_agent_step``). The step has the LLM critique its own diff
+ # and fix issues, accumulating its turns/cost onto ``agent_result``.
+ # Runs AFTER the cancel short-circuit so a cancelled task never starts
+ # a second agent loop, and BEFORE post-hooks so fixes land in the PR.
+ # Fail-open: a review failure/skip never blocks PR creation.
+ self_review_ran = _execute_self_review_step(
+ _workflow,
+ config,
+ setup,
+ agent_result,
+ hc,
+ trajectory,
+ progress,
+ )
+
# Post-hooks (agent_result is guaranteed set by the try/except above)
with task_span("task.post_hooks") as post_span:
# Safety net: commit any uncommitted tracked changes (skip for read-only tasks)
@@ -1028,6 +1090,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
if pr_url:
progress.write_agent_milestone("pr_created", pr_url)
+ # Post self-review summary as PR comment (if the self_review step ran)
+ if pr_url and self_review_ran:
+ post_self_review_comment(setup.repo_dir, pr_url, config)
+
# Memory write — capture task episode and repo learnings
memory_written = False
effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "")
diff --git a/agent/src/post_hooks.py b/agent/src/post_hooks.py
index 058a1e4c..00268847 100644
--- a/agent/src/post_hooks.py
+++ b/agent/src/post_hooks.py
@@ -392,6 +392,67 @@ def ensure_pr(
return None
+def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool:
+ """Post the self-review summary as a PR comment.
+
+ Reads the summary file written by the self-review agent, formats it as a
+ comment, and posts it via `gh pr comment`. Fail-open: exceptions are logged
+ but never propagated.
+
+ Returns True if a comment was posted, False otherwise.
+ """
+ from self_review import read_self_review_summary
+
+ try:
+ summary = read_self_review_summary(repo_dir)
+ except Exception as e:
+ log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}")
+ return False
+
+ if not summary:
+ log("POST", "post_self_review_comment: no summary file found — skipping")
+ return False
+
+ # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123)
+ match = re.search(r"/pull/(\d+)", pr_url)
+ if not match:
+ log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}")
+ return False
+ pr_number = match.group(1)
+
+ comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}"
+
+ try:
+ result = subprocess.run(
+ [
+ "gh",
+ "pr",
+ "comment",
+ pr_number,
+ "--repo",
+ config.repo_url,
+ "--body",
+ comment_body,
+ ],
+ cwd=repo_dir,
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+ if result.returncode == 0:
+ log("POST", f"Self-review summary posted as comment on PR #{pr_number}")
+ return True
+ stderr = result.stderr.strip()[:200] if result.stderr else ""
+ log(
+ "WARN",
+ f"post_self_review_comment: gh pr comment failed (rc={result.returncode}): {stderr}",
+ )
+ return False
+ except (subprocess.TimeoutExpired, OSError) as e:
+ log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}")
+ return False
+
+
def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None:
"""Extract the "## Agent notes" section from the PR body.
diff --git a/agent/src/prompts/__init__.py b/agent/src/prompts/__init__.py
index 60c5b2c0..8334c10b 100644
--- a/agent/src/prompts/__init__.py
+++ b/agent/src/prompts/__init__.py
@@ -13,6 +13,7 @@
from .new_task import NEW_TASK_WORKFLOW
from .pr_iteration import PR_ITERATION_WORKFLOW
from .pr_review import PR_REVIEW_WORKFLOW
+from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT
from .web_research import WEB_RESEARCH_PROMPT
DEFAULT_WORKFLOW_ID = "coding/new-task-v1"
diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py
new file mode 100644
index 00000000..25cf47a2
--- /dev/null
+++ b/agent/src/prompts/self_review.py
@@ -0,0 +1,61 @@
+"""Self-review prompt template for pre-PR diff critique."""
+
+SELF_REVIEW_PROMPT = """\
+You are reviewing your own work before it becomes a pull request. Below is the \
+cumulative diff of all changes on this branch compared to the base branch.
+
+
+{diff}
+
+
+## Task context
+
+{task_description}
+
+## Review checklist
+
+Examine the diff carefully for:
+
+1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \
+incorrect assumptions about data shapes or API contracts.
+2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \
+race conditions.
+3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \
+insecure defaults, OWASP Top 10 issues.
+4. **Style & consistency** — Naming conventions, code style violations relative to \
+the surrounding codebase, unnecessary complexity.
+5. **Test gaps** — Important behaviour that is untested, assertions that don't \
+verify the right thing, missing edge-case coverage.
+
+## Instructions
+
+- If you find issues, fix them directly: edit the files, run the build/tests to \
+verify your fixes, and commit the changes.
+- If no issues are found, stop immediately — do not make changes for the sake of \
+making changes.
+- Do NOT refactor code that was not part of the original diff unless it has a \
+concrete bug or security issue.
+- Keep fixes minimal and focused. Each fix should be a separate commit with a \
+clear message.
+
+## Summary output
+
+After completing your review (whether you made fixes or not), write a file \
+`.self-review-summary.md` in the repository root with your findings in this format:
+
+```markdown
+### Self-Review Summary
+
+**Findings:**
+**Fixes applied:**
+
+#### Issues found
+
+- : —
+```
+
+If no issues were found, write the file with: "No issues found — code looks good."
+
+This file is a pipeline artifact and will be deleted automatically — it will NOT \
+appear in the pull request.
+"""
diff --git a/agent/src/self_review.py b/agent/src/self_review.py
new file mode 100644
index 00000000..0c7634cb
--- /dev/null
+++ b/agent/src/self_review.py
@@ -0,0 +1,253 @@
+"""Self-review orchestration: LLM critiques its own diff before PR creation."""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import os
+import subprocess
+from typing import TYPE_CHECKING
+
+from prompts.self_review import SELF_REVIEW_PROMPT
+from shell import log
+
+if TYPE_CHECKING:
+ from models import AgentResult, RepoSetup, TaskConfig
+ from progress_writer import _ProgressWriter
+ from telemetry import _TrajectoryWriter
+
+# Diff truncation limit (characters). Large diffs are cut at hunk boundaries.
+_MAX_DIFF_CHARS = 60_000
+
+# Default cap on review-loop turns when the ``self_review`` step omits max_turns.
+_DEFAULT_REVIEW_MAX_TURNS = 5
+
+# Minimal system prompt for the self-review agent invocation.
+_REVIEW_SYSTEM_PROMPT = """\
+You are a code reviewer working inside the repository {repo_url} on branch {branch_name}.
+Your working directory is {repo_dir}.
+
+You have full access to the filesystem and can run commands. Fix any issues you \
+find directly — edit files, run the build, and commit fixes. Keep changes minimal \
+and focused.
+
+Do NOT open a pull request or push. Just fix issues and commit locally.
+"""
+
+
+def _get_diff(repo_dir: str, default_branch: str) -> str:
+ """Generate the cumulative diff of the branch vs origin/{default_branch}."""
+ try:
+ result = subprocess.run(
+ ["git", "diff", f"origin/{default_branch}...HEAD"],
+ cwd=repo_dir,
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+ if result.returncode != 0:
+ log("WARN", f"self_review: git diff failed (exit {result.returncode})")
+ return ""
+ return result.stdout
+ except (subprocess.TimeoutExpired, OSError) as e:
+ log("WARN", f"self_review: git diff error: {type(e).__name__}: {e}")
+ return ""
+
+
+def _truncate_diff(diff: str, max_chars: int = _MAX_DIFF_CHARS) -> str:
+ """Truncate diff at a hunk boundary if it exceeds max_chars.
+
+ Cuts at the last complete hunk (line starting with '@@') that fits
+ within the limit, appending a truncation notice.
+ """
+ if len(diff) <= max_chars:
+ return diff
+
+ # Find the last hunk header that starts before max_chars
+ truncated = diff[:max_chars]
+ last_hunk = truncated.rfind("\n@@")
+ if last_hunk > 0:
+ # Cut just before this hunk header
+ truncated = truncated[:last_hunk]
+ else:
+ # No hunk boundary found — hard-cut at max_chars
+ last_newline = truncated.rfind("\n")
+ if last_newline > 0:
+ truncated = truncated[:last_newline]
+
+ total_lines = diff.count("\n")
+ kept_lines = truncated.count("\n")
+ truncated += (
+ f"\n\n... [diff truncated: showing ~{kept_lines} of ~{total_lines} lines; "
+ f"{len(diff) - len(truncated)} chars omitted] ..."
+ )
+ return truncated
+
+
+def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str:
+ """Build a minimal system prompt for the self-review agent."""
+ return _REVIEW_SYSTEM_PROMPT.format(
+ repo_url=config.repo_url,
+ branch_name=setup.branch,
+ repo_dir=setup.repo_dir,
+ )
+
+
+def _milestone(progress: _ProgressWriter | None, name: str, detail: str) -> None:
+ """Emit a progress milestone if a writer is wired up (no-op otherwise)."""
+ if progress is not None:
+ progress.write_agent_milestone(name, detail)
+
+
+def run_self_review(
+ config: TaskConfig,
+ setup: RepoSetup,
+ agent_result: AgentResult,
+ trajectory: _TrajectoryWriter | None,
+ progress: _ProgressWriter | None,
+ *,
+ max_turns: int = _DEFAULT_REVIEW_MAX_TURNS,
+) -> AgentResult | None:
+ """Run the self-review phase: LLM critiques its own diff and fixes issues.
+
+ Invoked by the ``self_review`` workflow step handler. The step's presence in
+ the resolved workflow is the enablement signal — there is no separate
+ feature flag; ``max_turns`` comes from the step (``self_review.max_turns``,
+ default 5) and caps the review loop within the task's remaining allowance.
+
+ Returns the AgentResult from the review phase, or None if skipped.
+ Fail-open: errors are logged but never block the pipeline.
+ """
+ # Skip condition: read-only workflows produce no diff to review
+ if config.read_only:
+ log("TASK", "self_review: skipped for read-only workflow")
+ return None
+
+ # Compute remaining turns
+ used_turns = agent_result.turns or 0
+ remaining_turns = config.max_turns - used_turns
+ review_turns = min(remaining_turns, max_turns)
+ if review_turns <= 0:
+ log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})")
+ return None
+
+ # Compute remaining budget
+ review_budget: float | None = None
+ if config.max_budget_usd is not None:
+ used_cost = agent_result.cost_usd or 0.0
+ remaining_budget = config.max_budget_usd - used_cost
+ if remaining_budget <= 0:
+ log(
+ "TASK",
+ f"self_review: no remaining budget "
+ f"(used=${used_cost:.2f}, max=${config.max_budget_usd:.2f})",
+ )
+ return None
+ review_budget = remaining_budget
+
+ # Get the diff
+ diff = _get_diff(setup.repo_dir, setup.default_branch)
+ if not diff.strip():
+ log("TASK", "self_review: no diff found — skipping")
+ return None
+
+ # Truncate if needed
+ diff = _truncate_diff(diff)
+
+ # Build the review prompt
+ task_desc = config.task_description or f"Issue #{config.issue_number}"
+ user_prompt = SELF_REVIEW_PROMPT.format(diff=diff, task_description=task_desc)
+ system_prompt = _build_review_system_prompt(config, setup)
+
+ # Build a modified config for the review run
+ review_config = config.model_copy(
+ update={
+ "max_turns": review_turns,
+ "max_budget_usd": review_budget,
+ }
+ )
+
+ log(
+ "TASK",
+ f"self_review: starting (turns={review_turns}, "
+ f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, "
+ f"diff_chars={len(diff)})",
+ )
+ _milestone(
+ progress,
+ "self_review_started",
+ f"turns={review_turns} diff_chars={len(diff)}",
+ )
+
+ try:
+ from runner import run_agent
+
+ review_result = asyncio.run(
+ run_agent(
+ user_prompt,
+ system_prompt,
+ review_config,
+ cwd=setup.repo_dir,
+ trajectory=trajectory,
+ )
+ )
+ except Exception as e:
+ # Fail-open: self-review errors never block the pipeline
+ log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}")
+ _milestone(
+ progress,
+ "self_review_complete",
+ f"status=error error={type(e).__name__}: {e}",
+ )
+ return None
+
+ log(
+ "TASK",
+ f"self_review: complete (status={review_result.status}, "
+ f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})",
+ )
+ _milestone(
+ progress,
+ "self_review_complete",
+ f"status={review_result.status} turns={review_result.turns}",
+ )
+ return review_result
+
+
+_SUMMARY_FILENAME = ".self-review-summary.md"
+
+
+def read_self_review_summary(repo_dir: str) -> str | None:
+ """Read and delete the self-review summary file.
+
+ The self-review agent writes `.self-review-summary.md` in the repo root.
+ This function reads the content, removes the file (so it never appears in
+ the PR), and returns the content. Returns None if the file doesn't exist.
+ """
+ summary_path = os.path.join(repo_dir, _SUMMARY_FILENAME)
+ if not os.path.isfile(summary_path):
+ return None
+
+ try:
+ with open(summary_path) as f:
+ content = f.read()
+ except OSError as e:
+ log("WARN", f"self_review: failed to read summary file: {type(e).__name__}: {e}")
+ return None
+
+ # Remove the file so it doesn't end up in the PR
+ try:
+ os.remove(summary_path)
+ except OSError as e:
+ log("WARN", f"self_review: failed to delete summary file: {type(e).__name__}: {e}")
+
+ # If the file was staged by the agent, unstage it
+ with contextlib.suppress(subprocess.TimeoutExpired, OSError):
+ subprocess.run(
+ ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME],
+ cwd=repo_dir,
+ capture_output=True,
+ timeout=30,
+ )
+
+ return content.strip() if content.strip() else None
diff --git a/agent/src/workflow/models.py b/agent/src/workflow/models.py
index 1411a16e..887d425a 100644
--- a/agent/src/workflow/models.py
+++ b/agent/src/workflow/models.py
@@ -31,6 +31,7 @@
"run_agent",
"verify_build",
"verify_lint",
+ "self_review",
"ensure_pr",
"post_review",
"deliver_artifact",
@@ -140,6 +141,9 @@ class Step(BaseModel):
strategy: EnsurePrStrategy | None = None
gate: VerifyGate | None = None
target: DeliverTarget | None = None
+ # self_review: cap on turns the review loop may use (default applied by the
+ # handler when omitted).
+ max_turns: int | None = None
class TerminalOutcomes(BaseModel):
diff --git a/agent/src/workflow/runner.py b/agent/src/workflow/runner.py
index 4f818689..b58008c1 100644
--- a/agent/src/workflow/runner.py
+++ b/agent/src/workflow/runner.py
@@ -375,6 +375,10 @@ def _milestone(ctx: StepContext, milestone: str) -> None:
# pipeline.py) to avoid pulling the SDK / boto3 at module import and to keep the
# orchestration core importable in isolation for tests.
+# Default turn cap for a self_review step that omits max_turns (kept in sync
+# with the schema's self_review max_turns default and self_review.py).
+_DEFAULT_SELF_REVIEW_MAX_TURNS = 5
+
def _handle_clone_repo(step: Step, ctx: StepContext) -> StepOutcome:
"""Clone + prepare the repo (replaces the inline ``setup_repo`` call).
@@ -560,6 +564,72 @@ def _handle_verify_lint(step: Step, ctx: StepContext) -> StepOutcome:
)
+def _handle_self_review(step: Step, ctx: StepContext) -> StepOutcome:
+ """Have the agent critique its own diff and fix issues before the PR opens.
+
+ Declaring a ``self_review`` step in the workflow *is* the enablement — there
+ is no separate feature flag. The step runs a second, review-focused
+ ``run_agent`` loop over the cumulative branch diff (``self_review.run_self_review``),
+ capped at ``step.max_turns`` turns (default 5) drawn from the task's
+ remaining turn/budget allowance, and accumulates the review's turns/cost back
+ onto ``ctx.agent_result`` so the terminal result reflects the full task.
+
+ Authored as an advisory step (``on_failure: continue``): a review that is
+ skipped (read-only, no diff, no remaining turns) or fails is recorded but
+ never blocks PR creation — matching the original fail-open design. The
+ summary the review agent writes is posted as a PR comment by the pipeline
+ after ``ensure_pr`` (``self_review_ran`` in this outcome's data signals it).
+ """
+ if ctx.setup is None:
+ return StepOutcome(
+ kind=step.kind,
+ name=_step_key(step),
+ status="failed",
+ error="self_review requires a cloned repo (no clone_repo step ran)",
+ )
+ if ctx.agent_result is None:
+ return StepOutcome(
+ kind=step.kind,
+ name=_step_key(step),
+ status="failed",
+ error="self_review requires a prior run_agent result to size its turn budget",
+ )
+
+ from self_review import run_self_review
+
+ review_result = run_self_review(
+ ctx.config,
+ ctx.setup,
+ ctx.agent_result,
+ ctx.trajectory,
+ ctx.progress,
+ max_turns=step.max_turns or _DEFAULT_SELF_REVIEW_MAX_TURNS,
+ )
+ if review_result is None:
+ # Skipped (read-only / empty diff / no remaining turns) — not a failure.
+ return StepOutcome(
+ kind=step.kind,
+ name=_step_key(step),
+ status="succeeded",
+ data={"self_review_ran": False},
+ )
+
+ # Accumulate the review phase's turns/cost onto the running agent result so
+ # the terminal TaskResult reflects the whole task (implement + review).
+ ar = ctx.agent_result
+ ar.turns += review_result.turns
+ ar.num_turns += review_result.num_turns or review_result.turns
+ if review_result.cost_usd is not None:
+ ar.cost_usd = (ar.cost_usd or 0.0) + review_result.cost_usd
+
+ return StepOutcome(
+ kind=step.kind,
+ name=_step_key(step),
+ status="succeeded",
+ data={"self_review_ran": True, "review_status": review_result.status},
+ )
+
+
def _handle_ensure_pr(step: Step, ctx: StepContext) -> StepOutcome:
"""Create / push+resolve / resolve a PR per the step's ``strategy``.
@@ -645,6 +715,7 @@ def _handle_deliver_artifact(step: Step, ctx: StepContext) -> StepOutcome:
"run_agent": _handle_run_agent,
"verify_build": _handle_verify_build,
"verify_lint": _handle_verify_lint,
+ "self_review": _handle_self_review,
"ensure_pr": _handle_ensure_pr,
"post_review": _handle_post_review,
"deliver_artifact": _handle_deliver_artifact,
diff --git a/agent/src/workflow/validator.py b/agent/src/workflow/validator.py
index 25f2a6ee..21f27c45 100644
--- a/agent/src/workflow/validator.py
+++ b/agent/src/workflow/validator.py
@@ -40,8 +40,12 @@
_SIDE_EFFECTING_KINDS = frozenset({"ensure_pr", "post_review", "deliver_artifact"})
# Steps that only make sense when a repo is cloned (rule 3 / rule 7).
+# self_review diffs and re-runs the agent against the cloned working tree, so it
+# is repo-only too. It is deliberately NOT in _SIDE_EFFECTING_KINDS: it commits
+# locally but never pushes or opens a PR, so it may be marked on_failure:
+# continue (advisory / fail-open).
_REPO_ONLY_KINDS = frozenset(
- {"clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint"}
+ {"clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint", "self_review"}
)
# Built-in (Phase 1-3) policy modules / MCP servers. Registry refs (registry://)
diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py
new file mode 100644
index 00000000..11f62deb
--- /dev/null
+++ b/agent/tests/test_self_review.py
@@ -0,0 +1,578 @@
+"""Unit tests for self_review.py — self-review orchestration module."""
+
+from unittest.mock import MagicMock, patch
+
+from models import AgentResult, RepoSetup, TaskConfig
+from self_review import (
+ _MAX_DIFF_CHARS,
+ _SUMMARY_FILENAME,
+ _get_diff,
+ _truncate_diff,
+ read_self_review_summary,
+ run_self_review,
+)
+
+
+def _make_config(**overrides) -> TaskConfig:
+ """Create a minimal TaskConfig for testing."""
+ base = TaskConfig(
+ repo_url="owner/repo",
+ github_token="ghp_test123",
+ aws_region="us-east-1",
+ task_description="Fix the bug",
+ max_turns=10,
+ )
+ return base.model_copy(update=overrides) if overrides else base
+
+
+def _make_setup(**overrides) -> RepoSetup:
+ """Create a minimal RepoSetup for testing."""
+ base = RepoSetup(
+ repo_dir="/workspace/repo",
+ branch="feat/123-fix",
+ default_branch="main",
+ )
+ return base.model_copy(update=overrides) if overrides else base
+
+
+def _make_agent_result(**overrides) -> AgentResult:
+ """Create a minimal AgentResult for testing."""
+ base = AgentResult(
+ status="success",
+ turns=5,
+ num_turns=5,
+ cost_usd=0.50,
+ )
+ return base.model_copy(update=overrides) if overrides else base
+
+
+class TestSkipConditions:
+ """Test all conditions that cause self-review to be skipped."""
+
+ def test_skip_for_read_only_workflow(self):
+ config = _make_config(read_only=True)
+ setup = _make_setup()
+ agent_result = _make_agent_result()
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+ assert result is None
+
+ def test_skip_when_no_remaining_turns(self):
+ config = _make_config(max_turns=5)
+ setup = _make_setup()
+ agent_result = _make_agent_result(turns=5)
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress, max_turns=5)
+ assert result is None
+
+ def test_skip_when_no_remaining_budget(self):
+ config = _make_config(max_budget_usd=1.0)
+ setup = _make_setup()
+ agent_result = _make_agent_result(cost_usd=1.0)
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+ assert result is None
+
+ @patch("self_review._get_diff", return_value="")
+ def test_skip_when_empty_diff(self, mock_diff):
+ config = _make_config()
+ setup = _make_setup()
+ agent_result = _make_agent_result()
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+ assert result is None
+
+ @patch("self_review._get_diff", return_value=" \n \n ")
+ def test_skip_when_whitespace_only_diff(self, mock_diff):
+ config = _make_config()
+ setup = _make_setup()
+ agent_result = _make_agent_result()
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+ assert result is None
+
+
+class TestGetDiff:
+ """Test _get_diff helper."""
+
+ @patch("self_review.subprocess.run")
+ def test_returns_diff_output(self, mock_run):
+ mock_run.return_value = MagicMock(returncode=0, stdout="diff --git a/f\n+line\n")
+ result = _get_diff("/repo", "main")
+ assert result == "diff --git a/f\n+line\n"
+ mock_run.assert_called_once_with(
+ ["git", "diff", "origin/main...HEAD"],
+ cwd="/repo",
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+
+ @patch("self_review.subprocess.run")
+ def test_returns_empty_on_failure(self, mock_run):
+ mock_run.return_value = MagicMock(returncode=1, stdout="")
+ result = _get_diff("/repo", "main")
+ assert result == ""
+
+ @patch("self_review.subprocess.run")
+ def test_returns_empty_on_timeout(self, mock_run):
+ import subprocess
+
+ mock_run.side_effect = subprocess.TimeoutExpired(cmd="git", timeout=60)
+ result = _get_diff("/repo", "main")
+ assert result == ""
+
+ @patch("self_review.subprocess.run")
+ def test_uses_custom_default_branch(self, mock_run):
+ mock_run.return_value = MagicMock(returncode=0, stdout="some diff")
+ _get_diff("/repo", "develop")
+ mock_run.assert_called_once_with(
+ ["git", "diff", "origin/develop...HEAD"],
+ cwd="/repo",
+ capture_output=True,
+ text=True,
+ timeout=60,
+ )
+
+
+class TestTruncateDiff:
+ """Test _truncate_diff helper."""
+
+ def test_no_truncation_needed(self):
+ short_diff = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n+new line\n"
+ result = _truncate_diff(short_diff)
+ assert result == short_diff
+
+ def test_truncates_at_hunk_boundary(self):
+ # Build a diff that exceeds the limit with multiple hunks
+ hunk1 = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n" + "+line\n" * 100
+ hunk2 = "\n@@ -100,3 +101,4 @@\n" + "+more\n" * 100
+ big_diff = hunk1 + hunk2
+ result = _truncate_diff(big_diff, max_chars=len(hunk1) + 10)
+ # Should truncate before hunk2's @@ marker
+ assert "@@ -100,3" not in result
+ assert "truncated" in result
+
+ def test_hard_cut_when_no_hunk_boundary(self):
+ # Single hunk that exceeds max
+ big_diff = "+x\n" * 30000
+ result = _truncate_diff(big_diff, max_chars=100)
+ assert len(result) < len(big_diff)
+ assert "truncated" in result
+
+ def test_exact_limit_no_truncation(self):
+ diff = "a" * _MAX_DIFF_CHARS
+ result = _truncate_diff(diff)
+ assert result == diff
+
+ def test_one_over_limit_truncates(self):
+ diff = "a" * (_MAX_DIFF_CHARS + 1)
+ result = _truncate_diff(diff)
+ assert "truncated" in result
+
+
+class TestBudgetAndTurnComputation:
+ """Test that budget and turn limits are computed correctly."""
+
+ @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n")
+ @patch("self_review.asyncio.run")
+ def test_review_turns_capped_at_step_max_turns(self, mock_asyncio_run, mock_diff):
+ mock_asyncio_run.return_value = AgentResult(status="success", turns=2, num_turns=2)
+ config = _make_config(max_turns=100)
+ setup = _make_setup()
+ agent_result = _make_agent_result(turns=5)
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ # The step's max_turns (3) caps the review even though 95 turns remain.
+ run_self_review(config, setup, agent_result, trajectory, progress, max_turns=3)
+
+ call_args = mock_asyncio_run.call_args
+ coro = call_args[0][0]
+ # Close the coroutine to avoid warnings
+ coro.close()
+
+ @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n")
+ @patch("self_review.asyncio.run")
+ def test_review_turns_uses_remaining_when_less_than_cap(self, mock_asyncio_run, mock_diff):
+ mock_asyncio_run.return_value = AgentResult(status="success", turns=1, num_turns=1)
+ config = _make_config(max_turns=8)
+ setup = _make_setup()
+ agent_result = _make_agent_result(turns=6) # Only 2 remaining
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ # Should use min(2 remaining, 5 cap) = 2 turns
+ run_self_review(config, setup, agent_result, trajectory, progress, max_turns=5)
+
+ call_args = mock_asyncio_run.call_args
+ coro = call_args[0][0]
+ coro.close()
+
+
+class TestHappyPath:
+ """Test the full happy path with mocked run_agent."""
+
+ @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+ @patch("self_review.asyncio.run")
+ def test_returns_review_result(self, mock_asyncio_run, mock_diff):
+ review_agent_result = AgentResult(status="success", turns=2, num_turns=2, cost_usd=0.10)
+ mock_asyncio_run.return_value = review_agent_result
+
+ config = _make_config()
+ setup = _make_setup()
+ agent_result = _make_agent_result(turns=5)
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+
+ assert result is not None
+ assert result.status == "success"
+ assert result.turns == 2
+ assert result.cost_usd == 0.10
+
+ @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+ @patch("self_review.asyncio.run")
+ def test_writes_progress_milestones(self, mock_asyncio_run, mock_diff):
+ mock_asyncio_run.return_value = AgentResult(
+ status="success", turns=1, num_turns=1, cost_usd=0.05
+ )
+ config = _make_config()
+ setup = _make_setup()
+ agent_result = _make_agent_result()
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ run_self_review(config, setup, agent_result, trajectory, progress)
+
+ # Should write both started and complete milestones
+ milestone_calls = progress.write_agent_milestone.call_args_list
+ assert len(milestone_calls) == 2
+ assert milestone_calls[0][0][0] == "self_review_started"
+ assert milestone_calls[1][0][0] == "self_review_complete"
+
+ @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+ @patch("self_review.asyncio.run")
+ def test_fail_open_on_agent_error(self, mock_asyncio_run, mock_diff):
+ mock_asyncio_run.side_effect = RuntimeError("SDK crashed")
+
+ config = _make_config()
+ setup = _make_setup()
+ agent_result = _make_agent_result()
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ # Should not raise
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+ assert result is None
+
+ # Should still write milestones
+ milestone_calls = progress.write_agent_milestone.call_args_list
+ assert milestone_calls[0][0][0] == "self_review_started"
+ assert milestone_calls[1][0][0] == "self_review_complete"
+ assert "error" in milestone_calls[1][0][1]
+
+ @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+ @patch("self_review.asyncio.run")
+ def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff):
+ mock_asyncio_run.return_value = AgentResult(
+ status="success", turns=1, num_turns=1, cost_usd=0.03
+ )
+ config = _make_config(max_budget_usd=None)
+ setup = _make_setup()
+ agent_result = _make_agent_result(cost_usd=None)
+ trajectory = MagicMock()
+ progress = MagicMock()
+
+ result = run_self_review(config, setup, agent_result, trajectory, progress)
+ assert result is not None
+ assert result.status == "success"
+
+
+class TestReadSelfReviewSummary:
+ """Tests for read_self_review_summary — reads and cleans up the summary file."""
+
+ def test_returns_content_when_file_exists(self, tmp_path):
+ summary_content = (
+ "### Self-Review Summary\n\n"
+ "**Findings:** 2\n"
+ "**Fixes applied:** 1\n\n"
+ "#### Issues found\n\n"
+ "- Security: hardcoded token — fixed\n"
+ "- Style: inconsistent naming — not fixed (cosmetic)\n"
+ )
+ (tmp_path / _SUMMARY_FILENAME).write_text(summary_content)
+
+ result = read_self_review_summary(str(tmp_path))
+
+ assert result == summary_content.strip()
+
+ def test_returns_none_when_file_missing(self, tmp_path):
+ result = read_self_review_summary(str(tmp_path))
+ assert result is None
+
+ def test_deletes_file_after_reading(self, tmp_path):
+ (tmp_path / _SUMMARY_FILENAME).write_text("No issues found — code looks good.")
+
+ read_self_review_summary(str(tmp_path))
+
+ assert not (tmp_path / _SUMMARY_FILENAME).exists()
+
+ def test_returns_none_for_empty_file(self, tmp_path):
+ (tmp_path / _SUMMARY_FILENAME).write_text(" \n\n ")
+
+ result = read_self_review_summary(str(tmp_path))
+
+ assert result is None
+
+ def test_returns_none_for_whitespace_only(self, tmp_path):
+ (tmp_path / _SUMMARY_FILENAME).write_text("\t\n ")
+
+ result = read_self_review_summary(str(tmp_path))
+
+ assert result is None
+
+ @patch("self_review.subprocess.run")
+ def test_runs_git_rm_cached_for_cleanup(self, mock_run, tmp_path):
+ (tmp_path / _SUMMARY_FILENAME).write_text("Some findings")
+ mock_run.return_value = MagicMock(returncode=0)
+
+ read_self_review_summary(str(tmp_path))
+
+ mock_run.assert_called_once_with(
+ ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME],
+ cwd=str(tmp_path),
+ capture_output=True,
+ timeout=30,
+ )
+
+ @patch("self_review.subprocess.run", side_effect=OSError("git not found"))
+ def test_git_rm_failure_does_not_block(self, mock_run, tmp_path):
+ (tmp_path / _SUMMARY_FILENAME).write_text("Findings here")
+
+ # Should not raise even if git rm fails
+ result = read_self_review_summary(str(tmp_path))
+
+ assert result == "Findings here"
+
+
+class TestPostSelfReviewComment:
+ """Tests for post_hooks.post_self_review_comment."""
+
+ def test_posts_comment_on_success(self, tmp_path):
+ from post_hooks import post_self_review_comment
+
+ (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1\n**Fixes applied:** 1")
+
+ config = MagicMock()
+ config.repo_url = "owner/repo"
+ pr_url = "https://github.com/owner/repo/pull/42"
+
+ with patch("post_hooks.subprocess.run") as mock_run:
+ mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+ result = post_self_review_comment(str(tmp_path), pr_url, config)
+
+ assert result is True
+ call_args = mock_run.call_args[0][0]
+ assert call_args[0:3] == ["gh", "pr", "comment"]
+ assert "42" in call_args
+ assert "owner/repo" in call_args
+
+ def test_returns_false_when_no_summary(self, tmp_path):
+ from post_hooks import post_self_review_comment
+
+ config = MagicMock()
+ config.repo_url = "owner/repo"
+ pr_url = "https://github.com/owner/repo/pull/42"
+
+ result = post_self_review_comment(str(tmp_path), pr_url, config)
+ assert result is False
+
+ def test_returns_false_on_invalid_pr_url(self, tmp_path):
+ from post_hooks import post_self_review_comment
+
+ (tmp_path / _SUMMARY_FILENAME).write_text("Some findings")
+
+ config = MagicMock()
+ config.repo_url = "owner/repo"
+ pr_url = "https://github.com/owner/repo/issues/42"
+
+ result = post_self_review_comment(str(tmp_path), pr_url, config)
+ assert result is False
+
+ def test_returns_false_on_gh_failure(self, tmp_path):
+ from post_hooks import post_self_review_comment
+
+ (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1")
+
+ config = MagicMock()
+ config.repo_url = "owner/repo"
+ pr_url = "https://github.com/owner/repo/pull/99"
+
+ with patch("post_hooks.subprocess.run") as mock_run:
+ mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found")
+ result = post_self_review_comment(str(tmp_path), pr_url, config)
+
+ assert result is False
+
+ def test_fail_open_on_exception(self, tmp_path):
+ from post_hooks import post_self_review_comment
+
+ (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1")
+
+ config = MagicMock()
+ config.repo_url = "owner/repo"
+ pr_url = "https://github.com/owner/repo/pull/5"
+
+ with patch("post_hooks.subprocess.run", side_effect=OSError("network error")):
+ result = post_self_review_comment(str(tmp_path), pr_url, config)
+
+ assert result is False
+
+ def test_comment_body_includes_header(self, tmp_path):
+ from post_hooks import post_self_review_comment
+
+ (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 0\nNo issues.")
+
+ config = MagicMock()
+ config.repo_url = "owner/repo"
+ pr_url = "https://github.com/owner/repo/pull/7"
+
+ with patch("post_hooks.subprocess.run") as mock_run:
+ mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+ post_self_review_comment(str(tmp_path), pr_url, config)
+
+ call_args = mock_run.call_args[0][0]
+ body_idx = call_args.index("--body") + 1
+ body = call_args[body_idx]
+ assert "Self-Review Summary" in body
+ assert "**Findings:** 0" in body
+
+
+class TestSelfReviewStepHandler:
+ """Tests for the workflow ``self_review`` step handler (workflow/runner.py).
+
+ The step's presence in a workflow is the enablement signal; the handler
+ wraps ``run_self_review`` and accumulates the review's turns/cost onto the
+ shared ``ctx.agent_result``.
+ """
+
+ def _ctx(self, *, setup, agent_result):
+ from workflow import Step, StepContext, Workflow
+
+ wf = Workflow.model_validate(
+ {
+ "id": "coding/new-task-v1",
+ "version": "1.0.0",
+ "domain": "coding",
+ "prompt": {"template": "registry://prompt/x"},
+ "hydration": {"sources": ["task_description"]},
+ "agent_config": {"tier": "standard", "allowed_tools": ["Bash"]},
+ "steps": [
+ {"kind": "run_agent"},
+ {"kind": "self_review", "name": "review", "max_turns": 4},
+ ],
+ "terminal_outcomes": {"primary": "pr_url"},
+ "status": "production",
+ }
+ )
+ step = next(s for s in wf.steps if s.kind == "self_review")
+ ctx = StepContext(
+ workflow=wf,
+ config=_make_config(),
+ setup=setup,
+ agent_result=agent_result,
+ trajectory=MagicMock(),
+ progress=MagicMock(),
+ )
+ return step, ctx, Step
+
+ def test_passes_step_max_turns_and_accumulates(self):
+ from workflow.runner import _handle_self_review
+
+ setup = _make_setup()
+ agent_result = _make_agent_result(turns=5, num_turns=5, cost_usd=0.50)
+ step, ctx, _ = self._ctx(setup=setup, agent_result=agent_result)
+
+ review = AgentResult(status="success", turns=2, num_turns=2, cost_usd=0.10)
+ with patch("self_review.run_self_review", return_value=review) as mock_review:
+ outcome = _handle_self_review(step, ctx)
+
+ # The step's max_turns (4) is forwarded to run_self_review.
+ assert mock_review.call_args.kwargs["max_turns"] == 4
+ assert outcome.status == "succeeded"
+ assert outcome.data["self_review_ran"] is True
+ # Review turns/cost accumulated onto the shared agent_result.
+ assert ctx.agent_result.turns == 7
+ assert ctx.agent_result.num_turns == 7
+ assert abs(ctx.agent_result.cost_usd - 0.60) < 1e-9
+
+ def test_skipped_review_records_not_ran(self):
+ from workflow.runner import _handle_self_review
+
+ setup = _make_setup()
+ agent_result = _make_agent_result(turns=5)
+ step, ctx, _ = self._ctx(setup=setup, agent_result=agent_result)
+
+ with patch("self_review.run_self_review", return_value=None):
+ outcome = _handle_self_review(step, ctx)
+
+ assert outcome.status == "succeeded"
+ assert outcome.data["self_review_ran"] is False
+ # No accumulation when the review was skipped.
+ assert ctx.agent_result.turns == 5
+
+ def test_fails_without_clone(self):
+ from workflow.runner import _handle_self_review
+
+ agent_result = _make_agent_result()
+ step, ctx, _ = self._ctx(setup=None, agent_result=agent_result)
+
+ outcome = _handle_self_review(step, ctx)
+ assert outcome.status == "failed"
+ assert "cloned repo" in (outcome.error or "")
+
+ def test_default_max_turns_when_step_omits_it(self):
+ from workflow import Step, StepContext, Workflow
+ from workflow.runner import _DEFAULT_SELF_REVIEW_MAX_TURNS, _handle_self_review
+
+ wf = Workflow.model_validate(
+ {
+ "id": "coding/new-task-v1",
+ "version": "1.0.0",
+ "domain": "coding",
+ "prompt": {"template": "registry://prompt/x"},
+ "hydration": {"sources": ["task_description"]},
+ "agent_config": {"tier": "standard", "allowed_tools": ["Bash"]},
+ "steps": [{"kind": "run_agent"}, {"kind": "self_review"}],
+ "terminal_outcomes": {"primary": "pr_url"},
+ "status": "production",
+ }
+ )
+ step = next(s for s in wf.steps if s.kind == "self_review")
+ assert step.max_turns is None
+ ctx = StepContext(
+ workflow=wf,
+ config=_make_config(),
+ setup=_make_setup(),
+ agent_result=_make_agent_result(turns=2),
+ trajectory=MagicMock(),
+ progress=MagicMock(),
+ )
+ review = AgentResult(status="success", turns=1, num_turns=1)
+ with patch("self_review.run_self_review", return_value=review) as mock_review:
+ _handle_self_review(step, ctx)
+ assert mock_review.call_args.kwargs["max_turns"] == _DEFAULT_SELF_REVIEW_MAX_TURNS
+ # Step model unused beyond fixture; keep import referenced.
+ assert Step is not None
diff --git a/agent/workflows/coding/new-task-v1.yaml b/agent/workflows/coding/new-task-v1.yaml
index 7219fdf3..2454b007 100644
--- a/agent/workflows/coding/new-task-v1.yaml
+++ b/agent/workflows/coding/new-task-v1.yaml
@@ -42,6 +42,12 @@ steps:
- { kind: clone_repo, name: setup }
- { kind: hydrate_context, name: context }
- { kind: run_agent, name: implement }
+ # Optional pre-PR self-review: the agent critiques its own cumulative diff and
+ # fixes issues before the PR is opened, then posts a summary as a PR comment.
+ # Advisory (on_failure: continue) and fail-open — a review failure/skip never
+ # blocks PR creation. Remove this step to disable self-review for the workflow;
+ # tune the turn budget via max_turns (1-20, default 5).
+ - { kind: self_review, name: review, on_failure: continue, max_turns: 5 }
- { kind: verify_build, name: build, gate: regression_only }
- { kind: ensure_pr, name: open_pr, strategy: create }
terminal_outcomes:
diff --git a/agent/workflows/schema/workflow.schema.json b/agent/workflows/schema/workflow.schema.json
index b4b753bb..f68d867b 100644
--- a/agent/workflows/schema/workflow.schema.json
+++ b/agent/workflows/schema/workflow.schema.json
@@ -187,7 +187,7 @@
"kind": {
"type": "string",
"description": "Dispatches to a registered step handler.",
- "enum": ["clone_repo", "hydrate_context", "run_agent", "verify_build", "verify_lint", "ensure_pr", "post_review", "deliver_artifact"]
+ "enum": ["clone_repo", "hydrate_context", "run_agent", "verify_build", "verify_lint", "self_review", "ensure_pr", "post_review", "deliver_artifact"]
},
"name": {
"type": "string",
@@ -213,6 +213,12 @@
"type": "string",
"description": "For deliver_artifact: the name of a registered Python deliverer (workflow/deliverers.py / DELIVERERS) to route the produced output through. Open string, not a closed enum, so a new delivery method is a registered deliverer, not a schema change (ADR-014 addendum 2026-06-08). First-party names: s3, comment, s3_and_comment. Each deliverer declares the terminal outcomes it produces; validator rule 11 consults that registry.",
"minLength": 1
+ },
+ "max_turns": {
+ "type": "integer",
+ "description": "For self_review: cap on agent turns the self-review loop may use (drawn from the task's remaining turn/budget allowance). Defaults to 5 when omitted.",
+ "minimum": 1,
+ "maximum": 20
}
}
}
@@ -295,7 +301,7 @@
"steps": {
"items": {
"properties": {
- "kind": { "not": { "enum": ["clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint"] } }
+ "kind": { "not": { "enum": ["clone_repo", "ensure_pr", "post_review", "verify_build", "verify_lint", "self_review"] } }
}
}
},
diff --git a/contracts/workflow-validation/rule3-repo-less-has-self-review.json b/contracts/workflow-validation/rule3-repo-less-has-self-review.json
new file mode 100644
index 00000000..42378e53
--- /dev/null
+++ b/contracts/workflow-validation/rule3-repo-less-has-self-review.json
@@ -0,0 +1,69 @@
+{
+ "name": "rule3-repo-less-has-self-review",
+ "description": "Repo-less knowledge workflow illegally declares a self_review step. self_review is repo-only (it diffs and re-runs the agent against a cloned tree), so both the schema's requires_repo:false conditional and cross-field rule 3 reject it.",
+ "workflow": {
+ "id": "knowledge/web-research-v1",
+ "version": "1.0.0",
+ "domain": "knowledge",
+ "requires_repo": false,
+ "read_only": false,
+ "prompt": {
+ "template": "registry://prompt/web-research-workflow"
+ },
+ "hydration": {
+ "sources": [
+ "task_description",
+ "urls"
+ ]
+ },
+ "agent_config": {
+ "tier": "elevated",
+ "allowed_tools": [
+ "Read",
+ "WebFetch"
+ ],
+ "cedar_policy_modules": [
+ "builtin/hard_deny",
+ "builtin/soft_deny"
+ ]
+ },
+ "repo_config": {
+ "discover": false
+ },
+ "required_inputs": {
+ "all_of": [
+ "task_description"
+ ]
+ },
+ "steps": [
+ {
+ "kind": "hydrate_context",
+ "name": "context"
+ },
+ {
+ "kind": "run_agent",
+ "name": "research"
+ },
+ {
+ "kind": "self_review",
+ "name": "review"
+ },
+ {
+ "kind": "deliver_artifact",
+ "name": "deliver",
+ "target": "s3_and_comment"
+ }
+ ],
+ "terminal_outcomes": {
+ "primary": "artifact"
+ },
+ "status": "production"
+ },
+ "expected": {
+ "valid": false,
+ "violations": [
+ "rule-3",
+ "schema"
+ ]
+ }
+}
diff --git a/contracts/workflow-validation/valid-coding-self-review-step.json b/contracts/workflow-validation/valid-coding-self-review-step.json
new file mode 100644
index 00000000..3dccb547
--- /dev/null
+++ b/contracts/workflow-validation/valid-coding-self-review-step.json
@@ -0,0 +1,94 @@
+{
+ "name": "valid-coding-self-review-step",
+ "description": "Coding workflow with an optional pre-PR self_review step (advisory, on_failure: continue). self_review is repo-only and non-side-effecting, so on_failure: continue is permitted (unlike ensure_pr).",
+ "workflow": {
+ "id": "coding/new-task-v1",
+ "version": "1.0.0",
+ "domain": "coding",
+ "description": "Implement a GitHub issue, self-review, and open a PR.",
+ "requires_repo": true,
+ "read_only": false,
+ "prompt": {
+ "template": "registry://prompt/coding-new-task-workflow"
+ },
+ "hydration": {
+ "sources": [
+ "issue",
+ "memory",
+ "task_description"
+ ]
+ },
+ "agent_config": {
+ "tier": "standard",
+ "allowed_tools": [
+ "Bash",
+ "Read",
+ "Write",
+ "Edit",
+ "Glob",
+ "Grep",
+ "WebFetch"
+ ],
+ "cedar_policy_modules": [
+ "builtin/hard_deny",
+ "builtin/soft_deny"
+ ]
+ },
+ "repo_config": {
+ "provider": "github",
+ "discover": true
+ },
+ "required_inputs": {
+ "one_of": [
+ "issue_number",
+ "task_description"
+ ]
+ },
+ "steps": [
+ {
+ "kind": "clone_repo",
+ "name": "setup"
+ },
+ {
+ "kind": "hydrate_context",
+ "name": "context"
+ },
+ {
+ "kind": "run_agent",
+ "name": "implement"
+ },
+ {
+ "kind": "self_review",
+ "name": "review",
+ "on_failure": "continue",
+ "max_turns": 5
+ },
+ {
+ "kind": "verify_build",
+ "name": "build",
+ "gate": "regression_only"
+ },
+ {
+ "kind": "ensure_pr",
+ "name": "open_pr",
+ "strategy": "create"
+ }
+ ],
+ "terminal_outcomes": {
+ "primary": "pr_url"
+ },
+ "limits": {
+ "max_turns": 100
+ },
+ "promotion_gate": {
+ "requires": [
+ "tests:agent/new_task"
+ ]
+ },
+ "status": "production"
+ },
+ "expected": {
+ "valid": true,
+ "violations": []
+ }
+}