Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions agent/src/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
_extract_agent_notes,
ensure_committed,
ensure_pr,
post_self_review_comment,
verify_build,
verify_lint,
)
Expand Down Expand Up @@ -217,6 +218,50 @@ def _execute_agent_step(
return ctx.agent_result


def _execute_self_review_step(
    workflow: Workflow | None,
    config,
    setup,
    agent_result,
    hydrated,
    trajectory,
    progress,
) -> bool:
    """Run the workflow's ``self_review`` step through the runner, if one exists.

    Same pattern as ``_execute_agent_step``: the runner is invoked with
    ``only_kinds={"self_review"}`` so that only this step is dispatched and
    clone / build / PR handling stay on the inline path. ``agent_result`` is
    handed to the step context as a shared mutable model; the step's handler
    is expected to add the review loop's turns/cost onto it so the terminal
    result covers implement + review.

    Args:
        workflow: The resolved workflow, or ``None`` when it failed to load.
        config: Task configuration, forwarded to the step context.
        setup: Repository setup information, forwarded to the step context.
        agent_result: Result of the implement step, forwarded so the handler
            can size the review's turn budget and accumulate onto it.
        hydrated: Hydrated task context, forwarded to the step context.
        trajectory: Trajectory recorder, forwarded to the step context.
        progress: Progress writer, forwarded to the step context.

    Returns:
        True when the handler flagged ``self_review_ran`` in the context
        artifacts (the caller then posts the summary PR comment after
        ``ensure_pr``). False when ``workflow`` is ``None``, when no step of
        kind ``self_review`` is declared, or when the review was skipped
        (read-only / empty diff / no remaining turns). The step is meant to be
        fail-open: a review failure is recorded as a step outcome by the
        runner and must not block PR creation.
    """
    if workflow is None:
        return False

    # Equality comparison (not hashing) on purpose, mirroring how step kinds
    # are matched elsewhere; stop at the first declared self_review step.
    declares_self_review = False
    for step in workflow.steps:
        if step.kind == "self_review":
            declares_self_review = True
            break
    if not declares_self_review:
        return False

    # Deferred import: only needed once we know there is a step to run.
    from workflow import StepContext, run_workflow

    review_ctx = StepContext(
        workflow=workflow,
        config=config,
        hydrated=hydrated,
        progress=progress,
        trajectory=trajectory,
        setup=setup,
        # Implement-step result: lets the handler budget the review's turns
        # and fold its turns/cost back into the same object.
        agent_result=agent_result,
    )

    with task_span("task.self_review"):
        run_workflow(workflow, review_ctx, only_kinds={"self_review"})

    # The handler reports whether the review actually executed via artifacts.
    ran_flag = review_ctx.artifacts.get("self_review_ran", False)
    return bool(ran_flag)


def _run_repoless_task(
*,
config,
Expand Down Expand Up @@ -1006,6 +1051,23 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
)
ensure_pr_strategy = "create"

# Self-review step: if the resolved workflow declares a ``self_review``
# step, drive it through the workflow runner (same pattern as
# ``_execute_agent_step``). The step has the LLM critique its own diff
# and fix issues, accumulating its turns/cost onto ``agent_result``.
# Runs AFTER the cancel short-circuit so a cancelled task never starts
# a second agent loop, and BEFORE post-hooks so fixes land in the PR.
# Fail-open: a review failure/skip never blocks PR creation.
self_review_ran = _execute_self_review_step(
_workflow,
config,
setup,
agent_result,
hc,
trajectory,
progress,
)

# Post-hooks (agent_result is guaranteed set by the try/except above)
with task_span("task.post_hooks") as post_span:
# Safety net: commit any uncommitted tracked changes (skip for read-only tasks)
Expand All @@ -1028,6 +1090,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
if pr_url:
progress.write_agent_milestone("pr_created", pr_url)

# Post self-review summary as PR comment (if the self_review step ran)
if pr_url and self_review_ran:
post_self_review_comment(setup.repo_dir, pr_url, config)

# Memory write — capture task episode and repo learnings
memory_written = False
effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "")
Expand Down
61 changes: 61 additions & 0 deletions agent/src/post_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,67 @@ def ensure_pr(
return None


def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool:
    """Post the self-review summary as a PR comment.

    Reads the summary produced by the self-review agent via
    ``read_self_review_summary(repo_dir)``, prefixes it with a heading, and
    posts it with ``gh pr comment``.

    Fail-open: every failure mode (helper import error, unreadable summary,
    unparsable PR URL, ``gh`` missing / timing out / exiting non-zero, or an
    argument or output that ``subprocess`` cannot handle) is logged and
    reported as ``False``; nothing is raised to the caller.

    Args:
        repo_dir: Working directory of the checked-out repository; used both
            to locate the summary and as ``cwd`` for ``gh``.
        pr_url: URL of the pull request; must contain ``/pull/<number>``.
        config: Task configuration; ``config.repo_url`` is passed to
            ``gh --repo``.

    Returns:
        True if ``gh pr comment`` exited with status 0, False otherwise.
    """
    try:
        # Lazy import kept inside the guard so that even a failure to import
        # the helper is logged instead of escaping this fail-open hook.
        from self_review import read_self_review_summary

        summary = read_self_review_summary(repo_dir)
    except Exception as e:
        log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}")
        return False

    if not summary:
        log("POST", "post_self_review_comment: no summary file found — skipping")
        return False

    # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123)
    match = re.search(r"/pull/(\d+)", pr_url)
    if not match:
        log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}")
        return False
    pr_number = match.group(1)

    comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}"

    try:
        result = subprocess.run(
            [
                "gh",
                "pr",
                "comment",
                pr_number,
                "--repo",
                config.repo_url,
                "--body",
                comment_body,
            ],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError, ValueError) as e:
        # ValueError covers an embedded NUL byte in the agent-written body
        # (rejected when building argv) and UnicodeDecodeError raised while
        # decoding gh's output under text=True.
        log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}")
        return False

    if result.returncode == 0:
        log("POST", f"Self-review summary posted as comment on PR #{pr_number}")
        return True

    # Cap stderr so a noisy gh failure cannot flood the log.
    stderr = result.stderr.strip()[:200] if result.stderr else ""
    log(
        "WARN",
        f"post_self_review_comment: gh pr comment failed (rc={result.returncode}): {stderr}",
    )
    return False


def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None:
"""Extract the "## Agent notes" section from the PR body.

Expand Down
1 change: 1 addition & 0 deletions agent/src/prompts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .new_task import NEW_TASK_WORKFLOW
from .pr_iteration import PR_ITERATION_WORKFLOW
from .pr_review import PR_REVIEW_WORKFLOW
from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT
from .web_research import WEB_RESEARCH_PROMPT

DEFAULT_WORKFLOW_ID = "coding/new-task-v1"
Expand Down
61 changes: 61 additions & 0 deletions agent/src/prompts/self_review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Self-review prompt template for pre-PR diff critique."""

# Prompt text sent to the agent for the self-review pass. It carries two named
# placeholders, ``{diff}`` and ``{task_description}``.
# NOTE(review): presumably rendered with ``str.format`` by the self-review
# step — confirm against the caller; if so, any literal brace added to this
# text must be doubled.
# A trailing backslash inside the literal suppresses the newline, so each
# wrapped paragraph or list item below reaches the model as a single line.
SELF_REVIEW_PROMPT = """\
You are reviewing your own work before it becomes a pull request. Below is the \
cumulative diff of all changes on this branch compared to the base branch.

<diff>
{diff}
</diff>

## Task context

{task_description}

## Review checklist

Examine the diff carefully for:

1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \
incorrect assumptions about data shapes or API contracts.
2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \
race conditions.
3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \
insecure defaults, OWASP Top 10 issues.
4. **Style & consistency** — Naming conventions, code style violations relative to \
the surrounding codebase, unnecessary complexity.
5. **Test gaps** — Important behaviour that is untested, assertions that don't \
verify the right thing, missing edge-case coverage.

## Instructions

- If you find issues, fix them directly: edit the files, run the build/tests to \
verify your fixes, and commit the changes.
- If no issues are found, stop immediately — do not make changes for the sake of \
making changes.
- Do NOT refactor code that was not part of the original diff unless it has a \
concrete bug or security issue.
- Keep fixes minimal and focused. Each fix should be a separate commit with a \
clear message.

## Summary output

After completing your review (whether you made fixes or not), write a file \
`.self-review-summary.md` in the repository root with your findings in this format:

```markdown
### Self-Review Summary

**Findings:** <number of issues found>
**Fixes applied:** <number of fixes committed>

#### Issues found

- <category>: <brief description of issue> — <fixed | not fixed (reason)>
```

If no issues were found, write the file with: "No issues found — code looks good."

This file is a pipeline artifact and will be deleted automatically — it will NOT \
appear in the pull request.
"""
Loading
Loading