From 874fc6255c59febf95d16b859de38e67f297a9ca Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:38 -0400
Subject: [PATCH 01/15] =?UTF-8?q?feat(cdk):=20integ-tests=20Phase=201=20?=
 =?UTF-8?q?=E2=80=94=20core=20lifecycle=20E2E=20(#317)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add integ.task-lifecycle.ts: deploys the full AgentStack (orchestrator +
AgentCore runtime + agent container) as backgroundagent-integ-lifecycle and
drives a real agent through the four Cedar HITL terminal paths — COMPLETED
(repo-less default/agent-v1), FAILED (coding/new-task-v1 against a nonexistent
repo), and AWAITING_APPROVAL -> approve/deny (config.env write tripping the
write_env_files soft-deny gate). Cost is bounded via low max_turns +
max_budget_usd; teardown is forced on success and failure.

Gate scenarios 3 & 4 read their sandbox repo + pre-seeded PAT secret from two
top-of-file constants (placeholder until provisioned); the PAT is copied into
the stack-created GitHubTokenSecret via a get/putSecretValue assertion, matching
the documented operator flow (QUICK_START §4) with no agent.ts change.

integ.yml: raise the integ job timeout 45->90m for the heavier full-stack
deploy, extend the teardown safety-net to sweep both integ stacks, and add an
always() sandbox bgagent/* branch-cleanup step gated on INTEG_SANDBOX_REPO /
INTEG_PAT_SECRET_ID repo vars (no-op until set).

Update the ROADMAP "Deployed runtime E2E verification" row (+ Starlight mirror).
---
 .github/workflows/integ.yml              |  66 +++-
 cdk/test/integ/integ.task-lifecycle.ts   | 371 +++++++++++++++++++++++
 docs/guides/ROADMAP.md                   |   2 +-
 docs/src/content/docs/roadmap/Roadmap.md |   2 +-
 4 files changed, 428 insertions(+), 13 deletions(-)
 create mode 100644 cdk/test/integ/integ.task-lifecycle.ts
diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index 4ca180b7..44dc363f 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -175,7 +175,12 @@ jobs:
     name: CDK integ smoke (Task API)
     runs-on: ubuntu-latest
     environment: integ
-    timeout-minutes: 45
+    # The lifecycle test (integ.task-lifecycle.ts) deploys the full AgentStack
+    # (orchestrator + AgentCore runtime + Docker image build) and drives real
+    # agent runs through their terminal states before destroying — far heavier
+    # than the Phase-0 trimmed smoke test. 90 min covers deploy + cold Docker
+    # build + agent runs + teardown with margin.
+    timeout-minutes: 90
     permissions:
       id-token: write
       contents: read
@@ -218,26 +223,65 @@ jobs:
         run: mise //cdk:integ
 
       # Safety net: integ-runner forces teardown on success and failure, but if
-      # the run is cancelled or crashes mid-deploy the stack can be stranded in
-      # the shared account. Delete it directly via CloudFormation so we never
+      # the run is cancelled or crashes mid-deploy a stack can be stranded in
+      # the shared account. Delete each integ stack directly via CloudFormation so we never
       # leak billable resources.
       #
-      # NOTE: `cdk destroy backgroundagent-integ` would NOT work here — it
-      # synthesizes the main app (src/main.ts), which does not contain the integ
-      # stack, so it exits 0 having deleted nothing. Target the stack by its
-      # literal CloudFormation name instead. delete-stack is idempotent (no-op if
+      # NOTE: `cdk destroy <stack>` would NOT work here — it synthesizes the
+      # main app (src/main.ts), which does not contain the integ stacks, so it
+      # exits 0 having deleted nothing. Target each stack by its literal
+      # CloudFormation name instead. delete-stack is idempotent (no-op if
       # already gone), so `|| true` only guards transient API errors.
-      - name: Ensure stack torn down
+      #
+      # Both integ stacks are swept: backgroundagent-integ (Phase-0 smoke) and
+      # backgroundagent-integ-lifecycle (Phase-1 full-stack lifecycle).
+      - name: Ensure stacks torn down
         if: always()
         env:
           AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
           AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
         run: |
           set -euo pipefail
-          aws cloudformation delete-stack --stack-name backgroundagent-integ || true
-          # No `|| true` on the wait: a DELETE_FAILED must surface loudly so we
+          for stack in backgroundagent-integ backgroundagent-integ-lifecycle; do
+            aws cloudformation delete-stack --stack-name "$stack" || true
+          done
+          # No `|| true` on the waits: a DELETE_FAILED must surface loudly so we
           # never silently leak billable resources in the shared account.
-          aws cloudformation wait stack-delete-complete --stack-name backgroundagent-integ
+          for stack in backgroundagent-integ backgroundagent-integ-lifecycle; do
+            aws cloudformation wait stack-delete-complete --stack-name "$stack"
+          done
+
+      # Sandbox cleanup for the gate scenarios (3 & 4): coding/new-task-v1 pushes
+      # a `bgagent/<task_id>/<slug>` branch and (on approve) opens a PR on the
+      # sandbox repo. The agent never closes these, so each run would accumulate
+      # stale branches/PRs. Reconstructing the exact branch name in the test is
+      # fragile (it depends on the agent-side slug), so we sweep by prefix here:
+      # delete every `bgagent/*` branch on the sandbox, which also closes the
+      # associated PRs. Reads the same PAT the agent used, from the pre-seeded
+      # secret. Gated on the repo vars being set so this is a no-op until the
+      # sandbox + secret are provisioned. Never fails the job — best-effort.
+      - name: Clean up sandbox PRs/branches
+        if: always() && vars.INTEG_SANDBOX_REPO != '' && vars.INTEG_PAT_SECRET_ID != ''
+        env:
+          AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+          AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+          SANDBOX_REPO: ${{ vars.INTEG_SANDBOX_REPO }}
+          PAT_SECRET_ID: ${{ vars.INTEG_PAT_SECRET_ID }}
+        run: |
+          set -euo pipefail
+          GH_TOKEN="$(aws secretsmanager get-secret-value --secret-id "$PAT_SECRET_ID" \
+            --query SecretString --output text)" \
+            || { echo "::warning::Cannot read PAT secret; skipping sandbox cleanup"; exit 0; }
+          export GH_TOKEN
+          # Best-effort throughout: an unreadable PAT (guard above) or a failed delete
+          # never fails the job. --paginate follows any further pages of bgagent/* refs.
+          gh api --paginate "repos/${SANDBOX_REPO}/git/matching-refs/heads/bgagent/" \
+            --jq '.[].ref | sub("^refs/heads/"; "")' 2>/dev/null \
+          | while read -r branch; do
+              [ -n "$branch" ] || continue
+              echo "Deleting sandbox branch: $branch"
+              gh api -X DELETE "repos/${SANDBOX_REPO}/git/refs/heads/${branch}" || true
+            done || true
 
   # Post the final integ-smoke status back to the PR head so the check flips from
   # pending to success/failure. Skipped for workflow_dispatch (no PR to gate).
diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
new file mode 100644
index 00000000..01b36719
--- /dev/null
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -0,0 +1,371 @@
+/**
+ *  MIT No Attribution
+ *
+ *  Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy of
+ *  the Software without restriction, including without limitation the rights to
+ *  use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ *  the Software, and to permit persons to whom the Software is furnished to do so.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *  SOFTWARE.
+ */
+
+/*
+ * Phase-1 deploy-then-verify lifecycle test for issue #317.
+ *
+ * Where Phase 0 (integ.task-api-smoke.ts) deployed a TRIMMED stack and asserted
+ * a task merely persists at SUBMITTED, Phase 1 deploys the REAL, full AgentStack
+ * (orchestrator + AgentCore runtime/memory + agent container) and drives a live
+ * agent through its lifecycle, asserting the four terminal paths from the Cedar
+ * HITL E2E matrix (docs/design/CEDAR_HITL_GATES.md §15.3):
+ *
+ *   1. submit -> run -> COMPLETED                       (repo-less default/agent-v1)
+ *   2. submit -> run -> FAILED                          (coding/new-task-v1, bad repo)
+ *   3. submit -> run -> AWAITING_APPROVAL -> approve    (write_env_files soft-deny gate)
+ *   4. submit -> run -> AWAITING_APPROVAL -> deny       (write_env_files soft-deny gate)
+ *
+ * This runs in the DEDICATED E2E account (<integ-account>), which has no
+ * backgroundagent-dev stack — so the AgentCore account-unique runtime/memory
+ * name collision that forced Phase 0 to trim DOES NOT apply here. We deploy the
+ * committed AgentStack unchanged: it leaves runtimeName/memoryName UNSET, and
+ * CDK auto-generates names that include the stack name (backgroundagent-integ-
+ * lifecycle), guaranteeing uniqueness. (A local developer's uncommitted agent.ts
+ * pin must be stashed before a local `mise //cdk:integ`, or it would collide.)
+ *
+ * Determinism: there is no mock/scripted agent mode — every scenario runs the
+ * real `claude` CLI against Bedrock. We bound cost and wall-clock with low
+ * max_turns and a max_budget_usd cap, and steer terminal states with simple,
+ * purpose-built task descriptions.
+ */
+
+import { randomBytes } from 'node:crypto';
+import { ExpectedResult, IntegTest } from '@aws-cdk/integ-tests-alpha';
+import { App, type CfnOutput, Duration } from 'aws-cdk-lib';
+import { TaskStatus } from '../../src/constructs/task-status';
+import { AgentStack } from '../../src/stacks/agent';
+
+const app = new App();
+
+// The real, full production stack. Environment-agnostic on purpose (same
+// rationale as Phase 0): an explicit env would force the IntegTest DeployAssert
+// stack — always environment-agnostic — into cross-region references it cannot
+// resolve when reading this stack's outputs in the assertions below.
+//
+// DO NOT set runtimeName/memoryName here or pin them in agent.ts for this
+// deploy: the committed defaults auto-generate stack-name-scoped unique names,
+// which is exactly what lets backgroundagent-integ-lifecycle stand alone.
+const stack = new AgentStack(app, 'backgroundagent-integ-lifecycle', {
+  description: 'ABCA Phase-1 integ lifecycle stack (full AgentStack: orchestrator + agent runtime)',
+});
+
+// AgentStack exposes its API URL, Cognito IDs, and table names only as
+// CfnOutputs (its constructs are private consts). Read the output tokens by
+// construct id rather than adding public accessors to the production stack.
+// CfnOutput exposes a `value` getter that returns the underlying token.
+const output = (id: string): string => (stack.node.findChild(id) as CfnOutput).value;
+
+const apiUrl = output('ApiUrl');
+const userPoolId = output('UserPoolId');
+const appClientId = output('AppClientId');
+const taskTableName = output('TaskTableName');
+const taskApprovalsTableName = output('TaskApprovalsTableName');
+// AgentStack creates its OWN empty GitHubTokenSecret (agent.ts:181,
+// RemovalPolicy.DESTROY) — it does not reference an external one. The gate
+// scenarios populate it post-deploy from the pre-seeded secret below, which is
+// exactly the documented operator flow (docs/guides/QUICK_START.md §4: read the
+// GitHubTokenSecretArn output, put-secret-value the PAT into it). Automating
+// that copy here keeps us aligned with the design (no agent.ts change) and the
+// throwaway secret tears down with the stack.
+const githubTokenSecretArn = output('GitHubTokenSecretArn');
+
+// --- Gate-scenario configuration (scenarios 3 & 4) ----------------------------
+// These two constants are the ONLY out-of-band wiring the gate scenarios need.
+// They point at resources an admin provisions once in the E2E account
+// (<integ-account>); scenarios 1 & 2 do NOT depend on them and run regardless.
+//
+//   SANDBOX_REPO  — a throwaway GitHub repo (owner/name) with a committed
+//                   baseline (README + default branch). coding/new-task-v1
+//                   clones it, the agent attempts a `config.env` write that
+//                   trips the write_env_files soft-deny gate, and (on approve)
+//                   pushes a `bgagent/<task_id>/<slug>` branch + opens a PR. The
+//                   CI `always()` cleanup step deletes those branches each run.
+//   PRESEEDED_PAT_SECRET — name (or ARN) of a STABLE Secrets Manager secret in
+//                   the E2E account holding a fine-grained PAT scoped to
+//                   SANDBOX_REPO. Copied into the stack-created GitHubTokenSecret
+//                   by the token-seeding assertion below.
+//
+// Until these hold real values the gate submits will FAIL at clone/preflight
+// (like scenario 2) rather than reaching AWAITING_APPROVAL — so flip them to the
+// provisioned repo/secret before relying on scenarios 3 & 4.
+const SANDBOX_REPO = 'PLACEHOLDER-ORG/abca-integ-sandbox';
+const PRESEEDED_PAT_SECRET = 'bgagent/integ/github-pat';
+
+const integ = new IntegTest(app, 'TaskLifecycle', {
+  testCases: [stack],
+  // Force teardown on success and failure so a failed assertion never strands
+  // the (expensive) full stack in the shared E2E account. The CI workflow keeps
+  // a CloudFormation delete-stack safety net on top of this.
+  cdkCommandOptions: {
+    destroy: { args: { force: true } },
+  },
+});
+
+// --- Authentication (same pattern as Phase 0) ---------------------------------
+// A throwaway user the assertions authenticate as. The pool disables self-signup,
+// so create + confirm it administratively, then mint a token via USER_PASSWORD_AUTH.
+// The password is generated per-synth (no credential-shaped literal in source) and
+// satisfies the Cognito default policy by construction.
+const username = 'integ-lifecycle@example.com';
+const password = `Aa1!${randomBytes(18).toString('base64url')}`;
+
+// Service name MUST be the AWS SDK v2 form 'CognitoIdentityServiceProvider' — the
+// assertion provider maps only the v2 key to the real client package (see the
+// long note in integ.task-api-smoke.ts).
+const cognitoService = 'CognitoIdentityServiceProvider';
+
+const createUser = integ.assertions.awsApiCall(cognitoService, 'adminCreateUser', {
+  UserPoolId: userPoolId,
+  Username: username,
+  MessageAction: 'SUPPRESS',
+  TemporaryPassword: password,
+});
+
+const setPassword = integ.assertions.awsApiCall(cognitoService, 'adminSetUserPassword', {
+  UserPoolId: userPoolId,
+  Username: username,
+  Password: password,
+  Permanent: true,
+});
+
+const auth = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
+  AuthFlow: 'USER_PASSWORD_AUTH',
+  ClientId: appClientId,
+  AuthParameters: { USERNAME: username, PASSWORD: password },
+});
+
+const idToken = auth.getAttString('AuthenticationResult.IdToken');
+
+// Conservative polling windows. Agent runs are real LLM sessions over a freshly
+// cold-started AgentCore runtime; the first invocation pays the cold-start tax.
+const TERMINAL_POLL = { totalTimeout: Duration.minutes(12), interval: Duration.seconds(30) };
+// The interim AWAITING_APPROVAL state appears mid-run, before terminal — poll it
+// on a shorter window so a stuck gate fails fast instead of burning the full
+// terminal budget waiting for a state that will never arrive.
+const GATE_POLL = { totalTimeout: Duration.minutes(8), interval: Duration.seconds(15) };
+
+// --- Scenario 1: COMPLETED (repo-less default/agent-v1) -----------------------
+// The default workflow is read-only (Read/Glob/Grep/WebFetch), requires no repo,
+// and delivers an artifact to S3. A trivial, self-contained instruction completes
+// in a single turn. No GitHub repo or token is involved.
+const submitComplete = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
+  method: 'POST',
+  headers: {
+    'Content-Type': 'application/json',
+    'Authorization': idToken,
+  },
+  body: JSON.stringify({
+    workflow_ref: 'default/agent-v1',
+    task_description: 'Reply with exactly the single word: done. Do not use any tools.',
+    max_turns: 2,
+    max_budget_usd: 0.5,
+  }),
+});
+
+// Poll the task row until it reaches COMPLETED. No getAttString is read off this
+// call, so flattenResponse stays false and the nested objectLike expect works.
+const pollComplete = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+  TableName: taskTableName,
+  Key: { task_id: { S: submitComplete.getAttString('body.data.task_id') } },
+});
+pollComplete
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.COMPLETED } } }))
+  .waitForAssertions(TERMINAL_POLL);
+
+// --- Scenario 2: FAILED (coding/new-task-v1 against a nonexistent repo) --------
+// The coding workflow requires a repo and clones it. Pointing it at a repo that
+// does not exist makes preflight/clone fail fast, so the orchestrator writes a
+// terminal FAILED with an error_message — no agent turn, no runtime spin-up, and
+// no valid GitHub token required.
+const submitFail = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
+  method: 'POST',
+  headers: {
+    'Content-Type': 'application/json',
+    'Authorization': idToken,
+  },
+  body: JSON.stringify({
+    workflow_ref: 'coding/new-task-v1',
+    repo: `abca-integ-nonexistent/does-not-exist-${randomBytes(6).toString('hex')}`,
+    task_description: 'This task targets a nonexistent repo and must fail at clone/preflight.',
+    max_turns: 1,
+    max_budget_usd: 0.5,
+  }),
+});
+
+const pollFail = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+  TableName: taskTableName,
+  Key: { task_id: { S: submitFail.getAttString('body.data.task_id') } },
+});
+pollFail
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.FAILED } } }))
+  .waitForAssertions(TERMINAL_POLL);
+
+// --- Token seeding (prerequisite for gate scenarios) --------------------------
+// Copy the pre-seeded PAT into the stack-created GitHubTokenSecret so the agent
+// runtime can clone SANDBOX_REPO and push a branch. This automates the documented
+// operator step (QUICK_START.md §4). No getAttString is read off seedPut, and the
+// SecretString token is consumed inline by seedPut, never asserted on.
+const seedGet = integ.assertions.awsApiCall('SecretsManager', 'getSecretValue', {
+  SecretId: PRESEEDED_PAT_SECRET,
+});
+
+const seedPut = integ.assertions.awsApiCall('SecretsManager', 'putSecretValue', {
+  SecretId: githubTokenSecretArn,
+  SecretString: seedGet.getAttString('SecretString'),
+});
+
+// --- Scenario 3: AWAITING_APPROVAL -> approve ---------------------------------
+// coding/new-task-v1 against the sandbox. The task asks the agent to write a
+// `config.env` file, which the Write tool routes through the write_env_files
+// soft-deny rule (agent/policies/soft_deny.cedar) -> the task parks at
+// AWAITING_APPROVAL with a PENDING approval row. We approve it, then assert the
+// row flips to APPROVED. (Post-approval the agent may COMPLETE or FAIL — both
+// terminal — so the deterministic assertion is the recorded decision, not a
+// specific terminal status.)
+const submitApprove = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
+  method: 'POST',
+  headers: {
+    'Content-Type': 'application/json',
+    'Authorization': idToken,
+  },
+  body: JSON.stringify({
+    workflow_ref: 'coding/new-task-v1',
+    repo: SANDBOX_REPO,
+    task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.',
+    max_turns: 6,
+    max_budget_usd: 0.5,
+  }),
+});
+const approveTaskId = submitApprove.getAttString('body.data.task_id');
+
+// Wait for the gate to open (interim AWAITING_APPROVAL).
+const pollGateApprove = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+  TableName: taskTableName,
+  Key: { task_id: { S: approveTaskId } },
+});
+pollGateApprove
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
+  .waitForAssertions(GATE_POLL);
+
+// Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is
+// required because we do not know the agent-minted request_id. getAttString here
+// flips this call to a flattened response, so we do NOT .expect() on it — the
+// decision assertion below uses a separate getItem.
+const queryApprove = integ.assertions.awsApiCall('DynamoDB', 'query', {
+  TableName: taskApprovalsTableName,
+  KeyConditionExpression: 'task_id = :tid',
+  ExpressionAttributeValues: { ':tid': { S: approveTaskId } },
+});
+const approveRequestId = queryApprove.getAttString('Items.0.request_id.S');
+
+const approve = integ.assertions.httpApiCall(`${apiUrl}tasks/${approveTaskId}/approve`, {
+  method: 'POST',
+  headers: {
+    'Content-Type': 'application/json',
+    'Authorization': idToken,
+  },
+  body: JSON.stringify({ request_id: approveRequestId, decision: 'approve', scope: 'this_call' }),
+});
+
+// Assert the decision was recorded on the approval row. Now that request_id is
+// known we read the exact row by its full key.
+const pollApproveDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+  TableName: taskApprovalsTableName,
+  Key: { task_id: { S: approveTaskId }, request_id: { S: approveRequestId } },
+});
+pollApproveDecision
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'APPROVED' } } }))
+  .waitForAssertions(GATE_POLL);
+
+// --- Scenario 4: AWAITING_APPROVAL -> deny ------------------------------------
+// Identical trigger to scenario 3; we deny instead and assert the row flips to
+// DENIED.
+const submitDeny = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
+  method: 'POST',
+  headers: {
+    'Content-Type': 'application/json',
+    'Authorization': idToken,
+  },
+  body: JSON.stringify({
+    workflow_ref: 'coding/new-task-v1',
+    repo: SANDBOX_REPO,
+    task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.',
+    max_turns: 6,
+    max_budget_usd: 0.5,
+  }),
+});
+const denyTaskId = submitDeny.getAttString('body.data.task_id');
+
+const pollGateDeny = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+  TableName: taskTableName,
+  Key: { task_id: { S: denyTaskId } },
+});
+pollGateDeny
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
+  .waitForAssertions(GATE_POLL);
+
+const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', {
+  TableName: taskApprovalsTableName,
+  KeyConditionExpression: 'task_id = :tid',
+  ExpressionAttributeValues: { ':tid': { S: denyTaskId } },
+});
+const denyRequestId = queryDeny.getAttString('Items.0.request_id.S');
+
+const deny = integ.assertions.httpApiCall(`${apiUrl}tasks/${denyTaskId}/deny`, {
+  method: 'POST',
+  headers: {
+    'Content-Type': 'application/json',
+    'Authorization': idToken,
+  },
+  body: JSON.stringify({ request_id: denyRequestId, decision: 'deny', reason: 'integ: exercising the deny path' }),
+});
+
+const pollDenyDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+  TableName: taskApprovalsTableName,
+  Key: { task_id: { S: denyTaskId }, request_id: { S: denyRequestId } },
+});
+pollDenyDecision
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'DENIED' } } }))
+  .waitForAssertions(GATE_POLL);
+
+// --- Execution order ----------------------------------------------------------
+// Auth first, then the two no-repo scenarios (submit both, then wait so their
+// agent runs proceed concurrently). Next seed the GitHub token, then submit both
+// gate tasks so they spin up concurrently and park at their gates; finally drive
+// approve then deny. The approve/deny flows are sequential because each
+// approve/deny POST needs the request_id read from the parked task's approval row.
+createUser
+  .next(setPassword)
+  .next(auth)
+  .next(submitComplete)
+  .next(submitFail)
+  .next(pollComplete)
+  .next(pollFail)
+  .next(seedGet)
+  .next(seedPut)
+  .next(submitApprove)
+  .next(submitDeny)
+  .next(pollGateApprove)
+  .next(queryApprove)
+  .next(approve)
+  .next(pollApproveDecision)
+  .next(pollGateDeny)
+  .next(queryDeny)
+  .next(deny)
+  .next(pollDenyDecision);
diff --git a/docs/guides/ROADMAP.md b/docs/guides/ROADMAP.md
index 2103180c..d34b4dc5 100644
--- a/docs/guides/ROADMAP.md
+++ b/docs/guides/ROADMAP.md
@@ -224,7 +224,7 @@ Planned capabilities, grouped by theme. Items are independent and may ship in an
 
 | Capability | Description |
 |------------|-------------|
-| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). Phase 1 (full lifecycle / real agent runs) and Phase 2 (channels) follow. See [ADR-013](../decisions/ADR-013-tiered-validation-pyramid.md). |
+| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). **Phase 1 landed ([#317](https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/317)):** a second test (`integ.task-lifecycle.ts`) deploys the *full* `AgentStack` (orchestrator + AgentCore runtime + agent container) to the dedicated E2E account and drives a real agent through the four terminal paths from the Cedar HITL matrix — `COMPLETED`, `FAILED`, and `AWAITING_APPROVAL` → approve/deny — capping cost with low `max_turns` + `max_budget_usd`. Phase 2 (channels) follows. See [ADR-013](../decisions/ADR-013-tiered-validation-pyramid.md). |
 | **Admission backlog observability** | Metric and alarm when `SUBMITTED` task depth exceeds an operator threshold (capacity and admission health). |
 | **Admission queue with deferred pickup** | When admission is at capacity, persist tasks in a durable queue instead of failing them. Automatically re-attempt admission and continue processing in FIFO order (with optional priority lanes) as concurrency becomes available. Preserve cancel/idempotency semantics and expose queue position/ETA in task status. |
 | **Safe orchestrator deploys** | Pre-deploy checks for active tasks (drain or warn); blue-green or canary Lambda deploy for the durable orchestrator with rollback on error regressions (`OBSERVABILITY.md`). |
diff --git a/docs/src/content/docs/roadmap/Roadmap.md b/docs/src/content/docs/roadmap/Roadmap.md
index 8fcaf97c..280bed6e 100644
--- a/docs/src/content/docs/roadmap/Roadmap.md
+++ b/docs/src/content/docs/roadmap/Roadmap.md
@@ -228,7 +228,7 @@ Planned capabilities, grouped by theme. Items are independent and may ship in an
 
 | Capability | Description |
 |------------|-------------|
-| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). Phase 1 (full lifecycle / real agent runs) and Phase 2 (channels) follow. See [ADR-013](/architecture/adr-013-tiered-validation-pyramid). |
+| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). **Phase 1 landed ([#317](https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/317)):** a second test (`integ.task-lifecycle.ts`) deploys the *full* `AgentStack` (orchestrator + AgentCore runtime + agent container) to the dedicated E2E account and drives a real agent through the four terminal paths from the Cedar HITL matrix — `COMPLETED`, `FAILED`, and `AWAITING_APPROVAL` → approve/deny — capping cost with low `max_turns` + `max_budget_usd`. Phase 2 (channels) follows. See [ADR-013](/architecture/adr-013-tiered-validation-pyramid). |
 | **Admission backlog observability** | Metric and alarm when `SUBMITTED` task depth exceeds an operator threshold (capacity and admission health). |
 | **Admission queue with deferred pickup** | When admission is at capacity, persist tasks in a durable queue instead of failing them. Automatically re-attempt admission and continue processing in FIFO order (with optional priority lanes) as concurrency becomes available. Preserve cancel/idempotency semantics and expose queue position/ETA in task status. |
 | **Safe orchestrator deploys** | Pre-deploy checks for active tasks (drain or warn); blue-green or canary Lambda deploy for the durable orchestrator with rollback on error regressions (`OBSERVABILITY.md`). |

From b59a09ad1ca72fabf54b356ecf0219ed9be79996 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:39 -0400
Subject: [PATCH 02/15] test(cdk): wire Phase 1 gate scenarios to the
 provisioned sandbox (#317)

Point SANDBOX_REPO at ayushtr-aws/abca-integ-sandbox; PRESEEDED_PAT_SECRET stays
bgagent/integ/github-pat (now created in the E2E account <integ-account>). CI repo
vars INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID set to match so the always()
branch-cleanup step activates.
---
 cdk/test/integ/integ.task-lifecycle.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index 01b36719..bf4a9082 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -104,7 +104,7 @@ const githubTokenSecretArn = output('GitHubTokenSecretArn');
 // Until these hold real values the gate submits will FAIL at clone/preflight
 // (like scenario 2) rather than reaching AWAITING_APPROVAL — so flip them to the
 // provisioned repo/secret before relying on scenarios 3 & 4.
-const SANDBOX_REPO = 'PLACEHOLDER-ORG/abca-integ-sandbox';
+const SANDBOX_REPO = 'ayushtr-aws/abca-integ-sandbox';
 const PRESEEDED_PAT_SECRET = 'bgagent/integ/github-pat';
 
 const integ = new IntegTest(app, 'TaskLifecycle', {

From 826da1f4050216e2fb52ba700d111efd69270689 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:39 -0400
Subject: [PATCH 03/15] fix(agent): exclude integ-runner output from the agent
 build context (#317)

The agent AgentRuntimeArtifact is built with the repo root as its asset/build
context (agent.ts), and integ-runner writes its synth + snapshot output UNDER
that root (cdk/test/integ/cdk-integ.out.<test>.ts[.snapshot]/). Staging the root
then copied its own output dir into itself recursively until the path overflowed
(ENAMETOOLONG), failing synth before any deploy.

The .dockerignore already excluded cdk/cdk.out/ for the same recursive-include
reason but missed the integ-runner dirs. Add them (mirrors .gitignore 70-71).
Verified with `integ-runner --dry-run`: synth + asset staging now succeed
(SUCCESS, 72s) where they previously errored.
---
 .dockerignore | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.dockerignore b/.dockerignore
index 9402c7e7..8e9cc5ab 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -9,6 +9,14 @@ cdk/cdk.out/
 cdk/lib/
 cdk/node_modules/
 
+# integ-runner output dirs. The agent artifact's build context is the repo
+# root, and integ-runner writes its synth/snapshot output UNDER that root
+# (cdk/test/integ/cdk-integ.out.<test>.ts[.snapshot]/). Without these excludes,
+# staging the root copies its own output dir into itself recursively until the
+# path overflows (ENAMETOOLONG). Mirrors .gitignore lines 70-71.
+cdk/test/integ/cdk-integ.out.*/
+cdk/test/integ/*.snapshot/
+
 # CLI and docs build artifacts
 cli/lib/
 cli/node_modules/

From 02b8fd2d4cf08f166a7544ff2e0cc4067fb39cfd Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:39 -0400
Subject: [PATCH 04/15] test(cdk): make Phase 1 integ teardown + diagnostics
 robust (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two test-harness fixes from the first live run (no agent.ts changes):

1. integ-runner --verbose (cdk/mise.toml): the default one-line-per-test output
   hides which assertion failed and its actual-vs-expected payload. The
   lifecycle test polls DynamoDB for terminal task status; without verbose, a
   task stuck at SUBMITTED instead of COMPLETED is undiagnosable. Verbose
   surfaces the assertion diffs.

2. Retry-with-backoff teardown (integ.yml): the AgentCore Runtime runs in VPC
   mode and injects service-managed "agentic_ai" ENIs into the private subnets.
   Deleting the Runtime reclaims those ENIs ASYNCHRONOUSLY, so a single
   delete-stack races the reaper and fails DELETE_FAILED on the subnets +
   RuntimeSG. The first live run stranded the stack exactly this way; a manual
   re-delete minutes later succeeded. The teardown now loops
   delete → wait → re-check with linear backoff (60s..6m) and only fails the
   step if all attempts are exhausted.
---
 .github/workflows/integ.yml | 46 +++++++++++++++++++++++++++++++------
 cdk/mise.toml               |  8 ++++++-
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index 44dc363f..b5e19040 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -235,21 +235,53 @@ jobs:
       #
       # Both integ stacks are swept: backgroundagent-integ (Phase-0 smoke) and
       # backgroundagent-integ-lifecycle (Phase-1 full-stack lifecycle).
+      #
+      # RETRY-WITH-BACKOFF: the full lifecycle stack runs the AgentCore Runtime
+      # in VPC mode, which injects service-managed ("agentic_ai") ENIs into the
+      # private subnets. When CloudFormation deletes the Runtime resource, those
+      # ENIs are reclaimed ASYNCHRONOUSLY by AWS — for a minute or few AFTER the
+      # Runtime is gone. A single delete-stack races that reaper and fails with
+      # DELETE_FAILED on the subnets + RuntimeSG ("has dependencies / dependent
+      # object"). Empirically a second delete a few minutes later succeeds once
+      # the ENIs have drained. So we loop: delete, wait, and on DELETE_FAILED
+      # back off and re-issue. Only after exhausting all attempts do we fail the
+      # step loudly (a genuine leak that needs a human).
       - name: Ensure stacks torn down
         if: always()
         env:
           AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
           AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
         run: |
-          set -euo pipefail
-          for stack in backgroundagent-integ backgroundagent-integ-lifecycle; do
-            aws cloudformation delete-stack --stack-name "$stack" || true
-          done
-          # No `|| true` on the waits: a DELETE_FAILED must surface loudly so we
-          # never silently leak billable resources in the shared account.
+          set -uo pipefail
+
+          # Delete one stack, retrying on the async-ENI-reaper DELETE_FAILED race.
+          # Returns 0 once the stack no longer exists; non-zero only if every
+          # attempt is exhausted while the stack still exists.
+          teardown() {
+            local stack="$1" attempts=6 i status
+            for ((i=1; i<=attempts; i++)); do
+              aws cloudformation delete-stack --stack-name "$stack" || true
+              # wait returns non-zero on DELETE_FAILED or if the stack is gone;
+              # we read the real status afterward rather than trust its exit code.
+              aws cloudformation wait stack-delete-complete --stack-name "$stack" 2>/dev/null || true
+              status="$(aws cloudformation describe-stacks --stack-name "$stack" \
+                --query 'Stacks[0].StackStatus' --output text 2>&1 || true)"
+              if echo "$status" | grep -qiE 'does not exist|ValidationError'; then
+                echo "✅ $stack deleted (attempt $i)"
+                return 0
+              fi
+              echo "⏳ $stack still present after attempt $i (status: $status) — backing off for the ENI reaper"
+              sleep $((i * 60))   # linear backoff: 60s, 120s, ... up to 6 min
+            done
+            echo "❌ $stack still present after $attempts attempts — manual cleanup required (likely orphaned agentic_ai ENIs)"
+            return 1
+          }
+
+          rc=0
           for stack in backgroundagent-integ backgroundagent-integ-lifecycle; do
-            aws cloudformation wait stack-delete-complete --stack-name "$stack"
+            teardown "$stack" || rc=1
           done
+          exit $rc
 
       # Sandbox cleanup for the gate scenarios (3 & 4): coding/new-task-v1 pushes
       # a `bgagent/<task_id>/<slug>` branch and (on approve) opens a PR on the
diff --git a/cdk/mise.toml b/cdk/mise.toml
index 60332012..86dc1b90 100644
--- a/cdk/mise.toml
+++ b/cdk/mise.toml
@@ -66,7 +66,13 @@ run = [
   # No --update-on-failed: .snapshot/ is gitignored, so there is no committed
   # snapshot to diff against or update. --force re-runs the deploy-then-verify
   # unconditionally, which is what we want in CI.
-  "npx integ-runner --language typescript --directory test/integ --force",
+  #
+  # --verbose: integ-runner otherwise prints only a one-line pass/fail per test,
+  # which hides WHICH assertion failed and its actual-vs-expected payload. The
+  # lifecycle test polls DynamoDB for terminal task status; without --verbose a
+  # failure (e.g. task stuck at SUBMITTED instead of COMPLETED) is undiagnosable
+  # from the log alone. Verbose surfaces the assertion diffs we need.
+  "npx integ-runner --language typescript --directory test/integ --force --verbose",
 ]
 
 [tasks.bundle]

From 59953e14c2610905d16e0460d66c4875f920e6fc Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:39 -0400
Subject: [PATCH 05/15] test(cdk): assert task-record fields + approval
 metadata, not just status (#317)

#317 asks the terminal-state assertions to cover task_id, user_id, status,
timestamps, and approval metadata. The first cut asserted only `status`. Add a
`presentString` matcher (Match.objectLike on { S: stringLikeRegexp('.+') }) and
extend every poll:

- COMPLETED: + task_id, user_id, created_at, updated_at present
- FAILED:    + the above, plus error_message present (records WHY it failed)
- AWAITING_APPROVAL (both gates): + task_id, user_id, awaiting_approval_request_id
- APPROVED row: + task_id, request_id, user_id, decided_at (proves the owning
  caller recorded the decision, not just a status flip)
- DENIED row:   + the above, plus deny_reason present

Runtime-generated values (user_id = Cognito sub UUID, ISO timestamps) are matched
for presence, not pinned. Validated with integ-runner --force --dry-run (synth +
assertion serialization pass).
---
 cdk/test/integ/integ.task-lifecycle.ts | 77 ++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 6 deletions(-)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index bf4a9082..a729ca85 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -48,9 +48,19 @@
 import { randomBytes } from 'node:crypto';
 import { ExpectedResult, IntegTest } from '@aws-cdk/integ-tests-alpha';
 import { App, type CfnOutput, Duration } from 'aws-cdk-lib';
+import { Match } from 'aws-cdk-lib/assertions';
 import { TaskStatus } from '../../src/constructs/task-status';
 import { AgentStack } from '../../src/stacks/agent';
 
+// Presence matcher for DynamoDB string attributes ({ S: <non-empty> }). Used to
+// assert task-record fields exist and are populated without pinning their
+// runtime-generated values (e.g. user_id is the caller's Cognito `sub`, a UUID
+// not known at synth time; created_at/updated_at are ISO timestamps). #317 asks
+// the terminal-state assertions to cover task_id, user_id, status, timestamps,
+// and approval metadata — these matchers satisfy the "exists + non-empty" half
+// while `status` is asserted exactly.
+const presentString = Match.objectLike({ S: Match.stringLikeRegexp('.+') });
+
 const app = new App();
 
 // The real, full production stack. Environment-agnostic on purpose (same
@@ -185,7 +195,15 @@ const pollComplete = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: submitComplete.getAttString('body.data.task_id') } },
 });
 pollComplete
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.COMPLETED } } }))
+  .expect(ExpectedResult.objectLike({
+    Item: Match.objectLike({
+      status: { S: TaskStatus.COMPLETED },
+      task_id: presentString,
+      user_id: presentString,
+      created_at: presentString,
+      updated_at: presentString,
+    }),
+  }))
   .waitForAssertions(TERMINAL_POLL);
 
 // --- Scenario 2: FAILED (coding/new-task-v1 against a nonexistent repo) --------
@@ -213,7 +231,17 @@ const pollFail = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: submitFail.getAttString('body.data.task_id') } },
 });
 pollFail
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.FAILED } } }))
+  .expect(ExpectedResult.objectLike({
+    Item: Match.objectLike({
+      status: { S: TaskStatus.FAILED },
+      task_id: presentString,
+      user_id: presentString,
+      created_at: presentString,
+      updated_at: presentString,
+      // The terminal error path must record WHY it failed (clone/preflight).
+      error_message: presentString,
+    }),
+  }))
   .waitForAssertions(TERMINAL_POLL);
 
 // --- Token seeding (prerequisite for gate scenarios) --------------------------
@@ -260,7 +288,16 @@ const pollGateApprove = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: approveTaskId } },
 });
 pollGateApprove
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
+  .expect(ExpectedResult.objectLike({
+    Item: Match.objectLike({
+      status: { S: TaskStatus.AWAITING_APPROVAL },
+      task_id: presentString,
+      user_id: presentString,
+      // Cedar HITL invariant: AWAITING_APPROVAL rows carry the pending request id
+      // that the approve/deny call must reference (task-status.ts §invariant).
+      awaiting_approval_request_id: presentString,
+    }),
+  }))
   .waitForAssertions(GATE_POLL);
 
 // Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is
@@ -290,7 +327,18 @@ const pollApproveDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: approveTaskId }, request_id: { S: approveRequestId } },
 });
 pollApproveDecision
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'APPROVED' } } }))
+  .expect(ExpectedResult.objectLike({
+    Item: Match.objectLike({
+      status: { S: 'APPROVED' },
+      task_id: presentString,
+      request_id: presentString,
+      // ApproveTaskFn writes decided_at + the caller's user_id on the row; their
+      // presence proves the decision was recorded by the owning caller, not just
+      // that a status string flipped (approval metadata, per #317).
+      user_id: presentString,
+      decided_at: presentString,
+    }),
+  }))
   .waitForAssertions(GATE_POLL);
 
 // --- Scenario 4: AWAITING_APPROVAL -> deny ------------------------------------
@@ -317,7 +365,14 @@ const pollGateDeny = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: denyTaskId } },
 });
 pollGateDeny
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
+  .expect(ExpectedResult.objectLike({
+    Item: Match.objectLike({
+      status: { S: TaskStatus.AWAITING_APPROVAL },
+      task_id: presentString,
+      user_id: presentString,
+      awaiting_approval_request_id: presentString,
+    }),
+  }))
   .waitForAssertions(GATE_POLL);
 
 const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', {
@@ -341,7 +396,17 @@ const pollDenyDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: denyTaskId }, request_id: { S: denyRequestId } },
 });
 pollDenyDecision
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'DENIED' } } }))
+  .expect(ExpectedResult.objectLike({
+    Item: Match.objectLike({
+      status: { S: 'DENIED' },
+      task_id: presentString,
+      request_id: presentString,
+      user_id: presentString,
+      decided_at: presentString,
+      // DenyTaskFn persists the (sanitized) reason; we sent a non-empty one.
+      deny_reason: presentString,
+    }),
+  }))
   .waitForAssertions(GATE_POLL);
 
 // --- Execution order ----------------------------------------------------------

From df73ef809cc483ca8988eb9afec561e7d08f8956 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:39 -0400
Subject: [PATCH 06/15] fix(cdk): satisfy onboarding gate + guardrail in Phase
 1 scenarios (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First live run diagnosed via the assertion-provider Lambda logs (which survive
teardown). All three failures were test-side, not platform bugs:

1. Scenario 1 → 400 VALIDATION_ERROR "Task description was blocked by content
   policy". The terse imperative prompt ("Reply with exactly the single word:
   done. Do not use any tools.") tripped the Bedrock guardrail. Replaced with a
   benign natural-language request.

2. Scenarios 2/3/4 → 422 REPO_NOT_ONBOARDED. The submit path enforces a
   RepoTable onboarding gate BEFORE clone/preflight, which the plan never
   accounted for. RepoTable lives in-stack (RemovalPolicy.DESTROY), so it is
   fresh every run — onboard via putItem assertions (minimal active row) rather
   than adding a Blueprint construct to the production stack:
   - onboardFailRepo: onboards the (GitHub-nonexistent) repo so scenario 2 now
     passes admission and fails at CLONE (the intended terminal-FAILED path),
     not at the gate.
   - onboardSandbox: onboards SANDBOX_REPO so the gate scenarios reach the agent.

Reads the RepoTableName CfnOutput; sequences both onboarding steps before their
submits in the .next() chain. Validated with integ-runner --force --dry-run.
---
 cdk/test/integ/integ.task-lifecycle.ts | 66 +++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 12 deletions(-)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index a729ca85..24dee1f3 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -86,6 +86,11 @@ const userPoolId = output('UserPoolId');
 const appClientId = output('AppClientId');
 const taskTableName = output('TaskTableName');
 const taskApprovalsTableName = output('TaskApprovalsTableName');
+// The submit path enforces an onboarding gate: a repo must have an active row in
+// RepoTable or POST /tasks returns 422 REPO_NOT_ONBOARDED before clone/preflight.
+// The gate scenarios onboard SANDBOX_REPO here (a putItem assertion) rather than
+// adding a Blueprint construct to the production stack — test-side only.
+const repoTableName = output('RepoTableName');
 // AgentStack creates its OWN empty GitHubTokenSecret (agent.ts:181,
 // RemovalPolicy.DESTROY) — it does not reference an external one. The gate
 // scenarios populate it post-deploy from the pre-seeded secret below, which is
@@ -182,7 +187,11 @@ const submitComplete = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
   },
   body: JSON.stringify({
     workflow_ref: 'default/agent-v1',
-    task_description: 'Reply with exactly the single word: done. Do not use any tools.',
+    // Keep this a plain, benign natural-language request. An earlier terse,
+    // imperative phrasing ("Reply with exactly the single word: done. Do not
+    // use any tools.") tripped the Bedrock content-policy guardrail at submit
+    // (400 VALIDATION_ERROR "Task description was blocked by content policy").
+    task_description: 'Please write a one-sentence summary explaining what a pull request is in software development.',
     max_turns: 2,
     max_budget_usd: 0.5,
   }),
@@ -206,11 +215,26 @@ pollComplete
   }))
   .waitForAssertions(TERMINAL_POLL);
 
-// --- Scenario 2: FAILED (coding/new-task-v1 against a nonexistent repo) --------
-// The coding workflow requires a repo and clones it. Pointing it at a repo that
-// does not exist makes preflight/clone fail fast, so the orchestrator writes a
-// terminal FAILED with an error_message — no agent turn, no runtime spin-up, and
-// no valid GitHub token required.
+// --- Scenario 2: FAILED (coding/new-task-v1, onboarded repo, clone fails) ------
+// The submit path runs the onboarding gate (RepoTable) BEFORE clone/preflight,
+// so an un-onboarded repo is rejected at submit (422 REPO_NOT_ONBOARDED) and the
+// task never reaches a terminal FAILED. To exercise the terminal-error path we
+// must therefore ONBOARD the repo first, then make CLONE fail: the onboarding
+// gate only checks RepoTable, not GitHub, so we onboard a repo slug that does
+// not exist on GitHub. Submit then passes admission, preflight/clone 404s, and
+// the orchestrator writes terminal FAILED + error_message — no agent turn, no
+// runtime spin-up. (onboardFailRepo is sequenced before this submit.)
+const failRepo = `abca-integ-nonexistent/does-not-exist-${randomBytes(6).toString('hex')}`;
+const onboardFailRepo = integ.assertions.awsApiCall('DynamoDB', 'putItem', {
+  TableName: repoTableName,
+  Item: {
+    repo: { S: failRepo },
+    status: { S: 'active' },
+    onboarded_at: { S: '2026-01-01T00:00:00.000Z' },
+    updated_at: { S: '2026-01-01T00:00:00.000Z' },
+  },
+});
+
 const submitFail = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
   method: 'POST',
   headers: {
@@ -219,7 +243,7 @@ const submitFail = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
   },
   body: JSON.stringify({
     workflow_ref: 'coding/new-task-v1',
-    repo: `abca-integ-nonexistent/does-not-exist-${randomBytes(6).toString('hex')}`,
+    repo: failRepo,
     task_description: 'This task targets a nonexistent repo and must fail at clone/preflight.',
     max_turns: 1,
     max_budget_usd: 0.5,
@@ -258,6 +282,20 @@ const seedPut = integ.assertions.awsApiCall('SecretsManager', 'putSecretValue',
   SecretString: seedGet.getAttString('SecretString'),
 });
 
+// Onboard SANDBOX_REPO so the gate submits pass the onboarding gate (otherwise
+// 422 REPO_NOT_ONBOARDED at submit, before the agent ever runs). A minimal active
+// row is enough — the agent reads the GitHub token from the platform-default
+// GitHubTokenSecret we seeded above, so the blueprint needs no per-repo token.
+const onboardSandbox = integ.assertions.awsApiCall('DynamoDB', 'putItem', {
+  TableName: repoTableName,
+  Item: {
+    repo: { S: SANDBOX_REPO },
+    status: { S: 'active' },
+    onboarded_at: { S: '2026-01-01T00:00:00.000Z' },
+    updated_at: { S: '2026-01-01T00:00:00.000Z' },
+  },
+});
+
 // --- Scenario 3: AWAITING_APPROVAL -> approve ---------------------------------
 // coding/new-task-v1 against the sandbox. The task asks the agent to write a
 // `config.env` file, which the Write tool routes through the write_env_files
@@ -410,20 +448,24 @@ pollDenyDecision
   .waitForAssertions(GATE_POLL);
 
 // --- Execution order ----------------------------------------------------------
-// Auth first, then the two no-repo scenarios (submit both, then wait so their
-// agent runs proceed concurrently). Next seed the GitHub token, then submit both
-// gate tasks so they spin up concurrently and park at their gates; finally drive
-// approve then deny. The approve/deny flows are sequential because each
-// approve/deny POST needs the request_id read from the parked task's approval row.
+// Auth first. Scenario 1 (no repo) can submit immediately. Scenario 2 needs its
+// repo onboarded BEFORE submit (else 422 at the onboarding gate), so onboardFail
+// precedes submitFail. Both no-repo/clone-fail terminals are then awaited so they
+// proceed concurrently. Next seed the GitHub token AND onboard the sandbox, then
+// submit both gate tasks so they spin up concurrently and park at their gates;
+// finally drive approve then deny. The approve/deny flows are sequential because
+// each POST needs the request_id read from the parked task's approval row.
 createUser
   .next(setPassword)
   .next(auth)
   .next(submitComplete)
+  .next(onboardFailRepo)
   .next(submitFail)
   .next(pollComplete)
   .next(pollFail)
   .next(seedGet)
   .next(seedPut)
+  .next(onboardSandbox)
   .next(submitApprove)
   .next(submitDeny)
   .next(pollGateApprove)

From e605feec2805b16487a0cffa843029d0517cfbff Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:39 -0400
Subject: [PATCH 07/15] test(cdk): disable two-phase update workflow for the
 lifecycle test (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

integ-runner defaults stackUpdateWorkflow=true, which deploys the snapshot then
re-deploys the current version to verify in-place updates. The AgentCore Runtime
takes minutes to reach READY and is partly immutable, so the second phase races
the first (Runtime still CREATING) → 409 "agent is currently being modified" →
integ-runner aborts mid-deploy and teardown strands a CREATING Runtime
(DELETE_FAILED). Observed live: 334/335 resources deployed, only the Runtime was
mid-create when the run aborted ~95s in.

This test validates runtime behavior, not stack-update safety, so a single clean
deploy is correct. Set stackUpdateWorkflow: false on the IntegTest.
---
 cdk/test/integ/integ.task-lifecycle.ts | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index 24dee1f3..a1033ec4 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -124,6 +124,15 @@ const PRESEEDED_PAT_SECRET = 'bgagent/integ/github-pat';
 
 const integ = new IntegTest(app, 'TaskLifecycle', {
   testCases: [stack],
+  // Disable the two-phase update workflow. By default integ-runner deploys the
+  // committed snapshot first, then re-deploys the current version to verify
+  // in-place updates don't break. The AgentCore Runtime takes several minutes to
+  // go CREATING -> READY and is partly immutable; the second deploy phase races
+  // the first (Runtime still CREATING) -> 409 "agent is currently being modified"
+  // -> integ-runner aborts mid-deploy and teardown strands a CREATING Runtime.
+  // We validate runtime BEHAVIOR, not stack-update safety, so a single clean
+  // deploy is correct here.
+  stackUpdateWorkflow: false,
   // Force teardown on success and failure so a failed assertion never strands
   // the (expensive) full stack in the shared E2E account. The CI workflow keeps
   // a CloudFormation delete-stack safety net on top of this.

From d754b7d4e05401d75ff5068a7a31dc851cfbd17c Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:40 -0400
Subject: [PATCH 08/15] test(cdk): per-run unique stack name + tolerate ENI
 teardown failure (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AgentCore Runtime injects service-managed `agentic_ai` ENIs that AWS releases
asynchronously, so `cdk destroy` reliably fails the subnet/SG/VPC deletes
(DependencyViolation) and strands the stack. With the old fixed name
(backgroundagent-integ-lifecycle) a stranded stack BLOCKED the next run (name
conflict) and falsely failed the run on teardown alone.

Two changes:
- Unique per-commit stack name `int-<short-sha>`, sourced from the COMMIT_HASH
  env var (read directly via process.env — integ-runner synths in a subprocess
  that inherits the env but not our shell's CDK context). CI sets COMMIT_HASH
  from the resolved head SHA; mise //cdk:integ falls back to the local git SHA.
  A stranded stack now never blocks a later run on a different commit; an
  out-of-band ephemeral sweeper reclaims int-* stacks once their ENIs detach.
- destroy.expectError:true (scoped to the dependency-violation message) so the
  expected ENI teardown failure no longer marks the whole run FAILED — the result
  reflects the ASSERTIONS, not the teardown race.

integ.yml: pass COMMIT_HASH to the run; rework the safety-net teardown to target
the per-run int-<sha> stack, best-effort (no retry-until-deleted, no job failure)
since unique names mean stranded stacks are harmless and swept out of band.

Verified: integ-runner --force --dry-run synthesizes `int-abcdef12` from
COMMIT_HASH (was `int-local` before the env-read fix).
---
 .github/workflows/integ.yml            | 70 +++++++++++---------------
 cdk/mise.toml                          | 12 ++++-
 cdk/test/integ/integ.task-lifecycle.ts | 42 ++++++++++++++--
 3 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index b5e19040..6c5f3257 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -220,6 +220,12 @@ jobs:
         run: yarn install --immutable
 
       - name: Run integ tests (deploy → assert → destroy)
+        # COMMIT_HASH drives the per-run unique stack name `int-<hash>` (see
+        # cdk/test/integ/integ.task-lifecycle.ts + cdk/mise.toml). Using the
+        # resolved head SHA means a stranded stack from a failed teardown never
+        # collides with / blocks a later run on a different commit.
+        env:
+          COMMIT_HASH: ${{ needs.resolve.outputs.head_sha }}
         run: mise //cdk:integ
 
       # Safety net: integ-runner forces teardown on success and failure, but if
@@ -233,55 +239,37 @@ jobs:
       # CloudFormation name instead. delete-stack is idempotent (no-op if
       # already gone), so `|| true` only guards transient API errors.
       #
-      # Both integ stacks are swept: backgroundagent-integ (Phase-0 smoke) and
-      # backgroundagent-integ-lifecycle (Phase-1 full-stack lifecycle).
+      # Best-effort delete-stack safety net for crash/cancel cases. integ-runner
+      # already runs its own destroy (and tolerates the expected ENI DELETE_FAILED
+      # via expectError); this only catches a run that died BEFORE integ-runner's
+      # own teardown (e.g. the job was cancelled mid-deploy).
+      #
+      # Stacks swept: backgroundagent-integ (Phase-0 smoke, fixed name) and the
+      # Phase-1 per-run stack `int-<short-sha>` (matches the name computed in
+      # cdk/test/integ/integ.task-lifecycle.ts from the same head SHA).
       #
-      # RETRY-WITH-BACKOFF: the full lifecycle stack runs the AgentCore Runtime
-      # in VPC mode, which injects service-managed ("agentic_ai") ENIs into the
-      # private subnets. When CloudFormation deletes the Runtime resource, those
-      # ENIs are reclaimed ASYNCHRONOUSLY by AWS — for a minute or few AFTER the
-      # Runtime is gone. A single delete-stack races that reaper and fails with
-      # DELETE_FAILED on the subnets + RuntimeSG ("has dependencies / dependent
-      # object"). Empirically a second delete a few minutes later succeeds once
-      # the ENIs have drained. So we loop: delete, wait, and on DELETE_FAILED
-      # back off and re-issue. Only after exhausting all attempts do we fail the
-      # step loudly (a genuine leak that needs a human).
-      - name: Ensure stacks torn down
+      # IMPORTANT — this step does NOT retry-until-deleted and does NOT fail the
+      # job on a stranded stack. The AgentCore Runtime's service-managed
+      # `agentic_ai` ENIs are released asynchronously by AWS (minutes to hours),
+      # so an immediate delete reliably hits DELETE_FAILED on the subnets/SG/VPC.
+      # Because the stack name is now per-commit-UNIQUE, a stranded `int-<sha>`
+      # stack never blocks a future run, so we leave it for the out-of-band
+      # ephemeral sweeper to reclaim once the ENIs detach. We fire one delete to
+      # start the teardown and move on.
+      - name: Ensure stacks torn down (best effort)
         if: always()
         env:
           AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
           AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+          HEAD_SHA: ${{ needs.resolve.outputs.head_sha }}
         run: |
           set -uo pipefail
-
-          # Delete one stack, retrying on the async-ENI-reaper DELETE_FAILED race.
-          # Returns 0 once the stack no longer exists; non-zero only if every
-          # attempt is exhausted while the stack still exists.
-          teardown() {
-            local stack="$1" attempts=6 i status
-            for ((i=1; i<=attempts; i++)); do
-              aws cloudformation delete-stack --stack-name "$stack" || true
-              # wait returns non-zero on DELETE_FAILED or if the stack is gone;
-              # we read the real status afterward rather than trust its exit code.
-              aws cloudformation wait stack-delete-complete --stack-name "$stack" 2>/dev/null || true
-              status="$(aws cloudformation describe-stacks --stack-name "$stack" \
-                --query 'Stacks[0].StackStatus' --output text 2>&1 || true)"
-              if echo "$status" | grep -qiE 'does not exist|ValidationError'; then
-                echo "✅ $stack deleted (attempt $i)"
-                return 0
-              fi
-              echo "⏳ $stack still present after attempt $i (status: $status) — backing off for the ENI reaper"
-              sleep $((i * 60))   # linear backoff: 60s, 120s, ... up to 6 min
-            done
-            echo "❌ $stack still present after $attempts attempts — manual cleanup required (likely orphaned agentic_ai ENIs)"
-            return 1
-          }
-
-          rc=0
-          for stack in backgroundagent-integ backgroundagent-integ-lifecycle; do
-            teardown "$stack" || rc=1
+          INT_STACK="int-$(printf '%s' "$HEAD_SHA" | cut -c1-8)"
+          for stack in backgroundagent-integ "$INT_STACK"; do
+            echo "Best-effort delete-stack: $stack"
+            aws cloudformation delete-stack --stack-name "$stack" || true
           done
-          exit $rc
+          echo "Initiated teardown; stranded int-* stacks (if any) are reclaimed by the ephemeral sweeper once their ENIs detach."
 
       # Sandbox cleanup for the gate scenarios (3 & 4): coding/new-task-v1 pushes
       # a `bgagent/<task_id>/<slug>` branch and (on approve) opens a PR on the
diff --git a/cdk/mise.toml b/cdk/mise.toml
index 86dc1b90..990db7fb 100644
--- a/cdk/mise.toml
+++ b/cdk/mise.toml
@@ -63,6 +63,13 @@ description = "CDK deploy-then-verify integration tests (integ-runner). Needs AW
 depends = [":compile"]
 run = [
   "mkdir -p $TMPDIR",
+  # Per-run unique stack naming: the lifecycle test names its stack `int-<hash>`
+  # from the COMMIT_HASH env var (read directly via process.env in the test —
+  # integ-runner synths in a subprocess that inherits the env but not our shell's
+  # CDK context). A stranded stack (the AgentCore ENI teardown race) then never
+  # blocks the next run. Source: COMMIT_HASH (set by CI from the resolved head
+  # SHA), falling back to the local git SHA, then "local" outside a checkout.
+  #
   # No --update-on-failed: .snapshot/ is gitignored, so there is no committed
   # snapshot to diff against or update. --force re-runs the deploy-then-verify
   # unconditionally, which is what we want in CI.
@@ -72,7 +79,10 @@ run = [
   # lifecycle test polls DynamoDB for terminal task status; without --verbose a
   # failure (e.g. task stuck at SUBMITTED instead of COMPLETED) is undiagnosable
   # from the log alone. Verbose surfaces the assertion diffs we need.
-  "npx integ-runner --language typescript --directory test/integ --force --verbose",
+  '''
+  export COMMIT_HASH="${COMMIT_HASH:-$(git rev-parse HEAD 2>/dev/null || echo local)}"
+  npx integ-runner --language typescript --directory test/integ --force --verbose
+  ''',
 ]
 
 [tasks.bundle]
diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index a1033ec4..c1dfb35c 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -63,6 +63,25 @@ const presentString = Match.objectLike({ S: Match.stringLikeRegexp('.+') });
 
 const app = new App();
 
+// Per-run UNIQUE stack name: `int-<commit-hash>`. A fixed name is a trap for this
+// stack — the AgentCore Runtime injects service-managed `agentic_ai` ENIs that AWS
+// releases ASYNCHRONOUSLY, so `cdk destroy` reliably fails the subnet/SG/VPC
+// deletes (DependencyViolation) and strands the stack. With a fixed name that
+// stranded stack BLOCKS the next run (name conflict). A unique per-commit name
+// means a failed teardown never blocks a later run, and an out-of-band ephemeral
+// sweeper can reclaim `int-*` stacks once their ENIs detach.
+//
+// The hash comes from the COMMIT_HASH env var (set by CI from the resolved head
+// SHA; the mise //cdk:integ task falls back to the local git SHA). We read the
+// ENV directly rather than CDK context: integ-runner synthesizes the test app in
+// its own subprocess and does NOT forward CDK_CONTEXT_JSON / `-c` from our shell
+// to that synth, but the subprocess DOES inherit the environment — so the env var
+// reaches `process.env` here reliably where `tryGetContext` would not. Falls back
+// to 'local' outside CI/git. (Date.now()/random are avoided — they'd break integ
+// snapshot determinism; CI always supplies a real sha.)
+const commitHash = (process.env.COMMIT_HASH ?? '').slice(0, 8) || 'local';
+const stackName = `int-${commitHash}`;
+
 // The real, full production stack. Environment-agnostic on purpose (same
 // rationale as Phase 0): an explicit env would force the IntegTest DeployAssert
 // stack — always environment-agnostic — into cross-region references it cannot
@@ -70,8 +89,8 @@ const app = new App();
 //
 // DO NOT set runtimeName/memoryName here or pin them in agent.ts for this
 // deploy: the committed defaults auto-generate stack-name-scoped unique names,
-// which is exactly what lets backgroundagent-integ-lifecycle stand alone.
-const stack = new AgentStack(app, 'backgroundagent-integ-lifecycle', {
+// so each `int-<hash>` stack gets its own non-colliding AgentCore names.
+const stack = new AgentStack(app, stackName, {
   description: 'ABCA Phase-1 integ lifecycle stack (full AgentStack: orchestrator + agent runtime)',
 });
 
@@ -134,10 +153,23 @@ const integ = new IntegTest(app, 'TaskLifecycle', {
   // deploy is correct here.
   stackUpdateWorkflow: false,
   // Force teardown on success and failure so a failed assertion never strands
-  // the (expensive) full stack in the shared E2E account. The CI workflow keeps
-  // a CloudFormation delete-stack safety net on top of this.
+  // the (expensive) full stack in the shared E2E account.
+  //
+  // expectError on destroy: `cdk destroy` RELIABLY fails this stack — the
+  // AgentCore Runtime's service-managed `agentic_ai` ENIs are released
+  // asynchronously by AWS, so the subnet/SG/VPC deletes hit DependencyViolation
+  // ("has dependencies and cannot be deleted" / "has a dependent object") while
+  // the ENIs linger. Without expectError, integ-runner would mark the whole run
+  // FAILED on teardown alone — masking whether the ASSERTIONS passed. We tolerate
+  // the teardown failure (matched on the dependency-violation text — note the
+  // DELETE_FAILED alternative also matches unrelated delete failures) and hand the
+  // stranded `int-<hash>` stack to the sweeper, which reclaims it once ENIs detach.
   cdkCommandOptions: {
-    destroy: { args: { force: true } },
+    destroy: {
+      args: { force: true },
+      expectError: true,
+      expectedMessage: 'cannot be deleted|dependent object|DELETE_FAILED',
+    },
   },
 });
 

From 056d0481d1b5a3fed25b3a79290e7e2e7ee98a65 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:40 -0400
Subject: [PATCH 09/15] =?UTF-8?q?fix(cdk):=20drop=20nested=20Match=20from?=
 =?UTF-8?q?=20polled=20assertions=20=E2=80=94=20they=20never=20match=20(#3?=
 =?UTF-8?q?17)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live run proved the platform works: scenario-1 task reached COMPLETED in 3.4s
with all fields correct. But the assertion polled it 25× and timed out anyway.

Root cause: nested Match.* matchers (Match.objectLike / Match.stringLikeRegexp,
the field-presence strengthening added earlier) do NOT survive serialization into
the waitForAssertions Step Functions waiter. The provider serializes the Match
object's internals — {name, partial, pattern} — into the EXPECTED pattern, and
the waiter then treats those as literal required keys that never exist on the
DynamoDB row. Observed in the waiter log: "Missing key 'name' / 'partial' /
'pattern'", failing every poll despite status=COMPLETED being present.

Fix: every polled assertion (waitForAssertions) now uses ONLY a flat, exact
scalar — `Item.status.S` (or the APPROVED/DENIED decision string) — which
serializes cleanly. Removed the unused Match import. Field-presence assertions
(task_id/user_id/timestamps/approval metadata, #317) need a separate non-polled
getItem + assertAtPath — re-noted as a follow-up on #317.
---
 cdk/test/integ/integ.task-lifecycle.ts | 87 +++++---------------------
 1 file changed, 17 insertions(+), 70 deletions(-)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index c1dfb35c..6d9714f7 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -48,18 +48,20 @@
 import { randomBytes } from 'node:crypto';
 import { ExpectedResult, IntegTest } from '@aws-cdk/integ-tests-alpha';
 import { App, type CfnOutput, Duration } from 'aws-cdk-lib';
-import { Match } from 'aws-cdk-lib/assertions';
 import { TaskStatus } from '../../src/constructs/task-status';
 import { AgentStack } from '../../src/stacks/agent';
 
-// Presence matcher for DynamoDB string attributes ({ S: <non-empty> }). Used to
-// assert task-record fields exist and are populated without pinning their
-// runtime-generated values (e.g. user_id is the caller's Cognito `sub`, a UUID
-// not known at synth time; created_at/updated_at are ISO timestamps). #317 asks
-// the terminal-state assertions to cover task_id, user_id, status, timestamps,
-// and approval metadata — these matchers satisfy the "exists + non-empty" half
-// while `status` is asserted exactly.
-const presentString = Match.objectLike({ S: Match.stringLikeRegexp('.+') });
+// NOTE on assertion shape: every terminal/gate check below runs inside
+// `waitForAssertions` (a polling Step Functions waiter). Nested `Match.*`
+// matchers (objectLike / stringLikeRegexp) CANNOT be used there — the assertion
+// provider serializes the Match object's internals ({name, partial, pattern})
+// into the expected pattern, and the waiter then treats those as literal
+// required keys that never exist on the row, so the assertion fails forever even
+// when the data is correct (observed live: a COMPLETED task polled 25× and timed
+// out). Polled assertions therefore use ONLY flat, exact scalar values (the
+// `status`/decision string), which serialize cleanly. Asserting field PRESENCE
+// (task_id/user_id/timestamps/approval metadata, #317) needs a non-polled
+// getItem with assertAtPath — tracked as a follow-up on #317.
 
 const app = new App();
 
@@ -245,15 +247,7 @@ const pollComplete = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: submitComplete.getAttString('body.data.task_id') } },
 });
 pollComplete
-  .expect(ExpectedResult.objectLike({
-    Item: Match.objectLike({
-      status: { S: TaskStatus.COMPLETED },
-      task_id: presentString,
-      user_id: presentString,
-      created_at: presentString,
-      updated_at: presentString,
-    }),
-  }))
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.COMPLETED } } }))
   .waitForAssertions(TERMINAL_POLL);
 
 // --- Scenario 2: FAILED (coding/new-task-v1, onboarded repo, clone fails) ------
@@ -296,17 +290,7 @@ const pollFail = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: submitFail.getAttString('body.data.task_id') } },
 });
 pollFail
-  .expect(ExpectedResult.objectLike({
-    Item: Match.objectLike({
-      status: { S: TaskStatus.FAILED },
-      task_id: presentString,
-      user_id: presentString,
-      created_at: presentString,
-      updated_at: presentString,
-      // The terminal error path must record WHY it failed (clone/preflight).
-      error_message: presentString,
-    }),
-  }))
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.FAILED } } }))
   .waitForAssertions(TERMINAL_POLL);
 
 // --- Token seeding (prerequisite for gate scenarios) --------------------------
@@ -367,16 +351,7 @@ const pollGateApprove = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: approveTaskId } },
 });
 pollGateApprove
-  .expect(ExpectedResult.objectLike({
-    Item: Match.objectLike({
-      status: { S: TaskStatus.AWAITING_APPROVAL },
-      task_id: presentString,
-      user_id: presentString,
-      // Cedar HITL invariant: AWAITING_APPROVAL rows carry the pending request id
-      // that the approve/deny call must reference (task-status.ts §invariant).
-      awaiting_approval_request_id: presentString,
-    }),
-  }))
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
   .waitForAssertions(GATE_POLL);
 
 // Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is
@@ -406,18 +381,7 @@ const pollApproveDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: approveTaskId }, request_id: { S: approveRequestId } },
 });
 pollApproveDecision
-  .expect(ExpectedResult.objectLike({
-    Item: Match.objectLike({
-      status: { S: 'APPROVED' },
-      task_id: presentString,
-      request_id: presentString,
-      // ApproveTaskFn writes decided_at + the caller's user_id on the row; their
-      // presence proves the decision was recorded by the owning caller, not just
-      // that a status string flipped (approval metadata, per #317).
-      user_id: presentString,
-      decided_at: presentString,
-    }),
-  }))
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'APPROVED' } } }))
   .waitForAssertions(GATE_POLL);
 
 // --- Scenario 4: AWAITING_APPROVAL -> deny ------------------------------------
@@ -444,14 +408,7 @@ const pollGateDeny = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: denyTaskId } },
 });
 pollGateDeny
-  .expect(ExpectedResult.objectLike({
-    Item: Match.objectLike({
-      status: { S: TaskStatus.AWAITING_APPROVAL },
-      task_id: presentString,
-      user_id: presentString,
-      awaiting_approval_request_id: presentString,
-    }),
-  }))
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
   .waitForAssertions(GATE_POLL);
 
 const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', {
@@ -475,17 +432,7 @@ const pollDenyDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
   Key: { task_id: { S: denyTaskId }, request_id: { S: denyRequestId } },
 });
 pollDenyDecision
-  .expect(ExpectedResult.objectLike({
-    Item: Match.objectLike({
-      status: { S: 'DENIED' },
-      task_id: presentString,
-      request_id: presentString,
-      user_id: presentString,
-      decided_at: presentString,
-      // DenyTaskFn persists the (sanitized) reason; we sent a non-empty one.
-      deny_reason: presentString,
-    }),
-  }))
+  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'DENIED' } } }))
   .waitForAssertions(GATE_POLL);
 
 // --- Execution order ----------------------------------------------------------

From f53544924682f19e436989f5c528c4529c5ad47f Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:40 -0400
Subject: [PATCH 10/15] fix(cdk): seed GitHub token before any preflight to
 avoid cached-empty 401 (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live run: gate tasks failed preflight with 401 GITHUB_UNREACHABLE despite a valid
PAT, so they never reached AWAITING_APPROVAL and the gate assertions timed out.

Root cause: resolveGitHubToken (context-hydration.ts) caches the secret value for
5 min keyed by ARN. Scenario 2 (coding/new-task-v1) runs GitHub preflight, and in
the old order it ran BEFORE the seedGet/seedPut step — so it read and cached the
stack's INITIAL EMPTY GitHubTokenSecret. Every later gate task reused that cached
empty token → 401 → FAILED before the Cedar gate could fire. (Scenario 2 itself
still "passed" because it targets a nonexistent repo and fails regardless, hiding
the poisoning. Scenario 1 is repo-less so its preflight short-circuits and never
touches the token.)

Fix: move seedGet→seedPut to immediately after auth, before ANY submit, so the
secret is populated before the first token read and no empty value is ever
cached. Stays true to the design (QUICK_START §4: operator populates the secret
before submitting tasks) — no agent.ts change, no import-by-ARN deviation.
Onboarding steps moved ahead of submits too.
---
 cdk/test/integ/integ.task-lifecycle.ts | 31 +++++++++++++++++---------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index 6d9714f7..af695411 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -436,24 +436,33 @@ pollDenyDecision
   .waitForAssertions(GATE_POLL);
 
 // --- Execution order ----------------------------------------------------------
-// Auth first. Scenario 1 (no repo) can submit immediately. Scenario 2 needs its
-// repo onboarded BEFORE submit (else 422 at the onboarding gate), so onboardFail
-// precedes submitFail. Both no-repo/clone-fail terminals are then awaited so they
-// proceed concurrently. Next seed the GitHub token AND onboard the sandbox, then
-// submit both gate tasks so they spin up concurrently and park at their gates;
-// finally drive approve then deny. The approve/deny flows are sequential because
-// each POST needs the request_id read from the parked task's approval row.
+// Auth first, then SEED THE GITHUB TOKEN BEFORE ANY SUBMIT. This ordering is
+// load-bearing: the orchestrator's resolveGitHubToken caches the secret value
+// for 5 min keyed by ARN (context-hydration.ts). Any coding-workflow task that
+// runs GitHub preflight reads + caches the token. Scenario 2 (coding/new-task-v1)
+// runs preflight too — so if it ran BEFORE seedPut, it would cache the stack's
+// INITIAL EMPTY secret and every later gate task would reuse that empty token →
+// preflight 401 GITHUB_UNREACHABLE → FAILED before ever reaching the gate
+// (observed live). Seeding right after auth means the secret is populated before
+// the first token read, so no empty value is ever cached. This is exactly the
+// documented operator flow (QUICK_START §4: populate the secret before submitting
+// tasks) — no agent.ts change.
+//
+// Onboarding: scenario 2's repo and the sandbox both need a RepoTable row before
+// submit (else 422 REPO_NOT_ONBOARDED), so both onboard steps precede their
+// submits. Gate approve/deny run sequentially since each POST needs the
+// request_id read from the parked task's approval row.
 createUser
   .next(setPassword)
   .next(auth)
-  .next(submitComplete)
+  .next(seedGet)
+  .next(seedPut)
   .next(onboardFailRepo)
+  .next(onboardSandbox)
+  .next(submitComplete)
   .next(submitFail)
   .next(pollComplete)
   .next(pollFail)
-  .next(seedGet)
-  .next(seedPut)
-  .next(onboardSandbox)
   .next(submitApprove)
   .next(submitDeny)
   .next(pollGateApprove)

From 309afdf09e58293ea08f619022f6af93059556bf Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 14:15:40 -0400
Subject: [PATCH 11/15] docs(cdk): clarify Phase 1 integ test is
 environment-agnostic (#317)

The header + gate-config comments implied the test is pinned to a specific E2E
account. Reword to reflect reality: it deploys to whatever account/region the
caller's creds resolve to, config is name-based (not ARN-pinned), and note the
PAT needs Contents+PR write for the approve scenario's git push.
---
 cdk/test/integ/integ.task-lifecycle.ts | 32 +++++++++++++++-----------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index af695411..0b6e61c0 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -31,13 +31,15 @@
  *   3. submit -> run -> AWAITING_APPROVAL -> approve    (write_env_files soft-deny gate)
  *   4. submit -> run -> AWAITING_APPROVAL -> deny       (write_env_files soft-deny gate)
  *
- * This runs in the DEDICATED E2E account (<integ-account>), which has no
- * backgroundagent-dev stack — so the AgentCore account-unique runtime/memory
- * name collision that forced Phase 0 to trim DOES NOT apply here. We deploy the
- * committed AgentStack unchanged: it leaves runtimeName/memoryName UNSET, and
- * CDK auto-generates names that include the stack name (backgroundagent-integ-
- * lifecycle), guaranteeing uniqueness. (A local developer's uncommitted agent.ts
- * pin must be stashed before a local `mise //cdk:integ`, or it would collide.)
+ * This is environment-agnostic: it deploys to whatever account/region the
+ * caller's AWS credentials resolve to (CI assumes the integ role; local runs use
+ * your own creds). It should run in a DEDICATED integ account with no
+ * backgroundagent-dev/main stack, so the AgentCore account-unique runtime/memory
+ * names don't collide. We deploy the committed AgentStack unchanged: it leaves
+ * runtimeName/memoryName UNSET and CDK auto-generates names scoped to the
+ * per-run stack name (int-<commit-hash>, see below), guaranteeing uniqueness.
+ * (A local developer's uncommitted agent.ts name pin must be stashed before a
+ * local `mise //cdk:integ`, or it would collide.)
  *
  * Determinism: there is no mock/scripted agent mode — every scenario runs the
  * real `claude` CLI against Bedrock. We bound cost and wall-clock with low
@@ -123,8 +125,9 @@ const githubTokenSecretArn = output('GitHubTokenSecretArn');
 
 // --- Gate-scenario configuration (scenarios 3 & 4) ----------------------------
 // These two constants are the ONLY out-of-band wiring the gate scenarios need.
-// They point at resources an admin provisions once in the E2E account
-// (<integ-account>); scenarios 1 & 2 do NOT depend on them and run regardless.
+// They point at resources an operator provisions once in the integ account
+// (whichever account the run deploys to); scenarios 1 & 2 do NOT depend on them
+// and run regardless.
 //
 //   SANDBOX_REPO  — a throwaway GitHub repo (owner/name) with a committed
 //                   baseline (README + default branch). coding/new-task-v1
@@ -132,10 +135,13 @@ const githubTokenSecretArn = output('GitHubTokenSecretArn');
 //                   trips the write_env_files soft-deny gate, and (on approve)
 //                   pushes a `bgagent/<task_id>/<slug>` branch + opens a PR. The
 //                   CI `always()` cleanup step deletes those branches each run.
-//   PRESEEDED_PAT_SECRET — name (or ARN) of a STABLE Secrets Manager secret in
-//                   the E2E account holding a fine-grained PAT scoped to
-//                   SANDBOX_REPO. Copied into the stack-created GitHubTokenSecret
-//                   by the token-seeding assertion below.
+//                   The PAT below must have Contents+PR WRITE on this repo (a
+//                   read-only token clones fine but the agent's `git push` 403s).
+//   PRESEEDED_PAT_SECRET — name of a STABLE Secrets Manager secret in the integ
+//                   account holding a fine-grained PAT scoped to SANDBOX_REPO.
+//                   Resolved by NAME (not ARN) so it is account-agnostic; copied
+//                   into the stack-created GitHubTokenSecret by the token-seeding
+//                   assertion below.
 //
 // Until these hold real values the gate submits will FAIL at clone/preflight
 // (like scenario 2) rather than reaching AWAITING_APPROVAL — so flip them to the

From 3f82e3948a466496c915ee66cf128af5609afd6e Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Mon, 15 Jun 2026 15:00:28 -0400
Subject: [PATCH 12/15] fix(security): force js-yaml >=4.2.0 to clear
 GHSA-h67p-54hq-rp68 (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

osv.dev recently widened GHSA-h67p-54hq-rp68 to cover js-yaml <4.2.0 (was <4.x),
so the transitive js-yaml@3.14.2 pulled in by @istanbuljs/load-nyc-config (Jest
coverage tooling) now fails the required security-pr.yml dependency scan — main
itself is currently red on this.

Add a js-yaml "^4.2.0" resolution (same pattern the repo already uses for
esbuild/vite/etc.). The resolution collapses the existing 3.14.2 and 4.1.1
lockfile entries onto a single 4.2.0 install; load-nyc-config calls
js-yaml.load(), unchanged across 3→4.
Dev/test-only dependency — no source imports js-yaml, nothing shipped changes.

Verified: osv-scanner → No issues found; cdk compile clean; full jest suite
2039/2039 pass.
---
 package.json |  1 +
 yarn.lock    | 24 ++----------------------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/package.json b/package.json
index 2f8c9f59..714aef1e 100644
--- a/package.json
+++ b/package.json
@@ -24,6 +24,7 @@
     "defu": "^6.1.6",
     "esbuild": "^0.28.1",
     "fast-xml-parser": "^5.7.0",
+    "js-yaml": "^4.2.0",
     "postcss": "^8.5.10",
     "uuid": "^14.0.0",
     "vite": "^7.3.2",
diff --git a/yarn.lock b/yarn.lock
index 9621a92e..01c9666b 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -3429,13 +3429,6 @@ arg@^5.0.0:
   resolved "https://registry.yarnpkg.com/arg/-/arg-5.0.2.tgz#c81433cc427c92c4dcf4865142dbca6f15acd59c"
   integrity sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==
 
-argparse@^1.0.7:
-  version "1.0.10"
-  resolved "https://registry.yarnpkg.com/argparse/-/argparse-1.0.10.tgz#bcd6791ea5ae09725e17e5ad988134cd40b3d911"
-  integrity sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==
-  dependencies:
-    sprintf-js "~1.0.2"
-
 argparse@^2.0.1:
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/argparse/-/argparse-2.0.1.tgz#246f50f3ca78a3240f6c997e8a9bd1eac49e4b38"
@@ -4422,7 +4415,7 @@ espree@^11.2.0:
     acorn-jsx "^5.3.2"
     eslint-visitor-keys "^5.0.1"
 
-esprima@^4.0.0, esprima@^4.0.1:
+esprima@^4.0.1:
   version "4.0.1"
   resolved "https://registry.yarnpkg.com/esprima/-/esprima-4.0.1.tgz#13b04cdb3e6c5d19df91ab6987a8695619b0aa71"
   integrity sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==
@@ -5695,15 +5688,7 @@ js-tokens@^4.0.0:
   resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499"
   integrity sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==
 
-js-yaml@^3.13.1:
-  version "3.14.2"
-  resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-3.14.2.tgz#77485ce1dd7f33c061fd1b16ecea23b55fcb04b0"
-  integrity sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==
-  dependencies:
-    argparse "^1.0.7"
-    esprima "^4.0.0"
-
-js-yaml@^4.1.1:
+js-yaml@^3.13.1, js-yaml@^4.1.1, js-yaml@^4.2.0:
   version "4.2.0"
   resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-4.2.0.tgz#2bd9e85682dd91bd469afb809d816043b3d49524"
   integrity sha512-ePWsvanv0DWuDRsW8dnt+R4jQ31SCRCQ7hhNcPXZPsoBZiemuZNYGf7adZdqX2D86j6rvKp3RpCxVTSb8WQlOw==
@@ -7584,11 +7569,6 @@ spdx-license-ids@^3.0.0:
   resolved "https://registry.yarnpkg.com/spdx-license-ids/-/spdx-license-ids-3.0.23.tgz#b069e687b1291a32f126893ed76a27a745ee2133"
   integrity sha512-CWLcCCH7VLu13TgOH+r8p1O/Znwhqv/dbb6lqWy67G+pT1kHmeD/+V36AVb/vq8QMIQwVShJ6Ssl5FPh0fuSdw==
 
-sprintf-js@~1.0.2:
-  version "1.0.3"
-  resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c"
-  integrity sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==
-
 stable-hash-x@^0.2.0:
   version "0.2.0"
   resolved "https://registry.yarnpkg.com/stable-hash-x/-/stable-hash-x-0.2.0.tgz#dfd76bfa5d839a7470125c6a6b3c8b22061793e9"

From 34e7343b0fa49f123bf27dbb46396bcb5639f660 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Thu, 18 Jun 2026 11:12:25 -0400
Subject: [PATCH 13/15] =?UTF-8?q?test(cdk):=20address=20PR=20review=20?=
 =?UTF-8?q?=E2=80=94=20fresh=20gate=20tokens,=20PENDING=20filter,=20env-dr?=
 =?UTF-8?q?iven=20sandbox=20(#317)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review fixes for PR #348:

- Token-TTL race (blocker): the single idToken minted at auth was reused for the
  approve/deny POSTs, which the serial chain reaches only after up to ~48 min of
  polling budget plus agent cold-start and runtime (the live run took ~54 min).
  That is close enough to Cognito's 60-min default ID-token validity to risk a
  401 → decision never recorded → false timeout. Re-mint a fresh token right
  before each gate POST (reAuthApprove/reAuthDeny in the .next() chain).
  Test-side only — no idTokenValidity change to the production app client.

- Approval-row determinism (nit): queryApprove/queryDeny now filter
  status=PENDING so Items[0] is the live row even if a task trips the gate more
  than once or carries already-decided rows.

- Env-driven sandbox config (nit): SANDBOX_REPO / PRESEEDED_PAT_SECRET now read
  INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID (the CI vars the cleanup step already
  uses), falling back to the literals, so the gate scenarios bind to the running
  account's sandbox instead of a hardcoded contributor repo. integ.yml passes the
  vars to the run step.

Verified: compile, eslint, zizmor, and integ-runner --force --dry-run all pass.
---
 .github/workflows/integ.yml            |  8 ++++
 cdk/test/integ/integ.task-lifecycle.ts | 62 +++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index 6c5f3257..967a7acb 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -224,8 +224,16 @@ jobs:
         # cdk/test/integ/integ.task-lifecycle.ts + cdk/mise.toml). Using the
         # resolved head SHA means a stranded stack from a failed teardown never
         # collides with / blocks a later run on a different commit.
+        #
+        # INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID bind the gate scenarios (3 & 4)
+        # to the account's provisioned sandbox repo + PAT secret instead of a
+        # hardcoded contributor repo. Same vars the sandbox-cleanup step reads.
+        # When unset, the test falls back to its literals and the gates degrade to
+        # clone-failures (still synthesizes).
         env:
           COMMIT_HASH: ${{ needs.resolve.outputs.head_sha }}
+          INTEG_SANDBOX_REPO: ${{ vars.INTEG_SANDBOX_REPO }}
+          INTEG_PAT_SECRET_ID: ${{ vars.INTEG_PAT_SECRET_ID }}
         run: mise //cdk:integ
 
       # Safety net: integ-runner forces teardown on success and failure, but if
diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index 0b6e61c0..ab193aaf 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -143,11 +143,15 @@ const githubTokenSecretArn = output('GitHubTokenSecretArn');
 //                   into the stack-created GitHubTokenSecret by the token-seeding
 //                   assertion below.
 //
-// Until these hold real values the gate submits will FAIL at clone/preflight
-// (like scenario 2) rather than reaching AWAITING_APPROVAL — so flip them to the
-// provisioned repo/secret before relying on scenarios 3 & 4.
-const SANDBOX_REPO = 'ayushtr-aws/abca-integ-sandbox';
-const PRESEEDED_PAT_SECRET = 'bgagent/integ/github-pat';
+// Sourced from CI repo vars (INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID — the same
+// vars the integ.yml sandbox-cleanup step reads), so the gate scenarios bind to
+// whatever sandbox+secret the running account provisioned rather than one
+// contributor's. Fall back to the original literals for local runs in that
+// account. When unset in another account, scenarios 3 & 4 degrade to
+// clone-failures (the test app still synthesizes); set the vars to exercise
+// the Cedar gates.
+const SANDBOX_REPO = process.env.INTEG_SANDBOX_REPO || 'ayushtr-aws/abca-integ-sandbox';
+const PRESEEDED_PAT_SECRET = process.env.INTEG_PAT_SECRET_ID || 'bgagent/integ/github-pat';
 
 const integ = new IntegTest(app, 'TaskLifecycle', {
   testCases: [stack],
@@ -216,6 +220,29 @@ const auth = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
 
 const idToken = auth.getAttString('AuthenticationResult.IdToken');
 
+// Re-mint a FRESH token right before the approve/deny POSTs. The Cognito app
+// client uses the default 60-min ID-token validity (task-api.ts sets no
+// idTokenValidity), but the strictly-serial .next() chain reaches the gate POSTs
+// only after ~32 min (approve) / ~48 min (deny) of polling budget PLUS real agent
+// cold-start + runtime — the live run took ~54 min. Reusing the original token
+// would risk a 401 (expired) → the decision never records → false timeout keyed
+// to agent latency. These re-auths run just before their POSTs in the chain, so
+// each token is minted minutes (not ~50 min) before use. The user/password are
+// permanent (adminSetUserPassword above), so re-auth needs no new setup.
+const reAuthApprove = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
+  AuthFlow: 'USER_PASSWORD_AUTH',
+  ClientId: appClientId,
+  AuthParameters: { USERNAME: username, PASSWORD: password },
+});
+const approveToken = reAuthApprove.getAttString('AuthenticationResult.IdToken');
+
+const reAuthDeny = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
+  AuthFlow: 'USER_PASSWORD_AUTH',
+  ClientId: appClientId,
+  AuthParameters: { USERNAME: username, PASSWORD: password },
+});
+const denyToken = reAuthDeny.getAttString('AuthenticationResult.IdToken');
+
 // Conservative polling windows. Agent runs are real LLM sessions over a freshly
 // cold-started AgentCore runtime; the first invocation pays the cold-start tax.
 const TERMINAL_POLL = { totalTimeout: Duration.minutes(12), interval: Duration.seconds(30) };
@@ -361,13 +388,18 @@ pollGateApprove
   .waitForAssertions(GATE_POLL);
 
 // Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is
-// required because we do not know the agent-minted request_id. getAttString here
-// flips this call to a flattened response, so we do NOT .expect() on it — the
-// decision assertion below uses a separate getItem.
+// required because we do not know the agent-minted request_id. The status=PENDING
+// FilterExpression makes Items[0] deterministic: a task could trip the gate more
+// than once (or carry already-decided rows), and an unfiltered query orders only
+// by SK, so without the filter Items[0] could be the wrong/decided row and the
+// POST would target the wrong request_id. getAttString here flips this call to a
+// flattened response, so we do NOT .expect() on it.
 const queryApprove = integ.assertions.awsApiCall('DynamoDB', 'query', {
   TableName: taskApprovalsTableName,
   KeyConditionExpression: 'task_id = :tid',
-  ExpressionAttributeValues: { ':tid': { S: approveTaskId } },
+  FilterExpression: '#st = :pending',
+  ExpressionAttributeNames: { '#st': 'status' },
+  ExpressionAttributeValues: { ':tid': { S: approveTaskId }, ':pending': { S: 'PENDING' } },
 });
 const approveRequestId = queryApprove.getAttString('Items.0.request_id.S');
 
@@ -375,7 +407,8 @@ const approve = integ.assertions.httpApiCall(`${apiUrl}tasks/${approveTaskId}/ap
   method: 'POST',
   headers: {
     'Content-Type': 'application/json',
-    'Authorization': idToken,
+    // Fresh token (see reAuthApprove) — the original idToken may be expired by now.
+    'Authorization': approveToken,
   },
   body: JSON.stringify({ request_id: approveRequestId, decision: 'approve', scope: 'this_call' }),
 });
@@ -420,7 +453,9 @@ pollGateDeny
 const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', {
   TableName: taskApprovalsTableName,
   KeyConditionExpression: 'task_id = :tid',
-  ExpressionAttributeValues: { ':tid': { S: denyTaskId } },
+  FilterExpression: '#st = :pending',
+  ExpressionAttributeNames: { '#st': 'status' },
+  ExpressionAttributeValues: { ':tid': { S: denyTaskId }, ':pending': { S: 'PENDING' } },
 });
 const denyRequestId = queryDeny.getAttString('Items.0.request_id.S');
 
@@ -428,7 +463,8 @@ const deny = integ.assertions.httpApiCall(`${apiUrl}tasks/${denyTaskId}/deny`, {
   method: 'POST',
   headers: {
     'Content-Type': 'application/json',
-    'Authorization': idToken,
+    // Fresh token (see reAuthDeny) — the original idToken may be expired by now.
+    'Authorization': denyToken,
   },
   body: JSON.stringify({ request_id: denyRequestId, decision: 'deny', reason: 'integ: exercising the deny path' }),
 });
@@ -473,9 +509,11 @@ createUser
   .next(submitDeny)
   .next(pollGateApprove)
   .next(queryApprove)
+  .next(reAuthApprove)
   .next(approve)
   .next(pollApproveDecision)
   .next(pollGateDeny)
   .next(queryDeny)
+  .next(reAuthDeny)
   .next(deny)
   .next(pollDenyDecision);

From a5a188fe2b8132e920eb494de1de8287e7749046 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Thu, 18 Jun 2026 11:33:49 -0400
Subject: [PATCH 14/15] feat(ci): add integ-sweeper to reclaim stranded int-*
 stacks + alarm on leaks (#317)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses PR #348 blocker #1: the teardown comments claimed an 'ephemeral
sweeper' reclaims stranded int-* stacks, but none existed — so failed-teardown
stacks (the expected AgentCore agentic_ai ENI race) accumulated silently, each
leaking a VPC + NAT gateway + interface endpoints + runtime in the shared account.

New scheduled workflow .github/workflows/integ-sweeper.yml (every 2h):
- lists all int-* CloudFormation stacks and issues a best-effort delete-stack
  for each (idempotent; the ENIs have had time to detach by the time it runs,
  unlike the in-run teardown, which races them);
- for any int-* stack still undeletable past ALARM_AGE_HOURS (6h — comfortably
  past the observed ~1-2h ENI-release window, so normal lag never false-alarms),
  FAILS THE JOB and opens a tracking issue (mirrors security.yml's pattern), so a
  genuine leak surfaces loudly instead of hiding.

Runs under environment: integ with OIDC (id-token: write) to assume the same
AWS_ROLE_TO_ASSUME integ.yml uses — keeps the secret in a dedicated environment
(zizmor secrets-outside-env clean). Issue body passed via env, not inline
interpolation (zizmor template-injection clean).

Also corrects the integ.yml + integ.task-lifecycle.ts comments to point at the
now-real sweeper instead of overclaiming. Hybrid teardown story: in-run delete
stays best-effort/quiet (expectError), the sweeper does reliable async cleanup +
the loud alarm.

Verified: YAML parse, bash -n, zizmor (new file 0 findings), compile, eslint.
---
 .github/workflows/integ-sweeper.yml    | 161 +++++++++++++++++++++++++
 .github/workflows/integ.yml            |   6 +-
 cdk/test/integ/integ.task-lifecycle.ts |   8 +-
 3 files changed, 170 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/integ-sweeper.yml

diff --git a/.github/workflows/integ-sweeper.yml b/.github/workflows/integ-sweeper.yml
new file mode 100644
index 00000000..20a1335d
--- /dev/null
+++ b/.github/workflows/integ-sweeper.yml
@@ -0,0 +1,161 @@
+name: integ-sweeper
+# Reclaims stranded ephemeral integ stacks (issue #317 / PR #348 follow-up).
+#
+# The Phase-1 lifecycle integ test (integ.yml + cdk/test/integ/integ.task-lifecycle.ts)
+# deploys a per-run `int-<commit-sha>` stack running the AgentCore Runtime in VPC
+# mode. That runtime injects AWS-service-managed `agentic_ai` ENIs into the private
+# subnets, which AWS releases only ASYNCHRONOUSLY (observed: 1+ hours after the
+# runtime is deleted). So the in-run `cdk destroy` reliably fails the subnet/SG/VPC
+# deletes (DependencyViolation) and the integ run tolerates that failure
+# (destroy.expectError) rather than blocking on a wait it can't win. The per-run
+# UNIQUE stack name means a stranded stack never blocks a later run — but nothing
+# in the run reclaims it either.
+#
+# THIS workflow is that reclaimer: on a schedule (after the ENIs have had time to
+# detach), it deletes every `int-*` stack, and FAILS LOUDLY + opens a tracking
+# issue for any `int-*` stack older than the alarm threshold that still won't
+# delete — so a genuine leak (cost in the shared account) surfaces instead of
+# accumulating silently.
+on:
+  workflow_dispatch: {}
+  schedule:
+    # Every 2 hours. Frequent enough that a normal stranded stack (ENIs release in
+    # ~1-2h) is reclaimed within a cycle or two, well before the 6h alarm age.
+    - cron: "0 */2 * * *"
+
+concurrency:
+  group: integ-sweeper
+  cancel-in-progress: false
+
+permissions:
+  contents: none
+
+jobs:
+  sweep:
+    name: Reclaim stranded int-* stacks
+    runs-on: ubuntu-latest
+    # The integ deploy role (secrets.AWS_ROLE_TO_ASSUME) is scoped to the `integ`
+    # environment — same as integ.yml. The environment's protection rules must
+    # permit this scheduled run to assume the role (no manual approval is possible
+    # on a cron trigger).
+    environment: integ
+    timeout-minutes: 30
+    permissions:
+      id-token: write   # OIDC role assumption
+      contents: read
+      issues: write     # open a tracking issue on a genuine leak
+    env:
+      # Stacks older than this (hours) that STILL fail to delete are treated as a
+      # genuine leak → fail the job + file an issue. Comfortably past the observed
+      # ENI-release window so normal teardown lag never false-alarms.
+      ALARM_AGE_HOURS: "6"
+      AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+      AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
+          aws-region: ${{ vars.AWS_REGION || 'us-east-1' }}
+
+      - name: Sweep int-* stacks
+        id: sweep
+        run: |
+          set -uo pipefail
+
+          # All non-deleted int-* stacks (active, DELETE_FAILED, or rollback states).
+          mapfile -t stacks < <(
+            aws cloudformation list-stacks \
+              --stack-status-filter CREATE_COMPLETE CREATE_FAILED ROLLBACK_COMPLETE ROLLBACK_FAILED \
+                UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE UPDATE_ROLLBACK_FAILED DELETE_FAILED \
+              --query 'StackSummaries[?starts_with(StackName, `int-`)].StackName' \
+              --output text 2>/dev/null | tr '\t' '\n' | sort -u
+          )
+
+          if [ "${#stacks[@]}" -eq 0 ]; then
+            echo "No int-* stacks present. Nothing to sweep."
+            exit 0
+          fi
+
+          echo "Found ${#stacks[@]} int-* stack(s): ${stacks[*]}"
+          now_epoch="$(date -u +%s)"
+          alarm_secs=$(( ALARM_AGE_HOURS * 3600 ))
+          leaked=""
+
+          for stack in "${stacks[@]}"; do
+            [ -n "$stack" ] || continue
+            echo "::group::$stack"
+
+            # Best-effort delete (idempotent; no-op if already deleting/gone).
+            aws cloudformation delete-stack --stack-name "$stack" || true
+            # Give CloudFormation a moment, then read the resulting status.
+            sleep 15
+            status="$(aws cloudformation describe-stacks --stack-name "$stack" \
+              --query 'Stacks[0].StackStatus' --output text 2>&1 || true)"
+
+            if echo "$status" | grep -qiE 'does not exist|ValidationError'; then
+              echo "✅ $stack deleted (or gone)."
+              echo "::endgroup::"
+              continue
+            fi
+
+            # Still present — how old is it? Alarm only if past the threshold.
+            created="$(aws cloudformation describe-stacks --stack-name "$stack" \
+              --query 'Stacks[0].CreationTime' --output text 2>/dev/null || true)"
+            created_epoch="$(date -u -d "$created" +%s 2>/dev/null || echo 0)"
+            age_secs=$(( now_epoch - created_epoch ))
+            age_hours=$(( age_secs / 3600 ))
+
+            if [ "$created_epoch" -gt 0 ] && [ "$age_secs" -ge "$alarm_secs" ]; then
+              echo "❌ $stack still present (status: $status), age ${age_hours}h ≥ ${ALARM_AGE_HOURS}h — LEAK."
+              leaked="${leaked}\n- \`${stack}\` — status \`${status}\`, age ~${age_hours}h"
+            else
+              echo "⏳ $stack still present (status: $status), age ~${age_hours}h — within ${ALARM_AGE_HOURS}h window; ENIs likely not yet released. Will retry next cycle."
+            fi
+            echo "::endgroup::"
+          done
+
+          if [ -n "$leaked" ]; then
+            {
+              echo "leaked<<EOF"
+              echo -e "$leaked"
+              echo "EOF"
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Open issue on genuine leak
+        if: steps.sweep.outputs.leaked != ''
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass via env (not inline ${{ }} interpolation) so the value never
+          # expands into the shell script body — avoids template injection
+          # (zizmor template-injection). Stack names are AWS-controlled, but env
+          # is the correct, lint-clean pattern regardless.
+          LEAKED: ${{ steps.sweep.outputs.leaked }}
+        run: |
+          set -euo pipefail
+          body_file="$(mktemp)"
+          {
+            echo "The integ-sweeper found stranded \`int-*\` CloudFormation stacks older than ${ALARM_AGE_HOURS}h that still fail to delete — likely a real leak in the shared integ account (each carries a VPC + NAT gateway + interface endpoints + the AgentCore runtime, billing hourly)."
+            echo ""
+            echo "These are normally reclaimed automatically once the AgentCore \`agentic_ai\` ENIs detach (~1-2h). Past ${ALARM_AGE_HOURS}h, investigate: the ENIs may be genuinely stuck (needs manual ENI/VPC cleanup) or the deploy role lacks teardown permissions."
+            echo ""
+            echo "### Stranded stacks"
+            echo -e "${LEAKED}"
+            echo ""
+            echo "| Field | Value |"
+            echo "| --- | --- |"
+            echo "| Workflow run | [integ-sweeper #${GITHUB_RUN_NUMBER}](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}) |"
+            echo "| Region | \`${AWS_REGION}\` |"
+            echo ""
+            echo "Close this issue once the stacks are deleted and the sweeper run is green."
+          } > "${body_file}"
+
+          gh issue create \
+            --title "Stranded integ stacks not reclaimed (>${ALARM_AGE_HOURS}h)" \
+            --label bug \
+            --body-file "${body_file}"
+
+      - name: Fail job on genuine leak
+        if: steps.sweep.outputs.leaked != ''
+        run: exit 1
diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index 967a7acb..623cc834 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -262,8 +262,10 @@ jobs:
       # so an immediate delete reliably hits DELETE_FAILED on the subnets/SG/VPC.
       # Because the stack name is now per-commit-UNIQUE, a stranded `int-<sha>`
       # stack never blocks a future run, so we leave it for the out-of-band
-      # ephemeral sweeper to reclaim once the ENIs detach. We fire one delete to
-      # start the teardown and move on.
+      # ephemeral sweeper (.github/workflows/integ-sweeper.yml) to reclaim once the
+      # ENIs detach — that sweeper FAILS LOUDLY + files an issue for any int-*
+      # stack still stuck past its alarm age, so leaks surface rather than
+      # accumulate. Here we just fire one delete to start the teardown and move on.
       - name: Ensure stacks torn down (best effort)
         if: always()
         env:
diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index ab193aaf..e7d58d67 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -72,8 +72,9 @@ const app = new App();
 // releases ASYNCHRONOUSLY, so `cdk destroy` reliably fails the subnet/SG/VPC
 // deletes (DependencyViolation) and strands the stack. With a fixed name that
 // stranded stack BLOCKS the next run (name conflict). A unique per-commit name
-// means a failed teardown never blocks a later run, and an out-of-band ephemeral
-// sweeper can reclaim `int-*` stacks once their ENIs detach.
+// means a failed teardown never blocks a later run, and the out-of-band ephemeral
+// sweeper (.github/workflows/integ-sweeper.yml) reclaims `int-*` stacks once their
+// ENIs detach, alarming if any stays stuck past its age threshold.
 //
 // The hash comes from the COMMIT_HASH env var (set by CI from the resolved head
 // SHA; the mise //cdk:integ task falls back to the local git SHA). We read the
@@ -175,7 +176,8 @@ const integ = new IntegTest(app, 'TaskLifecycle', {
   // FAILED on teardown alone — masking whether the ASSERTIONS passed. We tolerate
   // the teardown failure (scoped to the dependency-violation message so unrelated
   // teardown bugs still surface) and hand the stranded `int-<hash>` stack to the
-  // out-of-band ephemeral sweeper, which reclaims it once AWS detaches the ENIs.
+  // out-of-band ephemeral sweeper (.github/workflows/integ-sweeper.yml), which
+  // reclaims it once AWS detaches the ENIs and alarms if it stays stuck.
   cdkCommandOptions: {
     destroy: {
       args: { force: true },

From 51bdde603c0e77796fcc7347caebb03ca501cf59 Mon Sep 17 00:00:00 2001
From: ayushtr-aws <ayushtr@amazon.com>
Date: Thu, 18 Jun 2026 13:58:31 -0400
Subject: [PATCH 15/15] =?UTF-8?q?test(cdk):=20address=20re-review=20?=
 =?UTF-8?q?=E2=80=94=20sweeper=20hardening,=20gate-skip-on-unset,=20correc?=
 =?UTF-8?q?t=20env-gate=20comments=20(#317)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-review residuals from PR #348 (isadeks):

- Sweeper issue-spam (nit #2): dedup on a stable `integ-leak` label —
  comment on the existing open issue instead of filing a duplicate each
  2h cycle. Label is created idempotently before use.
- Sweeper prefix breadth (nit #3): only sweep `int-<8 hex>` (the CI
  stack-name shape) via an authoritative regex guard, not a bare
  `int-*` glob, so an unrelated stack sharing the 4-char prefix is never
  deleted in the shared account.
- Personal-repo fallback (nit #4): drop the hardcoded
  ayushtr-aws/abca-integ-sandbox / secret literals. Gate scenarios 3 & 4
  now run ONLY when INTEG_SANDBOX_REPO + INTEG_PAT_SECRET_ID are both
  set; otherwise they cleanly skip (warn) instead of routing a write-PAT
  clone into a contributor's personal repo. The gate assertion calls are
  constructed only inside the enabled branch so nothing is registered
  when skipped.
- integ env comments (in-PR part of #1): correct comments that
  overstated the `integ` environment approval as the fork-code gate. The
  enforced gate is the `safe-to-test` label; the environment adds an
  approval ONLY if required reviewers are configured (currently none).
  Note that the cron-triggered sweeper depends on the env staying
  approval-free. Whether to add required reviewers is a maintainer/admin
  action outside this PR.

Validated: compile, eslint, zizmor (both workflows clean), and
integ-runner --force --dry-run pass with gates both disabled and enabled.
---
 .github/workflows/integ-sweeper.yml    |  57 +++-
 .github/workflows/integ.yml            |  58 ++--
 cdk/test/integ/integ.task-lifecycle.ts | 433 +++++++++++++------------
 3 files changed, 320 insertions(+), 228 deletions(-)

diff --git a/.github/workflows/integ-sweeper.yml b/.github/workflows/integ-sweeper.yml
index 20a1335d..30f80d5f 100644
--- a/.github/workflows/integ-sweeper.yml
+++ b/.github/workflows/integ-sweeper.yml
@@ -63,8 +63,20 @@ jobs:
         run: |
           set -uo pipefail
 
-          # All non-deleted int-* stacks (active, DELETE_FAILED, or rollback states).
-          mapfile -t stacks < <(
+          # Only the integ test's own per-run stacks are eligible. The test names
+          # them `int-<commit-hash>` where the hash is the 8-char short SHA
+          # (integ.task-lifecycle.ts: COMMIT_HASH.slice(0,8)). We therefore sweep
+          # ONLY names matching `int-<8 lowercase hex>` — NOT a bare `int-*` glob.
+          # `int-` is a short prefix; an unguarded glob in a shared account could
+          # delete an unrelated stack that merely starts with those 4 chars. The
+          # `int-local` fallback name (local dev runs) is intentionally NOT swept:
+          # CI never produces it, so a match would be someone's local stack.
+          STACK_RE='^int-[0-9a-f]{8}$'
+
+          # int-* stacks in the listed terminal states (complete, failed, or rolled
+          # back; in-progress states are excluded by the status filter); the JMESPath
+          # prefilter narrows the result, the regex below is the authoritative guard.
+          mapfile -t candidates < <(
             aws cloudformation list-stacks \
               --stack-status-filter CREATE_COMPLETE CREATE_FAILED ROLLBACK_COMPLETE ROLLBACK_FAILED \
                 UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE UPDATE_ROLLBACK_FAILED DELETE_FAILED \
@@ -72,6 +84,16 @@ jobs:
               --output text 2>/dev/null | tr '\t' '\n' | sort -u
           )
 
+          stacks=()
+          for c in "${candidates[@]}"; do
+            [ -n "$c" ] || continue
+            if [[ "$c" =~ $STACK_RE ]]; then
+              stacks+=("$c")
+            else
+              echo "Skipping '$c' — does not match ${STACK_RE} (not a sweepable integ stack)."
+            fi
+          done
+
           if [ "${#stacks[@]}" -eq 0 ]; then
             echo "No int-* stacks present. Nothing to sweep."
             exit 0
@@ -132,6 +154,9 @@ jobs:
           # (zizmor template-injection). Stack names are AWS-controlled, but env
           # is the correct, lint-clean pattern regardless.
           LEAKED: ${{ steps.sweep.outputs.leaked }}
+          # Stable label used both to tag the tracking issue and to find an
+          # existing open one — this is the dedup key, so it must not change.
+          LEAK_LABEL: integ-leak
         run: |
           set -euo pipefail
           body_file="$(mktemp)"
@@ -140,7 +165,7 @@ jobs:
             echo ""
             echo "These are normally reclaimed automatically once the AgentCore \`agentic_ai\` ENIs detach (~1-2h). Past ${ALARM_AGE_HOURS}h, investigate: the ENIs may be genuinely stuck (needs manual ENI/VPC cleanup) or the deploy role lacks teardown permissions."
             echo ""
-            echo "### Stranded stacks"
+            echo "### Stranded stacks (as of this run)"
             echo -e "${LEAKED}"
             echo ""
             echo "| Field | Value |"
@@ -151,10 +176,28 @@ jobs:
             echo "Close this issue once the stacks are deleted and the sweeper run is green."
           } > "${body_file}"
 
-          gh issue create \
-            --title "Stranded integ stacks not reclaimed (>${ALARM_AGE_HOURS}h)" \
-            --label bug \
-            --body-file "${body_file}"
+          # Dedup: a stuck stack re-alarms every 2h cycle. Without this guard each
+          # cycle files a fresh duplicate. Find an existing OPEN issue carrying the
+          # stable leak label and comment on it instead of opening another; only
+          # open a new issue when none exists. `--state open --label` scopes the list
+          # to open issues with the label; `--jq '.[0].number // empty'` yields the
+          # first match (or empty). Ensure the label exists first (idempotent; ignore "already exists").
+          gh label create "${LEAK_LABEL}" \
+            --description "Stranded integ stacks flagged by integ-sweeper" \
+            --color B60205 2>/dev/null || true
+
+          existing="$(gh issue list --state open --label "${LEAK_LABEL}" \
+            --json number --jq '.[0].number // empty' 2>/dev/null || true)"
+
+          if [ -n "${existing}" ]; then
+            echo "Existing open leak issue #${existing} — commenting instead of opening a duplicate."
+            gh issue comment "${existing}" --body-file "${body_file}"
+          else
+            gh issue create \
+              --title "Stranded integ stacks not reclaimed (>${ALARM_AGE_HOURS}h)" \
+              --label "${LEAK_LABEL}" \
+              --body-file "${body_file}"
+          fi
 
       - name: Fail job on genuine leak
         if: steps.sweep.outputs.leaked != ''
diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index 623cc834..5819c239 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -7,10 +7,20 @@ name: integ
 #
 # Trigger model mirrors deploy.yml: build.yml completes -> workflow_run picks it
 # up in the trusted base-repo context (secrets/OIDC available even for fork PRs)
-# -> we resolve whether the PR touches cdk/** or agent/** -> an admin approves
-# the `integ` environment gate -> deploy/assert/destroy runs against the shared
-# account -> a commit status `integ-smoke` is posted back to the PR head so it
-# shows up as a (required) check that blocks merge.
+# -> we resolve whether the PR touches cdk/** or agent/** -> deploy/assert/destroy
+# runs against the shared account -> a commit status `integ-smoke` is posted back
+# to the PR head so it shows up as a (required) check that blocks merge.
+#
+# Fork-code gate: the ENFORCED gate on fork-authored test code is the
+# `safe-to-test` label check in the `resolve` job below — a maintainer must apply
+# it before this workflow will run a fork PR. The `integ` GitHub environment is a
+# SECOND, OPTIONAL layer: it only adds a manual approval if required reviewers are
+# configured on it, and at time of writing NONE are. Two consequences worth
+# knowing: (a) do not rely on the environment as the fork-code gate — that is the
+# label's job; (b) the scheduled integ-sweeper (.github/workflows/integ-sweeper.yml)
+# also runs under `environment: integ`, and a cron trigger CANNOT satisfy a manual
+# approval, so adding required reviewers here would silently break the sweeper.
+# If reviewers are ever wanted for PR runs, give the sweeper its own environment.
 #
 # Local dev path is unchanged: run `mise //cdk:integ` with your own AWS creds.
 #
@@ -19,9 +29,10 @@ name: integ
 on:
   # zizmor: ignore[dangerous-triggers] — intentional; workflow_run is required so
   # fork PRs can run against the shared account (a fork `pull_request` job gets no
-  # secrets/OIDC). Mitigations: build-success guard, path-filter, `integ`
-  # environment approval gate (admin reviews fork test code before it runs with
-  # the privileged role), least-privilege role, status-only tokens per job.
+  # secrets/OIDC). Mitigations: build-success guard, path-filter, the
+  # `safe-to-test` label gate (the enforced fork-code review gate), the `integ`
+  # environment (an optional second approval layer IF reviewers are configured —
+  # currently none), least-privilege role, status-only tokens per job.
   workflow_run:
     workflows: [build]
     types: [completed]
@@ -41,8 +52,9 @@ jobs:
   # docs/cli-only PRs get an immediate green (skipped) status and never deadlock
   # the required check.
   resolve:
-    # Manual dispatch is restricted to main (defence in depth — the `integ`
-    # environment approval is the primary gate). PR runs come via workflow_run.
+    # Manual dispatch is restricted to main (defence in depth). For fork PRs the
+    # primary gate is the `safe-to-test` label check below. PR runs come via
+    # workflow_run.
     if: >-
       (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') ||
       (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
@@ -129,10 +141,12 @@ jobs:
             exit 0
           fi
 
-          # Fork-PR safety: only run fork-authored code after a maintainer has
-          # applied the `safe-to-test` label (defence in depth on top of the
-          # `integ` environment approval). If it's absent, leave the status
-          # pending and don't run — re-trigger once the label is added.
+          # Fork-PR safety: this is the ENFORCED gate on fork-authored code — only
+          # run it after a maintainer has applied the `safe-to-test` label. (The
+          # `integ` environment can add a second approval layer, but only if
+          # required reviewers are configured on it — currently none, so this label
+          # is the effective gate.) If absent, leave the status pending and don't
+          # run — re-trigger once the label is added.
           if [[ "$WF_HEAD_REPO" != "$REPO" ]]; then
             if ! LABELS=$(gh api "repos/$REPO/issues/$PR_NUMBER/labels" --jq '.[].name'); then
               echo "::error::Failed to read labels for PR #$PR_NUMBER."
@@ -157,7 +171,7 @@ jobs:
             exit 1
           fi
           if echo "$CHANGED" | grep -Eq '^(cdk|agent)/'; then
-            post_status pending "awaiting admin approval / running"
+            post_status pending "awaiting integ run"
             echo "applicable=true" >> "$GITHUB_OUTPUT"
             echo "PR #$PR_NUMBER touches cdk/** or agent/** — integ applies."
           else
@@ -166,9 +180,11 @@ jobs:
             echo "PR #$PR_NUMBER has no cdk/** or agent/** changes — integ skipped (green)."
           fi
 
-  # The admin-gated deploy -> assert -> destroy. The `integ` environment's
-  # required reviewer is the approval gate; while it waits, the integ-smoke
-  # status stays pending and merge stays blocked.
+  # The deploy -> assert -> destroy job. It runs in the `integ` environment; if
+  # required reviewers are ever configured there, a pending approval holds this
+  # job (and the integ-smoke status stays pending / merge stays blocked) until
+  # approved. With no reviewers configured (current state), it proceeds directly
+  # once `resolve` marks it applicable and the `safe-to-test` gate has passed.
   integ:
     needs: resolve
     if: needs.resolve.outputs.applicable == 'true'
@@ -191,9 +207,11 @@ jobs:
       - name: Checkout PR head (incl. forks)
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
-          # Approving the `integ` environment authorizes this fork-authored test
-          # code to run with the privileged role — the approver MUST review
-          # cdk/test/integ/** changes before approving.
+          # This checks out fork-authored test code that then runs with the
+          # privileged role. The `safe-to-test` label (gated in `resolve`) is the
+          # enforced review point for cdk/test/integ/** changes; if required
+          # reviewers are configured on the `integ` environment they add a second
+          # manual review before this runs.
           repository: ${{ needs.resolve.outputs.head_repo }}
           ref: ${{ needs.resolve.outputs.head_sha }}
           persist-credentials: false
diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts
index e7d58d67..bee378d3 100644
--- a/cdk/test/integ/integ.task-lifecycle.ts
+++ b/cdk/test/integ/integ.task-lifecycle.ts
@@ -146,13 +146,27 @@ const githubTokenSecretArn = output('GitHubTokenSecretArn');
 //
 // Sourced from CI repo vars (INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID — the same
 // vars the integ.yml sandbox-cleanup step reads), so the gate scenarios bind to
-// whatever sandbox+secret the running account provisioned rather than one
-// contributor's. Fall back to the original literals for local runs in that
-// account. When unset in another account, scenarios 3 & 4 degrade to
-// clone-failures (the comment-config still synthesizes); set the vars to exercise
-// the Cedar gates.
-const SANDBOX_REPO = process.env.INTEG_SANDBOX_REPO || 'ayushtr-aws/abca-integ-sandbox';
-const PRESEEDED_PAT_SECRET = process.env.INTEG_PAT_SECRET_ID || 'bgagent/integ/github-pat';
+// whatever sandbox+secret the running account provisioned. There is deliberately
+// NO fallback literal: an account that hasn't provisioned a sandbox (e.g. upstream
+// aws-samples, or any fork) leaves both unset, and scenarios 3 & 4 SKIP with a
+// clear message (see the chain-assembly block at the bottom) rather than silently
+// routing the gate runs — which clone and push with a write-PAT — into one
+// contributor's personal repo. Set both vars to exercise the Cedar gates;
+// scenarios 1 & 2 always run regardless.
+const SANDBOX_REPO = process.env.INTEG_SANDBOX_REPO;
+const PRESEEDED_PAT_SECRET = process.env.INTEG_PAT_SECRET_ID;
+
+// Gate scenarios (3 & 4) require BOTH a sandbox repo and its pre-seeded PAT. When
+// either is unset, skip them (scenarios 1 & 2 still run). This keeps the test
+// account-agnostic: it never falls back to a hardcoded personal repo.
+const gatesEnabled = Boolean(SANDBOX_REPO && PRESEEDED_PAT_SECRET);
+if (!gatesEnabled) {
+  // eslint-disable-next-line no-console
+  console.warn(
+    '[integ.task-lifecycle] INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID not set — ' +
+      'skipping Cedar gate scenarios 3 & 4 (approve/deny). Set both to exercise the gates.',
+  );
+}
 
 const integ = new IntegTest(app, 'TaskLifecycle', {
   testCases: [stack],
@@ -222,29 +236,6 @@ const auth = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
 
 const idToken = auth.getAttString('AuthenticationResult.IdToken');
 
-// Re-mint a FRESH token right before the approve/deny POSTs. The Cognito app
-// client uses the default 60-min ID-token validity (task-api.ts sets no
-// idTokenValidity), but the strictly-serial .next() chain reaches the gate POSTs
-// only after ~32 min (approve) / ~48 min (deny) of polling budget PLUS real agent
-// cold-start + runtime — the live run took ~54 min. Reusing the original token
-// would risk a 401 (expired) → the decision never records → false timeout keyed
-// to agent latency. These re-auths run just before their POSTs in the chain, so
-// each token is minted minutes (not ~50 min) before use. The user/password are
-// permanent (adminSetUserPassword above), so re-auth needs no new setup.
-const reAuthApprove = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
-  AuthFlow: 'USER_PASSWORD_AUTH',
-  ClientId: appClientId,
-  AuthParameters: { USERNAME: username, PASSWORD: password },
-});
-const approveToken = reAuthApprove.getAttString('AuthenticationResult.IdToken');
-
-const reAuthDeny = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
-  AuthFlow: 'USER_PASSWORD_AUTH',
-  ClientId: appClientId,
-  AuthParameters: { USERNAME: username, PASSWORD: password },
-});
-const denyToken = reAuthDeny.getAttString('AuthenticationResult.IdToken');
-
 // Conservative polling windows. Agent runs are real LLM sessions over a freshly
 // cold-started AgentCore runtime; the first invocation pays the cold-start tax.
 const TERMINAL_POLL = { totalTimeout: Duration.minutes(12), interval: Duration.seconds(30) };
@@ -328,194 +319,234 @@ pollFail
   .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.FAILED } } }))
   .waitForAssertions(TERMINAL_POLL);
 
-// --- Token seeding (prerequisite for gate scenarios) --------------------------
-// Copy the pre-seeded PAT into the stack-created GitHubTokenSecret so the agent
-// runtime can clone SANDBOX_REPO and push a branch. This automates the documented
-// operator step (QUICK_START.md §4). No getAttString is read off seedPut, and the
-// SecretString token is consumed inline by seedPut, never asserted on.
-const seedGet = integ.assertions.awsApiCall('SecretsManager', 'getSecretValue', {
-  SecretId: PRESEEDED_PAT_SECRET,
-});
-
-const seedPut = integ.assertions.awsApiCall('SecretsManager', 'putSecretValue', {
-  SecretId: githubTokenSecretArn,
-  SecretString: seedGet.getAttString('SecretString'),
-});
-
-// Onboard SANDBOX_REPO so the gate submits pass the onboarding gate (otherwise
-// 422 REPO_NOT_ONBOARDED at submit, before the agent ever runs). A minimal active
-// row is enough — the agent reads the GitHub token from the platform-default
-// GitHubTokenSecret we seeded above, so the blueprint needs no per-repo token.
-const onboardSandbox = integ.assertions.awsApiCall('DynamoDB', 'putItem', {
-  TableName: repoTableName,
-  Item: {
-    repo: { S: SANDBOX_REPO },
-    status: { S: 'active' },
-    onboarded_at: { S: '2026-01-01T00:00:00.000Z' },
-    updated_at: { S: '2026-01-01T00:00:00.000Z' },
-  },
-});
-
-// --- Scenario 3: AWAITING_APPROVAL -> approve ---------------------------------
-// coding/new-task-v1 against the sandbox. The task asks the agent to write a
-// `config.env` file, which the Write tool routes through the write_env_files
-// soft-deny rule (agent/policies/soft_deny.cedar) -> the task parks at
-// AWAITING_APPROVAL with a PENDING approval row. We approve it, then assert the
-// row flips to APPROVED. (Post-approval the agent may COMPLETE or FAIL — both
-// terminal — so the deterministic assertion is the recorded decision, not a
-// specific terminal status.)
-const submitApprove = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
-  method: 'POST',
-  headers: {
-    'Content-Type': 'application/json',
-    'Authorization': idToken,
-  },
-  body: JSON.stringify({
-    workflow_ref: 'coding/new-task-v1',
-    repo: SANDBOX_REPO,
-    task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.',
-    max_turns: 6,
-    max_budget_usd: 0.5,
-  }),
-});
-const approveTaskId = submitApprove.getAttString('body.data.task_id');
-
-// Wait for the gate to open (interim AWAITING_APPROVAL).
-const pollGateApprove = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
-  TableName: taskTableName,
-  Key: { task_id: { S: approveTaskId } },
-});
-pollGateApprove
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
-  .waitForAssertions(GATE_POLL);
-
-// Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is
-// required because we do not know the agent-minted request_id. The status=PENDING
-// FilterExpression makes Items[0] deterministic: a task could trip the gate more
-// than once (or carry already-decided rows), and an unfiltered query orders only
-// by SK, so without the filter Items[0] could be the wrong/decided row and the
-// POST would target the wrong request_id. getAttString here flips this call to a
-// flattened response, so we do NOT .expect() on it.
-const queryApprove = integ.assertions.awsApiCall('DynamoDB', 'query', {
-  TableName: taskApprovalsTableName,
-  KeyConditionExpression: 'task_id = :tid',
-  FilterExpression: '#st = :pending',
-  ExpressionAttributeNames: { '#st': 'status' },
-  ExpressionAttributeValues: { ':tid': { S: approveTaskId }, ':pending': { S: 'PENDING' } },
-});
-const approveRequestId = queryApprove.getAttString('Items.0.request_id.S');
-
-const approve = integ.assertions.httpApiCall(`${apiUrl}tasks/${approveTaskId}/approve`, {
-  method: 'POST',
-  headers: {
-    'Content-Type': 'application/json',
-    // Fresh token (see reAuthApprove) — the original idToken may be expired by now.
-    'Authorization': approveToken,
-  },
-  body: JSON.stringify({ request_id: approveRequestId, decision: 'approve', scope: 'this_call' }),
-});
-
-// Assert the decision was recorded on the approval row. Now that request_id is
-// known we read the exact row by its full key.
-const pollApproveDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
-  TableName: taskApprovalsTableName,
-  Key: { task_id: { S: approveTaskId }, request_id: { S: approveRequestId } },
-});
-pollApproveDecision
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'APPROVED' } } }))
-  .waitForAssertions(GATE_POLL);
-
-// --- Scenario 4: AWAITING_APPROVAL -> deny ------------------------------------
-// Identical trigger to scenario 3; we deny instead and assert the row flips to
-// DENIED.
-const submitDeny = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
-  method: 'POST',
-  headers: {
-    'Content-Type': 'application/json',
-    'Authorization': idToken,
-  },
-  body: JSON.stringify({
-    workflow_ref: 'coding/new-task-v1',
-    repo: SANDBOX_REPO,
-    task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.',
-    max_turns: 6,
-    max_budget_usd: 0.5,
-  }),
-});
-const denyTaskId = submitDeny.getAttString('body.data.task_id');
-
-const pollGateDeny = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
-  TableName: taskTableName,
-  Key: { task_id: { S: denyTaskId } },
-});
-pollGateDeny
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
-  .waitForAssertions(GATE_POLL);
-
-const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', {
-  TableName: taskApprovalsTableName,
-  KeyConditionExpression: 'task_id = :tid',
-  FilterExpression: '#st = :pending',
-  ExpressionAttributeNames: { '#st': 'status' },
-  ExpressionAttributeValues: { ':tid': { S: denyTaskId }, ':pending': { S: 'PENDING' } },
-});
-const denyRequestId = queryDeny.getAttString('Items.0.request_id.S');
-
-const deny = integ.assertions.httpApiCall(`${apiUrl}tasks/${denyTaskId}/deny`, {
-  method: 'POST',
-  headers: {
-    'Content-Type': 'application/json',
-    // Fresh token (see reAuthDeny) — the original idToken may be expired by now.
-    'Authorization': denyToken,
-  },
-  body: JSON.stringify({ request_id: denyRequestId, decision: 'deny', reason: 'integ: exercising the deny path' }),
-});
-
-const pollDenyDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
-  TableName: taskApprovalsTableName,
-  Key: { task_id: { S: denyTaskId }, request_id: { S: denyRequestId } },
-});
-pollDenyDecision
-  .expect(ExpectedResult.objectLike({ Item: { status: { S: 'DENIED' } } }))
-  .waitForAssertions(GATE_POLL);
-
-// --- Execution order ----------------------------------------------------------
+// --- Execution order (scenarios 1 & 2) ----------------------------------------
 // Auth first, then SEED THE GITHUB TOKEN BEFORE ANY SUBMIT. This ordering is
 // load-bearing: the orchestrator's resolveGitHubToken caches the secret value
 // for 5 min keyed by ARN (context-hydration.ts). Any coding-workflow task that
 // runs GitHub preflight reads + caches the token. Scenario 2 (coding/new-task-v1)
-// runs preflight too — so if it ran BEFORE seedPut, it would cache the stack's
+// runs preflight too — so if it ran BEFORE the seed, it would cache the stack's
 // INITIAL EMPTY secret and every later gate task would reuse that empty token →
 // preflight 401 GITHUB_UNREACHABLE → FAILED before ever reaching the gate
 // (observed live). Seeding right after auth means the secret is populated before
 // the first token read, so no empty value is ever cached. This is exactly the
 // documented operator flow (QUICK_START §4: populate the secret before submitting
-// tasks) — no agent.ts change.
+// tasks) — no agent.ts change. The seed only happens when the gates are enabled
+// (it is sourced from the pre-seeded PAT secret); scenario 2 targets a
+// nonexistent repo and fails at clone regardless of token, so it is unaffected.
 //
 // Onboarding: scenario 2's repo and the sandbox both need a RepoTable row before
 // submit (else 422 REPO_NOT_ONBOARDED), so both onboard steps precede their
 // submits. Gate approve/deny run sequentially since each POST needs the
 // request_id read from the parked task's approval row.
-createUser
+let chain = createUser
   .next(setPassword)
   .next(auth)
-  .next(seedGet)
-  .next(seedPut)
   .next(onboardFailRepo)
-  .next(onboardSandbox)
   .next(submitComplete)
   .next(submitFail)
   .next(pollComplete)
-  .next(pollFail)
-  .next(submitApprove)
-  .next(submitDeny)
-  .next(pollGateApprove)
-  .next(queryApprove)
-  .next(reAuthApprove)
-  .next(approve)
-  .next(pollApproveDecision)
-  .next(pollGateDeny)
-  .next(queryDeny)
-  .next(reAuthDeny)
-  .next(deny)
-  .next(pollDenyDecision);
+  .next(pollFail);
+
+// --- Scenarios 3 & 4 (Cedar gates) — only when a sandbox is configured --------
+// Every assertion call below is CONSTRUCTED only inside this block, so when the
+// gates are disabled nothing is registered with the integ provider and the run
+// reduces cleanly to scenarios 1 & 2 (no skipped/failing gate steps, no PAT seed
+// into the stack secret, no clone of a personal repo).
+if (gatesEnabled) {
+  // Narrow the env-sourced config to non-null for this block.
+  const sandboxRepo = SANDBOX_REPO as string;
+  const patSecretId = PRESEEDED_PAT_SECRET as string;
+
+  // Re-mint a FRESH token right before each approve/deny POST. The Cognito app
+  // client uses the default 60-min ID-token validity (task-api.ts sets no
+  // idTokenValidity), but the strictly-serial .next() chain reaches the gate POSTs
+  // only after ~32 min (approve) / ~48 min (deny) of polling budget PLUS real agent
+  // cold-start + runtime — the live run took ~54 min. Reusing the original token
+  // would risk a 401 (expired) → the decision never records → false timeout keyed
+  // to agent latency. These re-auths run just before their POSTs in the chain, so
+  // each token is minted minutes (not ~50 min) before use. The user/password are
+  // permanent (adminSetUserPassword above), so re-auth needs no new setup.
+  const reAuthApprove = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
+    AuthFlow: 'USER_PASSWORD_AUTH',
+    ClientId: appClientId,
+    AuthParameters: { USERNAME: username, PASSWORD: password },
+  });
+  const approveToken = reAuthApprove.getAttString('AuthenticationResult.IdToken');
+
+  const reAuthDeny = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', {
+    AuthFlow: 'USER_PASSWORD_AUTH',
+    ClientId: appClientId,
+    AuthParameters: { USERNAME: username, PASSWORD: password },
+  });
+  const denyToken = reAuthDeny.getAttString('AuthenticationResult.IdToken');
+
+  // --- Token seeding (prerequisite for gate scenarios) ------------------------
+  // Copy the pre-seeded PAT into the stack-created GitHubTokenSecret so the agent
+  // runtime can clone the sandbox and push a branch. This automates the documented
+  // operator step (QUICK_START.md §4). No getAttString is read off seedPut, and the
+  // SecretString token is consumed inline by seedPut, never asserted on.
+  const seedGet = integ.assertions.awsApiCall('SecretsManager', 'getSecretValue', {
+    SecretId: patSecretId,
+  });
+
+  const seedPut = integ.assertions.awsApiCall('SecretsManager', 'putSecretValue', {
+    SecretId: githubTokenSecretArn,
+    SecretString: seedGet.getAttString('SecretString'),
+  });
+
+  // Onboard the sandbox so the gate submits pass the onboarding gate (otherwise
+  // 422 REPO_NOT_ONBOARDED at submit, before the agent ever runs). A minimal active
+  // row is enough — the agent reads the GitHub token from the platform-default
+  // GitHubTokenSecret we seeded above, so the blueprint needs no per-repo token.
+  const onboardSandbox = integ.assertions.awsApiCall('DynamoDB', 'putItem', {
+    TableName: repoTableName,
+    Item: {
+      repo: { S: sandboxRepo },
+      status: { S: 'active' },
+      onboarded_at: { S: '2026-01-01T00:00:00.000Z' },
+      updated_at: { S: '2026-01-01T00:00:00.000Z' },
+    },
+  });
+
+  // --- Scenario 3: AWAITING_APPROVAL -> approve -------------------------------
+  // coding/new-task-v1 against the sandbox. The task asks the agent to write a
+  // `config.env` file, which the Write tool routes through the write_env_files
+  // soft-deny rule (agent/policies/soft_deny.cedar) -> the task parks at
+  // AWAITING_APPROVAL with a PENDING approval row. We approve it, then assert the
+  // row flips to APPROVED. (Post-approval the agent may COMPLETE or FAIL — both
+  // terminal — so the deterministic assertion is the recorded decision, not a
+  // specific terminal status.)
+  const submitApprove = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': idToken,
+    },
+    body: JSON.stringify({
+      workflow_ref: 'coding/new-task-v1',
+      repo: sandboxRepo,
+      task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.',
+      max_turns: 6,
+      max_budget_usd: 0.5,
+    }),
+  });
+  const approveTaskId = submitApprove.getAttString('body.data.task_id');
+
+  // Wait for the gate to open (interim AWAITING_APPROVAL).
+  const pollGateApprove = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+    TableName: taskTableName,
+    Key: { task_id: { S: approveTaskId } },
+  });
+  pollGateApprove
+    .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
+    .waitForAssertions(GATE_POLL);
+
+  // Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is
+  // required because we do not know the agent-minted request_id. The status=PENDING
+  // FilterExpression makes Items[0] deterministic: a task could trip the gate more
+  // than once (or carry already-decided rows), and an unfiltered query orders only
+  // by SK, so without the filter Items[0] could be the wrong/decided row and the
+  // POST would target the wrong request_id. getAttString here flips this call to a
+  // flattened response, so we do NOT .expect() on it.
+  const queryApprove = integ.assertions.awsApiCall('DynamoDB', 'query', {
+    TableName: taskApprovalsTableName,
+    KeyConditionExpression: 'task_id = :tid',
+    FilterExpression: '#st = :pending',
+    ExpressionAttributeNames: { '#st': 'status' },
+    ExpressionAttributeValues: { ':tid': { S: approveTaskId }, ':pending': { S: 'PENDING' } },
+  });
+  const approveRequestId = queryApprove.getAttString('Items.0.request_id.S');
+
+  const approve = integ.assertions.httpApiCall(`${apiUrl}tasks/${approveTaskId}/approve`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      // Fresh token (see reAuthApprove) — the original idToken may be expired by now.
+      'Authorization': approveToken,
+    },
+    body: JSON.stringify({ request_id: approveRequestId, decision: 'approve', scope: 'this_call' }),
+  });
+
+  // Assert the decision was recorded on the approval row. Now that request_id is
+  // known we read the exact row by its full key.
+  const pollApproveDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+    TableName: taskApprovalsTableName,
+    Key: { task_id: { S: approveTaskId }, request_id: { S: approveRequestId } },
+  });
+  pollApproveDecision
+    .expect(ExpectedResult.objectLike({ Item: { status: { S: 'APPROVED' } } }))
+    .waitForAssertions(GATE_POLL);
+
+  // --- Scenario 4: AWAITING_APPROVAL -> deny ----------------------------------
+  // Identical trigger to scenario 3; we deny instead and assert the row flips to
+  // DENIED.
+  const submitDeny = integ.assertions.httpApiCall(`${apiUrl}tasks`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': idToken,
+    },
+    body: JSON.stringify({
+      workflow_ref: 'coding/new-task-v1',
+      repo: sandboxRepo,
+      task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.',
+      max_turns: 6,
+      max_budget_usd: 0.5,
+    }),
+  });
+  const denyTaskId = submitDeny.getAttString('body.data.task_id');
+
+  const pollGateDeny = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+    TableName: taskTableName,
+    Key: { task_id: { S: denyTaskId } },
+  });
+  pollGateDeny
+    .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } }))
+    .waitForAssertions(GATE_POLL);
+
+  const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', {
+    TableName: taskApprovalsTableName,
+    KeyConditionExpression: 'task_id = :tid',
+    FilterExpression: '#st = :pending',
+    ExpressionAttributeNames: { '#st': 'status' },
+    ExpressionAttributeValues: { ':tid': { S: denyTaskId }, ':pending': { S: 'PENDING' } },
+  });
+  const denyRequestId = queryDeny.getAttString('Items.0.request_id.S');
+
+  const deny = integ.assertions.httpApiCall(`${apiUrl}tasks/${denyTaskId}/deny`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      // Fresh token (see reAuthDeny) — the original idToken may be expired by now.
+      'Authorization': denyToken,
+    },
+    body: JSON.stringify({ request_id: denyRequestId, decision: 'deny', reason: 'integ: exercising the deny path' }),
+  });
+
+  const pollDenyDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', {
+    TableName: taskApprovalsTableName,
+    Key: { task_id: { S: denyTaskId }, request_id: { S: denyRequestId } },
+  });
+  pollDenyDecision
+    .expect(ExpectedResult.objectLike({ Item: { status: { S: 'DENIED' } } }))
+    .waitForAssertions(GATE_POLL);
+
+  // Hoist the seed to right after auth so scenario 2's preflight cannot cache the
+  // empty secret. next() only adds a dependency edge, so this reorders the steps.
+  auth.next(seedGet).next(seedPut).next(onboardFailRepo);
+  // Splice the remaining gate steps after pollFail; approve/deny run sequentially.
+  chain = chain
+    .next(onboardSandbox)
+    .next(submitApprove)
+    .next(submitDeny)
+    .next(pollGateApprove)
+    .next(queryApprove)
+    .next(reAuthApprove)
+    .next(approve)
+    .next(pollApproveDecision)
+    .next(pollGateDeny)
+    .next(queryDeny)
+    .next(reAuthDeny)
+    .next(deny)
+    .next(pollDenyDecision);
+}