diff --git a/.dockerignore b/.dockerignore index 9402c7e7..8e9cc5ab 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,6 +9,14 @@ cdk/cdk.out/ cdk/lib/ cdk/node_modules/ +# integ-runner output dirs. The agent artifact's build context is the repo +# root, and integ-runner writes its synth/snapshot output UNDER that root +# (cdk/test/integ/cdk-integ.out.<test>.ts[.snapshot]/). Without these excludes, +# staging the root copies its own output dir into itself recursively until the +# path overflows (ENAMETOOLONG). Mirrors .gitignore lines 70-71. +cdk/test/integ/cdk-integ.out.*/ +cdk/test/integ/*.snapshot/ + # CLI and docs build artifacts cli/lib/ cli/node_modules/ diff --git a/.github/workflows/integ-sweeper.yml b/.github/workflows/integ-sweeper.yml new file mode 100644 index 00000000..30f80d5f --- /dev/null +++ b/.github/workflows/integ-sweeper.yml @@ -0,0 +1,204 @@ +name: integ-sweeper +# Reclaims stranded ephemeral integ stacks (issue #317 / PR #348 follow-up). +# +# The Phase-1 lifecycle integ test (integ.yml + cdk/test/integ/integ.task-lifecycle.ts) +# deploys a per-run `int-<hash>` stack running the AgentCore Runtime in VPC +# mode. That runtime injects AWS-service-managed `agentic_ai` ENIs into the private +# subnets, which AWS releases only ASYNCHRONOUSLY (observed: 1+ hours after the +# runtime is deleted). So the in-run `cdk destroy` reliably fails the subnet/SG/VPC +# deletes (DependencyViolation) and the integ run tolerates that failure +# (destroy.expectError) rather than blocking on a wait it can't win. The per-run +# UNIQUE stack name means a stranded stack never blocks a later run — but nothing +# in the run reclaims it either. 
+#
+# THIS workflow is that reclaimer: on a schedule (after the ENIs have had time to
+# detach), it deletes every `int-*` stack, and FAILS LOUDLY + opens a tracking
+# issue for any `int-*` stack older than the alarm threshold that still won't
+# delete — so a genuine leak (cost in the shared account) surfaces instead of
+# accumulating silently.
+on:
+  workflow_dispatch: {}
+  schedule:
+    # Every 2 hours. Frequent enough that a normal stranded stack (ENIs release in
+    # ~1-2h) is reclaimed within a cycle or two, well before the 6h alarm age.
+    - cron: "0 */2 * * *"
+
+concurrency:
+  group: integ-sweeper
+  cancel-in-progress: false
+
+permissions:
+  contents: none
+
+jobs:
+  sweep:
+    name: Reclaim stranded int-* stacks
+    runs-on: ubuntu-latest
+    # The integ deploy role (secrets.AWS_ROLE_TO_ASSUME) is scoped to the `integ`
+    # environment — same as integ.yml. The environment's protection rules must
+    # permit this scheduled run to assume the role (no manual approval is possible
+    # on a cron trigger).
+    environment: integ
+    timeout-minutes: 30
+    permissions:
+      id-token: write  # OIDC role assumption
+      contents: read
+      issues: write  # open a tracking issue on a genuine leak
+    env:
+      # Stacks older than this (hours) that STILL fail to delete are treated as a
+      # genuine leak → fail the job + file an issue. Comfortably past the observed
+      # ENI-release window so normal teardown lag never false-alarms.
+      ALARM_AGE_HOURS: "6"
+      AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+      AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
+          aws-region: ${{ vars.AWS_REGION || 'us-east-1' }}
+
+      - name: Sweep int-* stacks
+        id: sweep
+        run: |
+          set -uo pipefail
+
+          # Only the integ test's own per-run stacks are eligible. The test names
+          # them `int-<hash>` where the hash is the 8-char short SHA
+          # (integ.task-lifecycle.ts: COMMIT_HASH.slice(0,8)). We therefore sweep
+          # ONLY names matching `int-<8 lowercase hex>` — NOT a bare `int-*` glob.
+          # `int-` is a short prefix; an unguarded glob in a shared account could
+          # delete an unrelated stack that merely starts with those 4 chars. The
+          # `int-local` fallback name (local dev runs) is intentionally NOT swept:
+          # CI never produces it, so a match would be someone's local stack.
+          STACK_RE='^int-[0-9a-f]{8}$'
+
+          # All non-deleted int-* stacks (active, DELETE_FAILED, or rollback states);
+          # the JMESPath prefilter narrows the API page, the regex below is the
+          # authoritative guard. A failed listing ABORTS the sweep: it must never be
+          # mistaken for "no stacks" (that would exit 0 and silently hide leaks).
+          listing="$(aws cloudformation list-stacks \
+            --stack-status-filter CREATE_COMPLETE CREATE_FAILED ROLLBACK_COMPLETE ROLLBACK_FAILED \
+            UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE UPDATE_ROLLBACK_FAILED DELETE_FAILED \
+            --query 'StackSummaries[?starts_with(StackName, `int-`)].StackName' \
+            --output text)" || { echo "::error::list-stacks failed — cannot determine stranded int-* stacks; aborting sweep."; exit 1; }
+          mapfile -t candidates < <(printf '%s\n' "$listing" | tr '\t' '\n' | sort -u)
+
+          stacks=()
+          for c in "${candidates[@]}"; do
+            [ -n "$c" ] || continue
+            if [[ "$c" =~ $STACK_RE ]]; then
+              stacks+=("$c")
+            else
+              echo "Skipping '$c' — does not match ${STACK_RE} (not a sweepable integ stack)."
+            fi
+          done
+
+          if [ "${#stacks[@]}" -eq 0 ]; then
+            echo "No int-* stacks present. Nothing to sweep."
+            exit 0
+          fi
+
+          echo "Found ${#stacks[@]} int-* stack(s): ${stacks[*]}"
+          now_epoch="$(date -u +%s)"
+          alarm_secs=$(( ALARM_AGE_HOURS * 3600 ))
+          leaked=""
+
+          for stack in "${stacks[@]}"; do
+            [ -n "$stack" ] || continue
+            echo "::group::$stack"
+
+            # Best-effort delete (idempotent; no-op if already deleting/gone).
+            aws cloudformation delete-stack --stack-name "$stack" || true
+            # Give CloudFormation a moment, then read the resulting status.
+            sleep 15
+            status="$(aws cloudformation describe-stacks --stack-name "$stack" \
+              --query 'Stacks[0].StackStatus' --output text 2>&1 || true)"
+
+            if echo "$status" | grep -qiE 'does not exist|ValidationError'; then
+              echo "✅ $stack deleted (or gone)."
+              echo "::endgroup::"
+              continue
+            fi
+
+            # Still present — how old is it? Alarm only if past the threshold.
+            created="$(aws cloudformation describe-stacks --stack-name "$stack" \
+              --query 'Stacks[0].CreationTime' --output text 2>/dev/null || true)"
+            created_epoch="$(date -u -d "$created" +%s 2>/dev/null || echo 0)"
+            age_secs=$(( now_epoch - created_epoch ))
+            age_hours=$(( age_secs / 3600 ))
+
+            if [ "$created_epoch" -gt 0 ] && [ "$age_secs" -ge "$alarm_secs" ]; then
+              echo "❌ $stack still present (status: $status), age ${age_hours}h ≥ ${ALARM_AGE_HOURS}h — LEAK."
+              leaked="${leaked}\n- \`${stack}\` — status \`${status}\`, age ~${age_hours}h"
+            else
+              echo "⏳ $stack still present (status: $status), age ~${age_hours}h — within ${ALARM_AGE_HOURS}h window; ENIs likely not yet released. Will retry next cycle."
+            fi
+            echo "::endgroup::"
+          done
+
+          if [ -n "$leaked" ]; then
+            {
+              echo "leaked<<EOF"
+              echo -e "$leaked"
+              echo "EOF"
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Open issue on genuine leak
+        if: steps.sweep.outputs.leaked != ''
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass via env (not inline ${{ }} interpolation) so the value never
+          # expands into the shell script body — avoids template injection
+          # (zizmor template-injection). Stack names are AWS-controlled, but env
+          # is the correct, lint-clean pattern regardless.
+          LEAKED: ${{ steps.sweep.outputs.leaked }}
+          # Stable label used both to tag the tracking issue and to find an
+          # existing open one — this is the dedup key, so it must not change.
+          LEAK_LABEL: integ-leak
+        run: |
+          set -euo pipefail
+          body_file="$(mktemp)"
+          {
+            echo "The integ-sweeper found stranded \`int-*\` CloudFormation stacks older than ${ALARM_AGE_HOURS}h that still fail to delete — likely a real leak in the shared integ account (each carries a VPC + NAT gateway + interface endpoints + the AgentCore runtime, billing hourly)."
+            echo ""
+            echo "These are normally reclaimed automatically once the AgentCore \`agentic_ai\` ENIs detach (~1-2h). Past ${ALARM_AGE_HOURS}h, investigate: the ENIs may be genuinely stuck (needs manual ENI/VPC cleanup) or the deploy role lacks teardown permissions."
+            echo ""
+            echo "### Stranded stacks (as of this run)"
+            echo -e "${LEAKED}"
+            echo ""
+            echo "| Field | Value |"
+            echo "| --- | --- |"
+            echo "| Workflow run | [integ-sweeper #${GITHUB_RUN_NUMBER}](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}) |"
+            echo "| Region | \`${AWS_REGION}\` |"
+            echo ""
+            echo "Close this issue once the stacks are deleted and the sweeper run is green."
+          } > "${body_file}"
+
+          # Dedup: a stuck stack re-alarms every 2h cycle. Without this guard each
+          # cycle files a fresh duplicate. Find an existing OPEN issue carrying the
+          # stable leak label and comment on it instead of opening another; only
+          # open a new issue when none exists. `--state open --label` selects open
+          # issues with the label; `--json number --jq '.[0].number'` yields the first
+          # match (or empty). Ensure the label exists first (idempotent; ignore "already exists").
+          gh label create "${LEAK_LABEL}" \
+            --description "Stranded integ stacks flagged by integ-sweeper" \
+            --color B60205 2>/dev/null || true
+
+          existing="$(gh issue list --state open --label "${LEAK_LABEL}" \
+            --json number --jq '.[0].number // empty' 2>/dev/null || true)"
+
+          if [ -n "${existing}" ]; then
+            echo "Existing open leak issue #${existing} — commenting instead of opening a duplicate."
+            gh issue comment "${existing}" --body-file "${body_file}"
+          else
+            gh issue create \
+              --title "Stranded integ stacks not reclaimed (>${ALARM_AGE_HOURS}h)" \
+              --label "${LEAK_LABEL}" \
+              --body-file "${body_file}"
+          fi
+
+      - name: Fail job on genuine leak
+        if: steps.sweep.outputs.leaked != ''
+        run: exit 1
diff --git a/.github/workflows/integ.yml b/.github/workflows/integ.yml
index 4ca180b7..5819c239 100644
--- a/.github/workflows/integ.yml
+++ b/.github/workflows/integ.yml
@@ -7,10 +7,20 @@ name: integ
 #
 # Trigger model mirrors deploy.yml: build.yml completes -> workflow_run picks it
 # up in the trusted base-repo context (secrets/OIDC available even for fork PRs)
-# -> we resolve whether the PR touches cdk/** or agent/** -> an admin approves
-# the `integ` environment gate -> deploy/assert/destroy runs against the shared
-# account -> a commit status `integ-smoke` is posted back to the PR head so it
-# shows up as a (required) check that blocks merge.
+# -> we resolve whether the PR touches cdk/** or agent/** -> deploy/assert/destroy
+# runs against the shared account -> a commit status `integ-smoke` is posted back
+# to the PR head so it shows up as a (required) check that blocks merge.
+#
+# Fork-code gate: the ENFORCED gate on fork-authored test code is the
+# `safe-to-test` label check in the `resolve` job below — a maintainer must apply
+# it before this workflow will run a fork PR. The `integ` GitHub environment is a
+# SECOND, OPTIONAL layer: it only adds a manual approval if required reviewers are
+# configured on it, and at time of writing NONE are. Two consequences worth
+# knowing: (a) do not rely on the environment as the fork-code gate — that is the
+# label's job; (b) the scheduled integ-sweeper (.github/workflows/integ-sweeper.yml)
+# also runs under `environment: integ`, and a cron trigger CANNOT satisfy a manual
+# approval, so adding required reviewers here would silently break the sweeper.
+# If reviewers are ever wanted for PR runs, give the sweeper its own environment.
 #
 # Local dev path is unchanged: run `mise //cdk:integ` with your own AWS creds.
 #
@@ -19,9 +29,10 @@ name: integ
 on:
   # zizmor: ignore[dangerous-triggers] — intentional; workflow_run is required so
   # fork PRs can run against the shared account (a fork `pull_request` job gets no
-  # secrets/OIDC). Mitigations: build-success guard, path-filter, `integ`
-  # environment approval gate (admin reviews fork test code before it runs with
-  # the privileged role), least-privilege role, status-only tokens per job.
+  # secrets/OIDC). Mitigations: build-success guard, path-filter, the
+  # `safe-to-test` label gate (the enforced fork-code review gate), the `integ`
+  # environment (an optional second approval layer IF reviewers are configured —
+  # currently none), least-privilege role, status-only tokens per job.
   workflow_run:
     workflows: [build]
     types: [completed]
@@ -41,8 +52,9 @@ jobs:
   # docs/cli-only PRs get an immediate green (skipped) status and never deadlock
   # the required check.
   resolve:
-    # Manual dispatch is restricted to main (defence in depth — the `integ`
-    # environment approval is the primary gate). PR runs come via workflow_run.
+    # Manual dispatch is restricted to main (defence in depth). For fork PRs the
+    # primary gate is the `safe-to-test` label check below. PR runs come via
+    # workflow_run.
     if: >-
       (github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main') ||
       (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
@@ -129,10 +141,12 @@ jobs:
             exit 0
           fi
 
-          # Fork-PR safety: only run fork-authored code after a maintainer has
-          # applied the `safe-to-test` label (defence in depth on top of the
-          # `integ` environment approval). If it's absent, leave the status
-          # pending and don't run — re-trigger once the label is added.
+          # Fork-PR safety: this is the ENFORCED gate on fork-authored code — only
+          # run it after a maintainer has applied the `safe-to-test` label. (The
+          # `integ` environment can add a second approval layer, but only if
+          # required reviewers are configured on it — currently none, so this label
+          # is the effective gate.) If absent, leave the status pending and don't
+          # run — re-trigger once the label is added.
           if [[ "$WF_HEAD_REPO" != "$REPO" ]]; then
             if ! LABELS=$(gh api "repos/$REPO/issues/$PR_NUMBER/labels" --jq '.[].name'); then
               echo "::error::Failed to read labels for PR #$PR_NUMBER."
@@ -157,7 +171,7 @@ jobs:
             exit 1
           fi
           if echo "$CHANGED" | grep -Eq '^(cdk|agent)/'; then
-            post_status pending "awaiting admin approval / running"
+            post_status pending "awaiting integ run"
             echo "applicable=true" >> "$GITHUB_OUTPUT"
             echo "PR #$PR_NUMBER touches cdk/** or agent/** — integ applies."
           else
@@ -166,16 +180,23 @@ jobs:
             echo "PR #$PR_NUMBER has no cdk/** or agent/** changes — integ skipped (green)."
           fi
 
-  # The admin-gated deploy -> assert -> destroy. The `integ` environment's
-  # required reviewer is the approval gate; while it waits, the integ-smoke
-  # status stays pending and merge stays blocked.
+  # The deploy -> assert -> destroy job. It runs in the `integ` environment; if
+  # required reviewers are ever configured there, a pending approval holds this
+  # job (and the integ-smoke status stays pending / merge stays blocked) until
+  # approved. With no reviewers configured (current state), it proceeds directly
+  # once `resolve` marks it applicable and the `safe-to-test` gate has passed.
   integ:
     needs: resolve
     if: needs.resolve.outputs.applicable == 'true'
     name: CDK integ smoke (Task API)
     runs-on: ubuntu-latest
     environment: integ
-    timeout-minutes: 45
+    # The lifecycle test (integ.task-lifecycle.ts) deploys the full AgentStack
+    # (orchestrator + AgentCore runtime + Docker image build) and drives real
+    # agent runs through their terminal states before destroying — far heavier
+    # than the Phase-0 trimmed smoke test. 90 min covers deploy + cold Docker
+    # build + agent runs + teardown with margin.
+    timeout-minutes: 90
     permissions:
       id-token: write
       contents: read
@@ -186,9 +207,11 @@ jobs:
       - name: Checkout PR head (incl. forks)
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
-          # Approving the `integ` environment authorizes this fork-authored test
-          # code to run with the privileged role — the approver MUST review
-          # cdk/test/integ/** changes before approving.
+          # This checks out fork-authored test code that then runs with the
+          # privileged role. The `safe-to-test` label (gated in `resolve`) is the
+          # enforced review point for cdk/test/integ/** changes; if required
+          # reviewers are configured on the `integ` environment they add a second
+          # manual review before this runs.
           repository: ${{ needs.resolve.outputs.head_repo }}
           ref: ${{ needs.resolve.outputs.head_sha }}
           persist-credentials: false
@@ -215,29 +238,100 @@ jobs:
         run: yarn install --immutable
 
       - name: Run integ tests (deploy → assert → destroy)
+        # COMMIT_HASH drives the per-run unique stack name `int-<hash>` (see
+        # cdk/test/integ/integ.task-lifecycle.ts + cdk/mise.toml). Using the
+        # resolved head SHA means a stranded stack from a failed teardown never
+        # collides with / blocks a later run on a different commit.
+        #
+        # INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID bind the gate scenarios (3 & 4)
+        # to the account's provisioned sandbox repo + PAT secret instead of a
+        # hardcoded contributor repo. Same vars the sandbox-cleanup step reads.
+        # When either is unset, the test SKIPS scenarios 3 & 4 (there is no
+        # fallback literal); scenarios 1 & 2 still run.
+        env:
+          COMMIT_HASH: ${{ needs.resolve.outputs.head_sha }}
+          INTEG_SANDBOX_REPO: ${{ vars.INTEG_SANDBOX_REPO }}
+          INTEG_PAT_SECRET_ID: ${{ vars.INTEG_PAT_SECRET_ID }}
         run: mise //cdk:integ
 
       # Safety net: integ-runner forces teardown on success and failure, but if
-      # the run is cancelled or crashes mid-deploy the stack can be stranded in
-      # the shared account. Delete it directly via CloudFormation so we never
+      # the run is cancelled or crashes mid-deploy a stack can be stranded in
+      # the shared account. Delete them directly via CloudFormation so we never
       # leak billable resources.
       #
-      # NOTE: `cdk destroy backgroundagent-integ` would NOT work here — it
-      # synthesizes the main app (src/main.ts), which does not contain the integ
-      # stack, so it exits 0 having deleted nothing. Target the stack by its
-      # literal CloudFormation name instead. delete-stack is idempotent (no-op if
+      # NOTE: `cdk destroy <stack>` would NOT work here — it synthesizes the
+      # main app (src/main.ts), which does not contain the integ stacks, so it
+      # exits 0 having deleted nothing. Target each stack by its literal
+      # CloudFormation name instead. delete-stack is idempotent (no-op if
       # already gone), so `|| true` only guards transient API errors.
-      - name: Ensure stack torn down
+      #
+      # Best-effort delete-stack safety net for crash/cancel cases. integ-runner
+      # already runs its own destroy (and tolerates the expected ENI DELETE_FAILED
+      # via expectError); this only catches a run that died BEFORE integ-runner's
+      # own teardown (e.g. the job was cancelled mid-deploy).
+      #
+      # Stacks swept: backgroundagent-integ (Phase-0 smoke, fixed name) and the
+      # Phase-1 per-run stack `int-<hash>` (matches the name computed in
+      # cdk/test/integ/integ.task-lifecycle.ts from the same head SHA).
+      #
+      # IMPORTANT — this step does NOT retry-until-deleted and does NOT fail the
+      # job on a stranded stack. The AgentCore Runtime's service-managed
+      # `agentic_ai` ENIs are released asynchronously by AWS (minutes to hours),
+      # so an immediate delete reliably hits DELETE_FAILED on the subnets/SG/VPC.
+      # Because the stack name is now per-commit-UNIQUE, a stranded `int-<hash>`
+      # stack never blocks a future run, so we leave it for the out-of-band
+      # ephemeral sweeper (.github/workflows/integ-sweeper.yml) to reclaim once the
+      # ENIs detach — that sweeper FAILS LOUDLY + files an issue for any int-*
+      # stack still stuck past its alarm age, so leaks surface rather than
+      # accumulate. Here we just fire one delete to start the teardown and move on.
+      - name: Ensure stacks torn down (best effort)
         if: always()
         env:
           AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
           AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+          HEAD_SHA: ${{ needs.resolve.outputs.head_sha }}
+        run: |
+          set -uo pipefail
+          INT_STACK="int-$(printf '%s' "$HEAD_SHA" | cut -c1-8)"
+          for stack in backgroundagent-integ "$INT_STACK"; do
+            echo "Best-effort delete-stack: $stack"
+            aws cloudformation delete-stack --stack-name "$stack" || true
+          done
+          echo "Initiated teardown; stranded int-* stacks (if any) are reclaimed by the ephemeral sweeper once their ENIs detach."
+
+      # Sandbox cleanup for the gate scenarios (3 & 4): coding/new-task-v1 pushes
+      # a `bgagent/<task_id>/<slug>` branch and (on approve) opens a PR on the
+      # sandbox repo. The agent never closes these, so each run would accumulate
+      # stale branches/PRs. Reconstructing the exact branch name in the test is
+      # fragile (it depends on the agent-side slug), so we sweep by prefix here:
+      # delete every `bgagent/*` branch on the sandbox, which also closes the
+      # associated PRs. Reads the same PAT the agent used, from the pre-seeded
+      # secret. Gated on the repo vars being set so this is a no-op until the
+      # sandbox + secret are provisioned. Never fails the job — best-effort.
+      - name: Clean up sandbox PRs/branches
+        if: always() && vars.INTEG_SANDBOX_REPO != '' && vars.INTEG_PAT_SECRET_ID != ''
+        env:
+          AWS_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+          AWS_DEFAULT_REGION: ${{ vars.AWS_REGION || 'us-east-1' }}
+          SANDBOX_REPO: ${{ vars.INTEG_SANDBOX_REPO }}
+          PAT_SECRET_ID: ${{ vars.INTEG_PAT_SECRET_ID }}
         run: |
           set -euo pipefail
-          aws cloudformation delete-stack --stack-name backgroundagent-integ || true
-          # No `|| true` on the wait: a DELETE_FAILED must surface loudly so we
-          # never silently leak billable resources in the shared account.
-          aws cloudformation wait stack-delete-complete --stack-name backgroundagent-integ
+          # Best-effort contract: an unreadable/missing PAT secret must not fail
+          # the job (set -e would otherwise abort here) — warn and skip cleanup.
+          GH_TOKEN="$(aws secretsmanager get-secret-value \
+            --secret-id "$PAT_SECRET_ID" \
+            --query SecretString --output text)" || { echo "::warning::Could not read sandbox PAT secret; skipping sandbox cleanup."; exit 0; }
+          export GH_TOKEN
+          # List bgagent/* branch refs; delete each (deleting the branch closes
+          # any open PR from it). Best-effort: never fail the job on cleanup.
+          gh api "repos/${SANDBOX_REPO}/git/matching-refs/heads/bgagent/" \
+            --jq '.[].ref | sub("^refs/heads/"; "")' 2>/dev/null \
+            | while read -r branch; do
+              [ -n "$branch" ] || continue
+              echo "Deleting sandbox branch: $branch"
+              gh api -X DELETE "repos/${SANDBOX_REPO}/git/refs/heads/${branch}" || true
+            done || true
 
   # Post the final integ-smoke status back to the PR head so the check flips from
   # pending to success/failure. Skipped for workflow_dispatch (no PR to gate).
diff --git a/cdk/mise.toml b/cdk/mise.toml
index 60332012..990db7fb 100644
--- a/cdk/mise.toml
+++ b/cdk/mise.toml
@@ -63,10 +63,26 @@ description = "CDK deploy-then-verify integration tests (integ-runner). Needs AW
 depends = [":compile"]
 run = [
   "mkdir -p $TMPDIR",
+  # Per-run unique stack naming: the lifecycle test names its stack `int-<hash>`
+  # from the COMMIT_HASH env var (read directly via process.env in the test —
+  # integ-runner synths in a subprocess that inherits the env but not our shell's
+  # CDK context).
A stranded stack (the AgentCore ENI teardown race) then never + # blocks the next run. Source: COMMIT_HASH (set by CI from the resolved head + # SHA), falling back to the local git SHA, then "local" outside a checkout. + # # No --update-on-failed: .snapshot/ is gitignored, so there is no committed # snapshot to diff against or update. --force re-runs the deploy-then-verify # unconditionally, which is what we want in CI. - "npx integ-runner --language typescript --directory test/integ --force", + # + # --verbose: integ-runner otherwise prints only a one-line pass/fail per test, + # which hides WHICH assertion failed and its actual-vs-expected payload. The + # lifecycle test polls DynamoDB for terminal task status; without --verbose a + # failure (e.g. task stuck at SUBMITTED instead of COMPLETED) is undiagnosable + # from the log alone. Verbose surfaces the assertion diffs we need. + ''' + export COMMIT_HASH="${COMMIT_HASH:-$(git rev-parse HEAD 2>/dev/null || echo local)}" + npx integ-runner --language typescript --directory test/integ --force --verbose + ''', ] [tasks.bundle] diff --git a/cdk/test/integ/integ.task-lifecycle.ts b/cdk/test/integ/integ.task-lifecycle.ts new file mode 100644 index 00000000..bee378d3 --- /dev/null +++ b/cdk/test/integ/integ.task-lifecycle.ts @@ -0,0 +1,552 @@ +/** + * MIT No Attribution + * + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Phase-1 deploy-then-verify lifecycle test for issue #317. + * + * Where Phase 0 (integ.task-api-smoke.ts) deployed a TRIMMED stack and asserted + * a task merely persists at SUBMITTED, Phase 1 deploys the REAL, full AgentStack + * (orchestrator + AgentCore runtime/memory + agent container) and drives a live + * agent through its lifecycle, asserting the four terminal paths from the Cedar + * HITL E2E matrix (docs/design/CEDAR_HITL_GATES.md §15.3): + * + * 1. submit -> run -> COMPLETED (repo-less default/agent-v1) + * 2. submit -> run -> FAILED (coding/new-task-v1, bad repo) + * 3. submit -> run -> AWAITING_APPROVAL -> approve (write_env_files soft-deny gate) + * 4. submit -> run -> AWAITING_APPROVAL -> deny (write_env_files soft-deny gate) + * + * This is environment-agnostic: it deploys to whatever account/region the + * caller's AWS credentials resolve to (CI assumes the integ role; local runs use + * your own creds). It should run in a DEDICATED integ account with no + * backgroundagent-dev/main stack, so the AgentCore account-unique runtime/memory + * names don't collide. We deploy the committed AgentStack unchanged: it leaves + * runtimeName/memoryName UNSET and CDK auto-generates names scoped to the + * per-run stack name (int-<hash>, see below), guaranteeing uniqueness. + * (A local developer's uncommitted agent.ts name pin must be stashed before a + * local `mise //cdk:integ`, or it would collide.) + * + * Determinism: there is no mock/scripted agent mode — every scenario runs the + * real `claude` CLI against Bedrock. We bound cost and wall-clock with low + * max_turns and a max_budget_usd cap, and steer terminal states with simple, + * purpose-built task descriptions. 
+ */ + +import { randomBytes } from 'node:crypto'; +import { ExpectedResult, IntegTest } from '@aws-cdk/integ-tests-alpha'; +import { App, type CfnOutput, Duration } from 'aws-cdk-lib'; +import { TaskStatus } from '../../src/constructs/task-status'; +import { AgentStack } from '../../src/stacks/agent'; + +// NOTE on assertion shape: every terminal/gate check below runs inside +// `waitForAssertions` (a polling Step Functions waiter). Nested `Match.*` +// matchers (objectLike / stringLikeRegexp) CANNOT be used there — the assertion +// provider serializes the Match object's internals ({name, partial, pattern}) +// into the expected pattern, and the waiter then treats those as literal +// required keys that never exist on the row, so the assertion fails forever even +// when the data is correct (observed live: a COMPLETED task polled 25× and timed +// out). Polled assertions therefore use ONLY flat, exact scalar values (the +// `status`/decision string), which serialize cleanly. Asserting field PRESENCE +// (task_id/user_id/timestamps/approval metadata, #317) needs a non-polled +// getItem with assertAtPath — tracked as a follow-up on #317. + +const app = new App(); + +// Per-run UNIQUE stack name: `int-`. A fixed name is a trap for this +// stack — the AgentCore Runtime injects service-managed `agentic_ai` ENIs that AWS +// releases ASYNCHRONOUSLY, so `cdk destroy` reliably fails the subnet/SG/VPC +// deletes (DependencyViolation) and strands the stack. With a fixed name that +// stranded stack BLOCKS the next run (name conflict). A unique per-commit name +// means a failed teardown never blocks a later run, and the out-of-band ephemeral +// sweeper (.github/workflows/integ-sweeper.yml) reclaims `int-*` stacks once their +// ENIs detach, alarming if any stays stuck past its age threshold. +// +// The hash comes from the COMMIT_HASH env var (set by CI from the resolved head +// SHA; the mise //cdk:integ task falls back to the local git SHA). 
We read the +// ENV directly rather than CDK context: integ-runner synthesizes the test app in +// its own subprocess and does NOT forward CDK_CONTEXT_JSON / `-c` from our shell +// to that synth, but the subprocess DOES inherit the environment — so the env var +// reaches `process.env` here reliably where `tryGetContext` would not. Falls back +// to 'local' outside CI/git. (Date.now()/random are avoided — they'd break integ +// snapshot determinism; CI always supplies a real sha.) +const commitHash = (process.env.COMMIT_HASH ?? '').slice(0, 8) || 'local'; +const stackName = `int-${commitHash}`; + +// The real, full production stack. Environment-agnostic on purpose (same +// rationale as Phase 0): an explicit env would force the IntegTest DeployAssert +// stack — always environment-agnostic — into cross-region references it cannot +// resolve when reading this stack's outputs in the assertions below. +// +// DO NOT set runtimeName/memoryName here or pin them in agent.ts for this +// deploy: the committed defaults auto-generate stack-name-scoped unique names, +// so each `int-` stack gets its own non-colliding AgentCore names. +const stack = new AgentStack(app, stackName, { + description: 'ABCA Phase-1 integ lifecycle stack (full AgentStack: orchestrator + agent runtime)', +}); + +// AgentStack exposes its API URL, Cognito IDs, and table names only as +// CfnOutputs (its constructs are private consts). Read the output tokens by +// construct id rather than adding public accessors to the production stack. +// CfnOutput exposes a `value` getter that returns the underlying token. 
+const output = (id: string): string => (stack.node.findChild(id) as CfnOutput).value; + +const apiUrl = output('ApiUrl'); +const userPoolId = output('UserPoolId'); +const appClientId = output('AppClientId'); +const taskTableName = output('TaskTableName'); +const taskApprovalsTableName = output('TaskApprovalsTableName'); +// The submit path enforces an onboarding gate: a repo must have an active row in +// RepoTable or POST /tasks returns 422 REPO_NOT_ONBOARDED before clone/preflight. +// The gate scenarios onboard SANDBOX_REPO here (a putItem assertion) rather than +// adding a Blueprint construct to the production stack — test-side only. +const repoTableName = output('RepoTableName'); +// AgentStack creates its OWN empty GitHubTokenSecret (agent.ts:181, +// RemovalPolicy.DESTROY) — it does not reference an external one. The gate +// scenarios populate it post-deploy from the pre-seeded secret below, which is +// exactly the documented operator flow (docs/guides/QUICK_START.md §4: read the +// GitHubTokenSecretArn output, put-secret-value the PAT into it). Automating +// that copy here keeps us aligned with the design (no agent.ts change) and the +// throwaway secret tears down with the stack. +const githubTokenSecretArn = output('GitHubTokenSecretArn'); + +// --- Gate-scenario configuration (scenarios 3 & 4) ---------------------------- +// These two constants are the ONLY out-of-band wiring the gate scenarios need. +// They point at resources an operator provisions once in the integ account +// (whichever account the run deploys to); scenarios 1 & 2 do NOT depend on them +// and run regardless. +// +// SANDBOX_REPO — a throwaway GitHub repo (owner/name) with a committed +// baseline (README + default branch). coding/new-task-v1 +// clones it, the agent attempts a `config.env` write that +// trips the write_env_files soft-deny gate, and (on approve) +// pushes a `bgagent//` branch + opens a PR. The +// CI `always()` cleanup step deletes those branches each run. 
+// The PAT below must have Contents+PR WRITE on this repo (a +// read-only token clones fine but the agent's `git push` 403s). +// PRESEEDED_PAT_SECRET — name of a STABLE Secrets Manager secret in the integ +// account holding a fine-grained PAT scoped to SANDBOX_REPO. +// Resolved by NAME (not ARN) so it is account-agnostic; copied +// into the stack-created GitHubTokenSecret by the token-seeding +// assertion below. +// +// Sourced from CI repo vars (INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID — the same +// vars the integ.yml sandbox-cleanup step reads), so the gate scenarios bind to +// whatever sandbox+secret the running account provisioned. There is deliberately +// NO fallback literal: an account that hasn't provisioned a sandbox (e.g. upstream +// aws-samples, or any fork) leaves both unset, and scenarios 3 & 4 SKIP with a +// clear message (see the chain-assembly block at the bottom) rather than silently +// routing the gate runs — which clone and push with a write-PAT — into one +// contributor's personal repo. Set both vars to exercise the Cedar gates; +// scenarios 1 & 2 always run regardless. +const SANDBOX_REPO = process.env.INTEG_SANDBOX_REPO; +const PRESEEDED_PAT_SECRET = process.env.INTEG_PAT_SECRET_ID; + +// Gate scenarios (3 & 4) require BOTH a sandbox repo and its pre-seeded PAT. When +// either is unset, skip them (scenarios 1 & 2 still run). This keeps the test +// account-agnostic: it never falls back to a hardcoded personal repo. +// Boolean() also treats an empty string as unset, so a var that is present but +// blank disables the gates as well. +const gatesEnabled = Boolean(SANDBOX_REPO && PRESEEDED_PAT_SECRET); +if (!gatesEnabled) { + // eslint-disable-next-line no-console + console.warn( + '[integ.task-lifecycle] INTEG_SANDBOX_REPO / INTEG_PAT_SECRET_ID not set — ' + + 'skipping Cedar gate scenarios 3 & 4 (approve/deny). Set both to exercise the gates.', + ); +} + +const integ = new IntegTest(app, 'TaskLifecycle', { + testCases: [stack], + // Disable the two-phase update workflow.
By default integ-runner deploys the + // committed snapshot first, then re-deploys the current version to verify + // in-place updates don't break. The AgentCore Runtime takes several minutes to + // go CREATING -> READY and is partly immutable; the second deploy phase races + // the first (Runtime still CREATING) -> 409 "agent is currently being modified" + // -> integ-runner aborts mid-deploy and teardown strands a CREATING Runtime. + // We validate runtime BEHAVIOR, not stack-update safety, so a single clean + // deploy is correct here. + stackUpdateWorkflow: false, + // Force teardown on success and failure so a failed assertion never strands + // the (expensive) full stack in the shared E2E account. + // + // expectError on destroy: `cdk destroy` RELIABLY fails this stack — the + // AgentCore Runtime's service-managed `agentic_ai` ENIs are released + // asynchronously by AWS, so the subnet/SG/VPC deletes hit DependencyViolation + // ("has dependencies and cannot be deleted" / "has a dependent object") while + // the ENIs linger. Without expectError, integ-runner would mark the whole run + // FAILED on teardown alone — masking whether the ASSERTIONS passed. We tolerate + // the teardown failure (scoped to the dependency-violation message so unrelated + // teardown bugs still surface) and hand the stranded `int-*` stack to the + // out-of-band ephemeral sweeper (.github/workflows/integ-sweeper.yml), which + // reclaims it once AWS detaches the ENIs and alarms if it stays stuck. + // + // NOTE(review): the `DELETE_FAILED` alternative in expectedMessage is not specific + // to the ENI dependency violation, so any teardown error containing it is tolerated + // too — confirm that breadth is intended given the scoping described above. + cdkCommandOptions: { + destroy: { + args: { force: true }, + expectError: true, + expectedMessage: 'cannot be deleted|dependent object|DELETE_FAILED', + }, + }, +}); + +// --- Authentication (same pattern as Phase 0) --------------------------------- +// A throwaway user the assertions authenticate as. The pool disables self-signup, +// so create + confirm it administratively, then mint a token via USER_PASSWORD_AUTH.
+// The password is generated per-synth (no credential-shaped literal in source) and +// satisfies the Cognito default policy by construction. +// The `Aa1!` prefix supplies an upper-case letter, a lower-case letter, a digit and +// a symbol; the 18 random bytes encode to 24 base64url characters. +const username = 'integ-lifecycle@example.com'; +const password = `Aa1!${randomBytes(18).toString('base64url')}`; + +// Service name MUST be the AWS SDK v2 form 'CognitoIdentityServiceProvider' — the +// assertion provider maps only the v2 key to the real client package (see the +// long note in integ.task-api-smoke.ts). +const cognitoService = 'CognitoIdentityServiceProvider'; + +// MessageAction SUPPRESS: no invitation message is sent for the throwaway user. +const createUser = integ.assertions.awsApiCall(cognitoService, 'adminCreateUser', { + UserPoolId: userPoolId, + Username: username, + MessageAction: 'SUPPRESS', + TemporaryPassword: password, +}); + +// Re-sets the same password as permanent, so the user is not left on a temporary one. +const setPassword = integ.assertions.awsApiCall(cognitoService, 'adminSetUserPassword', { + UserPoolId: userPoolId, + Username: username, + Password: password, + Permanent: true, +}); + +const auth = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', { + AuthFlow: 'USER_PASSWORD_AUTH', + ClientId: appClientId, + AuthParameters: { USERNAME: username, PASSWORD: password }, +}); + +// Sent verbatim as the `Authorization` header on the task submits. +const idToken = auth.getAttString('AuthenticationResult.IdToken'); + +// Conservative polling windows. Agent runs are real LLM sessions over a freshly +// cold-started AgentCore runtime; the first invocation pays the cold-start tax. +const TERMINAL_POLL = { totalTimeout: Duration.minutes(12), interval: Duration.seconds(30) }; +// The interim AWAITING_APPROVAL state appears mid-run, before terminal — poll it +// on a shorter window so a stuck gate fails fast instead of burning the full +// terminal budget waiting for a state that will never arrive. +const GATE_POLL = { totalTimeout: Duration.minutes(8), interval: Duration.seconds(15) }; + +// --- Scenario 1: COMPLETED (repo-less default/agent-v1) ----------------------- +// The default workflow is read-only (Read/Glob/Grep/WebFetch), requires no repo, +// and delivers an artifact to S3.
A trivial, self-contained instruction completes +// in a single turn. No GitHub repo or token is involved. +// NOTE(review): `${apiUrl}tasks` adds no separator — presumably the ApiUrl output +// ends with '/'; confirm. +const submitComplete = integ.assertions.httpApiCall(`${apiUrl}tasks`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': idToken, + }, + body: JSON.stringify({ + workflow_ref: 'default/agent-v1', + // Keep this a plain, benign natural-language request. An earlier terse, + // imperative phrasing ("Reply with exactly the single word: done. Do not + // use any tools.") tripped the Bedrock content-policy guardrail at submit + // (400 VALIDATION_ERROR "Task description was blocked by content policy"). + task_description: 'Please write a one-sentence summary explaining what a pull request is in software development.', + max_turns: 2, + max_budget_usd: 0.5, + }), +}); + +// Poll the task row until it reaches COMPLETED. No getAttString is read off this +// call, so flattenResponse stays false and the nested objectLike expect works. +// The row key is the task_id returned in the submit response body. +const pollComplete = integ.assertions.awsApiCall('DynamoDB', 'getItem', { + TableName: taskTableName, + Key: { task_id: { S: submitComplete.getAttString('body.data.task_id') } }, +}); +pollComplete + .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.COMPLETED } } })) + .waitForAssertions(TERMINAL_POLL); + +// --- Scenario 2: FAILED (coding/new-task-v1, onboarded repo, clone fails) ------ +// The submit path runs the onboarding gate (RepoTable) BEFORE clone/preflight, +// so an un-onboarded repo is rejected at submit (422 REPO_NOT_ONBOARDED) and the +// task never reaches a terminal FAILED. To exercise the terminal-error path we +// must therefore ONBOARD the repo first, then make CLONE fail: the onboarding +// gate only checks RepoTable, not GitHub, so we onboard a repo slug that does +// not exist on GitHub. Submit then passes admission, preflight/clone 404s, and +// the orchestrator writes terminal FAILED + error_message — no agent turn, no +// runtime spin-up.
(onboardFailRepo is sequenced before this submit.) +// The random hex suffix gives the slug a fresh value on every synth. +const failRepo = `abca-integ-nonexistent/does-not-exist-${randomBytes(6).toString('hex')}`; +const onboardFailRepo = integ.assertions.awsApiCall('DynamoDB', 'putItem', { + TableName: repoTableName, + Item: { + repo: { S: failRepo }, + status: { S: 'active' }, + // Fixed placeholder timestamps — presumably only `status` matters to the gate; confirm. + onboarded_at: { S: '2026-01-01T00:00:00.000Z' }, + updated_at: { S: '2026-01-01T00:00:00.000Z' }, + }, +}); + +const submitFail = integ.assertions.httpApiCall(`${apiUrl}tasks`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': idToken, + }, + body: JSON.stringify({ + workflow_ref: 'coding/new-task-v1', + repo: failRepo, + task_description: 'This task targets a nonexistent repo and must fail at clone/preflight.', + max_turns: 1, + max_budget_usd: 0.5, + }), +}); + +// Poll the submitted task's row until its status is FAILED. +const pollFail = integ.assertions.awsApiCall('DynamoDB', 'getItem', { + TableName: taskTableName, + Key: { task_id: { S: submitFail.getAttString('body.data.task_id') } }, +}); +pollFail + .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.FAILED } } })) + .waitForAssertions(TERMINAL_POLL); + +// --- Execution order (scenarios 1 & 2) ---------------------------------------- +// Auth first, then SEED THE GITHUB TOKEN BEFORE ANY SUBMIT. This ordering is +// load-bearing: the orchestrator's resolveGitHubToken caches the secret value +// for 5 min keyed by ARN (context-hydration.ts). Any coding-workflow task that +// runs GitHub preflight reads + caches the token. Scenario 2 (coding/new-task-v1) +// runs preflight too — so if it ran BEFORE the seed, it would cache the stack's +// INITIAL EMPTY secret and every later gate task would reuse that empty token → +// preflight 401 GITHUB_UNREACHABLE → FAILED before ever reaching the gate +// (observed live). Seeding right after auth means the secret is populated before +// the first token read, so no empty value is ever cached.
This is exactly the +// documented operator flow (QUICK_START §4: populate the secret before submitting +// tasks) — no agent.ts change. The seed only happens when the gates are enabled +// (it is sourced from the pre-seeded PAT secret); scenario 2 targets a +// nonexistent repo and fails at clone regardless of token, so it is unaffected. +// +// Onboarding: scenario 2's repo and the sandbox both need a RepoTable row before +// submit (else 422 REPO_NOT_ONBOARDED), so both onboard steps precede their +// submits. Gate approve/deny run sequentially since each POST needs the +// request_id read from the parked task's approval row. +// +// `let`, not `const`: the gate block below reassigns `chain` when it adds its steps. +// Both submits are chained ahead of both polls, so scenario 2 is already submitted +// while scenario 1 is being polled. +let chain = createUser + .next(setPassword) + .next(auth) + .next(onboardFailRepo) + .next(submitComplete) + .next(submitFail) + .next(pollComplete) + .next(pollFail); + +// --- Scenarios 3 & 4 (Cedar gates) — only when a sandbox is configured -------- +// Every assertion call below is CONSTRUCTED only inside this block, so when the +// gates are disabled nothing is registered with the integ provider and the run +// reduces cleanly to scenarios 1 & 2 (no skipped/failing gate steps, no PAT seed +// into the stack secret, no clone of a personal repo). +if (gatesEnabled) { + // Narrow the env-sourced config to non-null for this block. + // (gatesEnabled is only true when both values are non-empty strings.) + const sandboxRepo = SANDBOX_REPO as string; + const patSecretId = PRESEEDED_PAT_SECRET as string; + + // Re-mint a FRESH token right before each approve/deny POST. The Cognito app + // client uses the default 60-min ID-token validity (task-api.ts sets no + // idTokenValidity), but the strictly-serial .next() chain reaches the gate POSTs + // only after ~32 min (approve) / ~48 min (deny) of polling budget PLUS real agent + // cold-start + runtime — the live run took ~54 min. Reusing the original token + // would risk a 401 (expired) → the decision never records → false timeout keyed + // to agent latency.
These re-auths run just before their POSTs in the chain, so + // each token is minted minutes (not ~50 min) before use. The user/password are + // permanent (adminSetUserPassword above), so re-auth needs no new setup. + const reAuthApprove = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', { + AuthFlow: 'USER_PASSWORD_AUTH', + ClientId: appClientId, + AuthParameters: { USERNAME: username, PASSWORD: password }, + }); + const approveToken = reAuthApprove.getAttString('AuthenticationResult.IdToken'); + + // Same parameters as reAuthApprove, registered as a separate call so the deny + // POST gets its own token. + const reAuthDeny = integ.assertions.awsApiCall(cognitoService, 'initiateAuth', { + AuthFlow: 'USER_PASSWORD_AUTH', + ClientId: appClientId, + AuthParameters: { USERNAME: username, PASSWORD: password }, + }); + const denyToken = reAuthDeny.getAttString('AuthenticationResult.IdToken'); + + // --- Token seeding (prerequisite for gate scenarios) ------------------------ + // Copy the pre-seeded PAT into the stack-created GitHubTokenSecret so the agent + // runtime can clone the sandbox and push a branch. This automates the documented + // operator step (QUICK_START.md §4). No getAttString is read off seedPut, and the + // SecretString token is consumed inline by seedPut, never asserted on. + // NOTE(review): the PAT travels from seedGet to seedPut as a resolved attribute of + // the assertion call — confirm the provider does not log or surface that value. + const seedGet = integ.assertions.awsApiCall('SecretsManager', 'getSecretValue', { + SecretId: patSecretId, + }); + + const seedPut = integ.assertions.awsApiCall('SecretsManager', 'putSecretValue', { + SecretId: githubTokenSecretArn, + SecretString: seedGet.getAttString('SecretString'), + }); + + // Onboard the sandbox so the gate submits pass the onboarding gate (otherwise + // 422 REPO_NOT_ONBOARDED at submit, before the agent ever runs). A minimal active + // row is enough — the agent reads the GitHub token from the platform-default + // GitHubTokenSecret we seeded above, so the blueprint needs no per-repo token.
+ const onboardSandbox = integ.assertions.awsApiCall('DynamoDB', 'putItem', { + TableName: repoTableName, + Item: { + repo: { S: sandboxRepo }, + status: { S: 'active' }, + onboarded_at: { S: '2026-01-01T00:00:00.000Z' }, + updated_at: { S: '2026-01-01T00:00:00.000Z' }, + }, + }); + + // --- Scenario 3: AWAITING_APPROVAL -> approve ------------------------------- + // coding/new-task-v1 against the sandbox. The task asks the agent to write a + // `config.env` file, which the Write tool routes through the write_env_files + // soft-deny rule (agent/policies/soft_deny.cedar) -> the task parks at + // AWAITING_APPROVAL with a PENDING approval row. We approve it, then assert the + // row flips to APPROVED. (Post-approval the agent may COMPLETE or FAIL — both + // terminal — so the deterministic assertion is the recorded decision, not a + // specific terminal status.) + const submitApprove = integ.assertions.httpApiCall(`${apiUrl}tasks`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + // Original idToken — only the later approve POST uses a re-minted token. + 'Authorization': idToken, + }, + body: JSON.stringify({ + workflow_ref: 'coding/new-task-v1', + repo: sandboxRepo, + task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.', + max_turns: 6, + max_budget_usd: 0.5, + }), + }); + // task_id from the submit response; it keys the task row, the approval query and + // the approve URL. + const approveTaskId = submitApprove.getAttString('body.data.task_id'); + + // Wait for the gate to open (interim AWAITING_APPROVAL). + const pollGateApprove = integ.assertions.awsApiCall('DynamoDB', 'getItem', { + TableName: taskTableName, + Key: { task_id: { S: approveTaskId } }, + }); + pollGateApprove + .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } })) + .waitForAssertions(GATE_POLL); + + // Read the PENDING approval row's request_id (SK). Querying by task_id (PK) is + // required because we do not know the agent-minted request_id.
The status=PENDING + // FilterExpression makes Items[0] deterministic: a task could trip the gate more + // than once (or carry already-decided rows), and an unfiltered query orders only + // by SK, so without the filter Items[0] could be the wrong/decided row and the + // POST would target the wrong request_id. getAttString here flips this call to a + // flattened response, so we do NOT .expect() on it. + const queryApprove = integ.assertions.awsApiCall('DynamoDB', 'query', { + TableName: taskApprovalsTableName, + KeyConditionExpression: 'task_id = :tid', + FilterExpression: '#st = :pending', + // `#st` aliases `status`, which is a DynamoDB reserved word. + ExpressionAttributeNames: { '#st': 'status' }, + ExpressionAttributeValues: { ':tid': { S: approveTaskId }, ':pending': { S: 'PENDING' } }, + }); + // NOTE(review): this query runs once (no waitForAssertions), so Items.0 presumes + // the PENDING row already exists when the task shows AWAITING_APPROVAL — confirm. + const approveRequestId = queryApprove.getAttString('Items.0.request_id.S'); + + const approve = integ.assertions.httpApiCall(`${apiUrl}tasks/${approveTaskId}/approve`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + // Fresh token (see reAuthApprove) — the original idToken may be expired by now. + 'Authorization': approveToken, + }, + body: JSON.stringify({ request_id: approveRequestId, decision: 'approve', scope: 'this_call' }), + }); + + // Assert the decision was recorded on the approval row. Now that request_id is + // known we read the exact row by its full key. + const pollApproveDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', { + TableName: taskApprovalsTableName, + Key: { task_id: { S: approveTaskId }, request_id: { S: approveRequestId } }, + }); + pollApproveDecision + .expect(ExpectedResult.objectLike({ Item: { status: { S: 'APPROVED' } } })) + .waitForAssertions(GATE_POLL); + + // --- Scenario 4: AWAITING_APPROVAL -> deny ---------------------------------- + // Identical trigger to scenario 3; we deny instead and assert the row flips to + // DENIED.
+ const submitDeny = integ.assertions.httpApiCall(`${apiUrl}tasks`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': idToken, + }, + body: JSON.stringify({ + workflow_ref: 'coding/new-task-v1', + repo: sandboxRepo, + task_description: 'Create a file named config.env at the repo root with the single line FOO=bar, then commit it.', + max_turns: 6, + max_budget_usd: 0.5, + }), + }); + const denyTaskId = submitDeny.getAttString('body.data.task_id'); + + // Wait for the deny task to park at AWAITING_APPROVAL. + const pollGateDeny = integ.assertions.awsApiCall('DynamoDB', 'getItem', { + TableName: taskTableName, + Key: { task_id: { S: denyTaskId } }, + }); + pollGateDeny + .expect(ExpectedResult.objectLike({ Item: { status: { S: TaskStatus.AWAITING_APPROVAL } } })) + .waitForAssertions(GATE_POLL); + + // Same PENDING-filtered lookup as the approve path, keyed by the deny task's id. + const queryDeny = integ.assertions.awsApiCall('DynamoDB', 'query', { + TableName: taskApprovalsTableName, + KeyConditionExpression: 'task_id = :tid', + FilterExpression: '#st = :pending', + ExpressionAttributeNames: { '#st': 'status' }, + ExpressionAttributeValues: { ':tid': { S: denyTaskId }, ':pending': { S: 'PENDING' } }, + }); + const denyRequestId = queryDeny.getAttString('Items.0.request_id.S'); + + // Unlike the approve body, the deny body carries a `reason` and no `scope`. + const deny = integ.assertions.httpApiCall(`${apiUrl}tasks/${denyTaskId}/deny`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + // Fresh token (see reAuthDeny) — the original idToken may be expired by now. + 'Authorization': denyToken, + }, + body: JSON.stringify({ request_id: denyRequestId, decision: 'deny', reason: 'integ: exercising the deny path' }), + }); + + // Read the exact approval row by its full key and wait for DENIED. + const pollDenyDecision = integ.assertions.awsApiCall('DynamoDB', 'getItem', { + TableName: taskApprovalsTableName, + Key: { task_id: { S: denyTaskId }, request_id: { S: denyRequestId } }, + }); + pollDenyDecision + .expect(ExpectedResult.objectLike({ Item: { status: { S: 'DENIED' } } })) + .waitForAssertions(GATE_POLL); + + // Splice the gate steps into the chain.
Seed the PAT right after auth, before + // ANY submit: scenario 2 also runs GitHub preflight and would otherwise cache the + // stack's empty secret. .next() only adds a dependency, so auth can fan out here. + auth.next(seedGet).next(seedPut).next(onboardFailRepo); + chain = chain + .next(onboardSandbox) + .next(submitApprove) + .next(submitDeny) + .next(pollGateApprove) + .next(queryApprove) + .next(reAuthApprove) + .next(approve) + .next(pollApproveDecision) + .next(pollGateDeny) + .next(queryDeny) + .next(reAuthDeny) + .next(deny) + .next(pollDenyDecision); +} diff --git a/docs/guides/ROADMAP.md b/docs/guides/ROADMAP.md index 3e8aa438..1c9b392d 100644 --- a/docs/guides/ROADMAP.md +++ b/docs/guides/ROADMAP.md @@ -225,7 +225,7 @@ Planned capabilities, grouped by theme. Items are independent and may ship in an | Capability | Description | |------------|-------------| -| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). Phase 1 (full lifecycle / real agent runs) and Phase 2 (channels) follow. See [ADR-013](../decisions/ADR-013-tiered-validation-pyramid.md). | +| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs).
**Phase 1 landed ([#317](https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/317)):** a second test (`integ.task-lifecycle.ts`) deploys the *full* `AgentStack` (orchestrator + AgentCore runtime + agent container) to the dedicated E2E account and drives a real agent through the four terminal paths from the Cedar HITL matrix — `COMPLETED`, `FAILED`, and `AWAITING_APPROVAL` → approve/deny — capping cost with low `max_turns` + `max_budget_usd`. Phase 2 (channels) follows. See [ADR-013](../decisions/ADR-013-tiered-validation-pyramid.md). | | **Admission backlog observability** | Metric and alarm when `SUBMITTED` task depth exceeds an operator threshold (capacity and admission health). | | **Admission queue with deferred pickup** | When admission is at capacity, persist tasks in a durable queue instead of failing them. Automatically re-attempt admission and continue processing in FIFO order (with optional priority lanes) as concurrency becomes available. Preserve cancel/idempotency semantics and expose queue position/ETA in task status. | | **Safe orchestrator deploys** | Pre-deploy checks for active tasks (drain or warn); blue-green or canary Lambda deploy for the durable orchestrator with rollback on error regressions (`OBSERVABILITY.md`). | diff --git a/docs/src/content/docs/roadmap/Roadmap.md b/docs/src/content/docs/roadmap/Roadmap.md index 1d54f545..e43b43e8 100644 --- a/docs/src/content/docs/roadmap/Roadmap.md +++ b/docs/src/content/docs/roadmap/Roadmap.md @@ -229,7 +229,7 @@ Planned capabilities, grouped by theme. Items are independent and may ship in an | Capability | Description | |------------|-------------| -| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). 
In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). Phase 1 (full lifecycle / real agent runs) and Phase 2 (channels) follow. See [ADR-013](/architecture/adr-013-tiered-validation-pyramid). | +| **Deployed runtime E2E verification** | **Phase 0 landed:** `@aws-cdk/integ-tests-alpha` + `integ-runner` deploy a trimmed Task API stack to a real account, assert the create-and-persist happy path (task persists at `SUBMITTED`), then tear it down (`mise //cdk:integ`). In CI it runs per-PR via `workflow_run` when the diff touches `cdk/**` or `agent/**`, behind the `integ` environment's admin-approval gate, and posts a required `integ-smoke` status that blocks merge (`workflow_dispatch` retained for manual runs). **Phase 1 landed ([#317](https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/317)):** a second test (`integ.task-lifecycle.ts`) deploys the *full* `AgentStack` (orchestrator + AgentCore runtime + agent container) to the dedicated E2E account and drives a real agent through the four terminal paths from the Cedar HITL matrix — `COMPLETED`, `FAILED`, and `AWAITING_APPROVAL` → approve/deny — capping cost with low `max_turns` + `max_budget_usd`. Phase 2 (channels) follows. See [ADR-013](/architecture/adr-013-tiered-validation-pyramid). | | **Admission backlog observability** | Metric and alarm when `SUBMITTED` task depth exceeds an operator threshold (capacity and admission health). | | **Admission queue with deferred pickup** | When admission is at capacity, persist tasks in a durable queue instead of failing them. Automatically re-attempt admission and continue processing in FIFO order (with optional priority lanes) as concurrency becomes available. Preserve cancel/idempotency semantics and expose queue position/ETA in task status. 
| | **Safe orchestrator deploys** | Pre-deploy checks for active tasks (drain or warn); blue-green or canary Lambda deploy for the durable orchestrator with rollback on error regressions (`OBSERVABILITY.md`). |