diff --git a/.github/actions/terragrunt/README.md b/.github/actions/terragrunt/README.md index ff59465e..fbe58fcc 100644 --- a/.github/actions/terragrunt/README.md +++ b/.github/actions/terragrunt/README.md @@ -10,7 +10,7 @@ This GitHub Action sets up **Terraform** and **Terragrunt** and runs a specified - Optionally passes Terragrunt variables via JSON tfvars - Supports `plan` mode for producing local saved plan files - Supports `init` mode for outputs-only reads -- Uses the repo-local `./.github/actions/just` action for saved plan artifact upload and download +- Relies on shared Terragrunt root hooks for per-stack saved plan artifact upload and download - Exports Terragrunt outputs as compact JSON when state exists The Terragrunt install step is kept in this repo-local action rather than hidden behind a third-party Terragrunt wrapper action so the repo can control the exact setup-action revision and react quickly to GitHub Actions runtime deprecations or nested dependency warnings. @@ -38,9 +38,9 @@ The Terragrunt install step is kept in this repo-local action rather than hidden - `apply` Runs `terragrunt apply -auto-approve` - `plan` - Runs `terragrunt plan -detailed-exitcode -out=/terragrunt.tfplan`, then renders `terragrunt.plan.txt` and writes `terragrunt.plan.meta.json` via the repo `justfile.tg` recipe `terragrunt-plan-render`. It then uploads those files to S3 through the repo-local `./.github/actions/just` action using the AWS credentials already configured in the job. + Runs `terragrunt plan -detailed-exitcode -out=terragrunt.tfplan`. The shared Terragrunt root `after_hook` then renders `terragrunt.plan.txt`, writes `terragrunt.plan.meta.json`, and uploads the per-stack plan bundle to the derived plan bucket when `TG_ENABLE_PLAN_ARTIFACTS=true` and `PLAN_ARTIFACT_RUN_ID` is set. 
- `apply_plan` - Downloads the saved plan files into `tg_directory` via the repo-local `./.github/actions/just` action and `justfile.tg`, using the caller-provided `PLAN_ARTIFACT_S3_PREFIX` environment variable plus the stack-derived suffix from `tg_directory`. It then fails if the binary plan file or `terragrunt.plan.meta.json` is missing, reads `has_changes` from the saved metadata file, and skips apply with a GitHub Actions warning when the saved plan contains no mutating resource changes. Otherwise it runs `terragrunt apply` against the absolute stack-path plan file. + Runs `terragrunt apply terragrunt.tfplan`. The shared Terragrunt root `before_hook` downloads the saved plan bundle into the Terragrunt working directory when `TG_ENABLE_PLAN_ARTIFACTS=true` and `PLAN_ARTIFACT_RUN_ID` is set, and fails early if the saved metadata reports mocked dependency outputs. - `destroy` Runs `terragrunt destroy -auto-approve` - `init` @@ -48,16 +48,17 @@ The Terragrunt install step is kept in this repo-local action rather than hidden ## Saved Plan Layout -- One run-level metadata file is stored separately by the shared infra wrapper at: - - `/infra-plan-metadata/plan-metadata.json` +- One run-level metadata file is stored separately by the shared infra wrapper as a GitHub Actions artifact: + - artifact name: `infra-plan-metadata` + - file: `plan-metadata.json` - Each Terragrunt stack or module stores its own plan bundle at: - - `/terragrunt-plan-/terragrunt.tfplan` - - `/terragrunt-plan-/terragrunt.plan.txt` - - `/terragrunt-plan-/terragrunt.plan.meta.json` + - `s3:///terragrunt_plan///terragrunt-plan-/terragrunt.tfplan` + - `s3:///terragrunt_plan///terragrunt-plan-/terragrunt.plan.txt` + - `s3:///terragrunt_plan///terragrunt-plan-/terragrunt.plan.meta.json` ## AWS Credentials -Configure AWS credentials in the workflow job before calling this action. The action then reuses those ambient credentials for Terragrunt itself and for any saved-plan upload or download steps. 
+Configure AWS credentials in the workflow job before calling this action. The action then reuses those ambient credentials for Terragrunt itself and for any Terragrunt-hook-driven saved-plan upload or download steps. ## Usage @@ -164,4 +165,4 @@ jobs: tg_action: apply_plan ``` -This action expects the workflow to download `terragrunt.tfplan`, `terragrunt.plan.txt`, and `terragrunt.plan.meta.json` into `tg_directory` before calling `tg_action: apply_plan`. +This action expects the workflow to set both `TG_ENABLE_PLAN_ARTIFACTS=true` and `PLAN_ARTIFACT_RUN_ID` when using cross-run saved plans so the shared Terragrunt root hooks can resolve the per-stack plan bundle location from the derived plan bucket and environment. diff --git a/.github/actions/terragrunt/action.yml b/.github/actions/terragrunt/action.yml index da37b6f8..274b858e 100644 --- a/.github/actions/terragrunt/action.yml +++ b/.github/actions/terragrunt/action.yml @@ -28,7 +28,7 @@ inputs: outputs: tg_outputs: description: "All Terraform outputs in JSON format" - value: ${{ steps.tg_outputs.outputs.terraform_json || steps.tg_outputs_skip.outputs.terraform_json }} + value: ${{ steps.tg_outputs.outputs.terraform_json }} runs: using: "composite" @@ -61,54 +61,17 @@ runs: run: | echo "$OVERRIDE_TG_VARS" | jq -c . > ${{ inputs.tg_directory }}/override_tg_vars.tfvars.json - - name: Download saved plan artifacts - if: inputs.tg_action == 'apply_plan' - uses: ./.github/actions/just - env: - TG_DIRECTORY: ${{ inputs.tg_directory }} - with: - aws_region: ${{ inputs.aws_region }} - justfile_path: justfile.tg - just_action: terragrunt-plan-download - - - name: Verify plan artifact files exist - if: inputs.tg_action == 'apply_plan' - shell: bash - run: | - test -f "${{ inputs.tg_directory }}/terragrunt.tfplan" || { - echo "Expected plan file '${{ inputs.tg_directory }}/terragrunt.tfplan' was not found before apply_plan." 
>&2 - exit 1 - } - test -f "${{ inputs.tg_directory }}/terragrunt.plan.meta.json" || { - echo "Expected plan metadata file '${{ inputs.tg_directory }}/terragrunt.plan.meta.json' was not found before apply_plan." >&2 - exit 1 - } - - - name: Check saved plan for mutating changes - if: inputs.tg_action == 'apply_plan' - id: apply_plan_guard - shell: bash - working-directory: ${{ inputs.tg_directory }} - run: | - PLAN_PATH="$(pwd)/terragrunt.tfplan" - PLAN_META_PATH="$(pwd)/terragrunt.plan.meta.json" - - if [ "$(jq -r '.has_changes' "$PLAN_META_PATH")" != "true" ]; then - echo "::warning title=Empty saved plan::Saved plan '$PLAN_PATH' contains no mutating resource changes. Skipping apply." - echo "should_apply=false" >> "$GITHUB_OUTPUT" - else - echo "should_apply=true" >> "$GITHUB_OUTPUT" - fi - - name: Action Terragrunt - if: inputs.tg_action != 'apply_plan' || steps.apply_plan_guard.outputs.should_apply == 'true' id: terragrunt_action shell: bash env: TF_IN_AUTOMATION: true + TG_PLAN_LOG_FILENAME: terragrunt.plan.log + TG_PLAN_LOG_ABS_PATH: ${{ github.workspace }}/${{ inputs.tg_directory }}/terragrunt.plan.log working-directory: ${{ inputs.tg_directory }} run: | - PLAN_PATH="$(pwd)/terragrunt.tfplan" + PLAN_PATH="terragrunt.tfplan" + PLAN_LOG_PATH="$(pwd)/terragrunt.plan.log" case "${{ inputs.tg_action }}" in apply) @@ -116,21 +79,61 @@ runs: ;; plan) set +e - terragrunt plan -input=false -lock=false -detailed-exitcode -compact-warnings -out="$PLAN_PATH" -var-file=override_tg_vars.tfvars.json - plan_exit_code=$? 
+ terragrunt plan -input=false -lock=false -detailed-exitcode -compact-warnings -out="$PLAN_PATH" -var-file=override_tg_vars.tfvars.json 2>&1 | tee "$PLAN_LOG_PATH" + plan_exit_code=${PIPESTATUS[0]} set -e if [ "$plan_exit_code" -eq 1 ]; then exit 1 fi + plan_contains_mocked_outputs=false + if grep -Fq "mock outputs provided and returning those in dependency output" "$PLAN_LOG_PATH"; then + plan_contains_mocked_outputs=true + echo "::warning title=Mock outputs used during plan::Terragrunt used dependency mock outputs while creating a saved plan. This plan artifact should not be used with apply_plan until a fresh plan is created from real upstream outputs." + fi + echo "plan_exit_code=$plan_exit_code" >> "$GITHUB_OUTPUT" - echo "Terragrunt binary plan path: $PLAN_PATH" - ls -l "$PLAN_PATH" + echo "plan_contains_mocked_outputs=$plan_contains_mocked_outputs" >> "$GITHUB_OUTPUT" + echo "Terragrunt binary plan path in cache: $PLAN_PATH" ;; apply_plan) - terragrunt apply -auto-approve -compact-warnings "$PLAN_PATH" + set +e + APPLY_LOG_PATH="$(pwd)/terragrunt.apply.log" + terragrunt apply -auto-approve "$PLAN_PATH" 2>&1 | tee "$APPLY_LOG_PATH" + apply_exit_code=${PIPESTATUS[0]} + set -e + + if [ "$apply_exit_code" -ne 0 ]; then + emit_error() { + local title="$1" + local pattern="$2" + + if grep -Fq "$pattern" "$APPLY_LOG_PATH"; then + local err_line + err_line="$(grep -F "$pattern" "$APPLY_LOG_PATH" | head -n 1)" + echo "::error title=${title}::${err_line}" + fi + } + + emit_error "Saved plan contains mocked outputs" "contains mocked outputs. Regenerate it after upstream real outputs exist." 
+ + if grep -Fq "Saved plan is stale" "$APPLY_LOG_PATH"; then + emit_error "Saved plan is stale" "Saved plan is stale" + fi + + emit_error "Saved plan artifact missing" "Saved plan artifact not found for" + emit_error "Missing PLAN_ARTIFACT_RUN_ID" "PLAN_ARTIFACT_RUN_ID is required when TG_ENABLE_PLAN_ARTIFACTS=true" + + if grep -Fq "Plan bucket" "$APPLY_LOG_PATH" && grep -Fq "does not exist" "$APPLY_LOG_PATH"; then + emit_error "Plan bucket missing" "Plan bucket" + fi + + emit_error "Plan bucket creation declined" "Plan bucket creation declined." + emit_error "Plan bucket confirmation unavailable" "no interactive terminal is available for confirmation" + exit "$apply_exit_code" + fi ;; destroy) terragrunt destroy -auto-approve -compact-warnings -var-file=override_tg_vars.tfvars.json @@ -145,28 +148,8 @@ runs: ;; esac - - name: Render plan sidecar artifacts - if: inputs.tg_action == 'plan' - uses: ./.github/actions/just - env: - TG_DIRECTORY: ${{ inputs.tg_directory }} - TG_PLAN_EXIT_CODE: ${{ steps.terragrunt_action.outputs.plan_exit_code }} - with: - justfile_path: justfile.tg - just_action: terragrunt-plan-render - - - name: Upload saved plan artifacts - if: inputs.tg_action == 'plan' - uses: ./.github/actions/just - env: - TG_DIRECTORY: ${{ inputs.tg_directory }} - with: - aws_region: ${{ inputs.aws_region }} - justfile_path: justfile.tg - just_action: terragrunt-plan-upload - - name: Capture Terraform Outputs - if: inputs.tg_action != 'destroy' && (inputs.tg_action != 'apply_plan' || steps.apply_plan_guard.outputs.should_apply == 'true') + if: inputs.tg_action != 'destroy' id: tg_outputs shell: bash working-directory: ${{ inputs.tg_directory }} @@ -180,10 +163,3 @@ runs: fi echo "terraform_json=$TERRAGRUNT_OUTPUTS" >> "$GITHUB_OUTPUT" echo "✅ Terraform outputs captured successfully." 
- - - name: Capture Terraform Outputs For Skipped Apply Plan - if: inputs.tg_action == 'apply_plan' && steps.apply_plan_guard.outputs.should_apply != 'true' - id: tg_outputs_skip - shell: bash - run: | - echo "terraform_json={}" >> "$GITHUB_OUTPUT" diff --git a/.github/docs/README.md b/.github/docs/README.md index 98deb3dc..952ffc81 100644 --- a/.github/docs/README.md +++ b/.github/docs/README.md @@ -82,13 +82,13 @@ flowchart LR ### Infra And Code Rollout - `shared_infra_plan.yml` - Plan wrapper around `shared_infra.yml`. It takes resolved workflow inputs directly, derives a single plan-artifact S3 prefix via `justfile.tg`, uploads one run-level metadata file under `/infra-plan-metadata/plan-metadata.json` via `justfile.ci`, and then calls `shared_infra.yml` with `tg_action: plan` plus that same resolved prefix. The bucket resolution follows the same artifact split as ECR and build outputs: `dev` uses the `dev` code bucket, while non-`dev` environments reuse the `ci` code bucket. After the plan completes, it prints the current workflow `github.run_id` into both the logs and the GitHub Actions step summary as `plan_artifact_run_id`, and exposes that value as a reusable-workflow output. + Plan wrapper around `shared_infra.yml`. It takes resolved workflow inputs directly, uploads one run-level `plan-metadata.json` file as a GitHub Actions artifact named `infra-plan-metadata`, and then calls `shared_infra.yml` with `tg_action: plan` plus `plan_run_id: ${{ github.run_id }}`. After the plan completes, it prints the current workflow `github.run_id` into both the logs and the GitHub Actions step summary as `plan_artifact_run_id`, and exposes that value as a reusable-workflow output. - `shared_infra_apply.yml` Direct-input apply wrapper around `shared_infra.yml`. It takes resolved workflow inputs directly and calls `shared_infra.yml` with `tg_action: apply`. - `shared_infra_apply_from_plan.yml` - Apply-from-plan wrapper around `shared_infra.yml`. 
It takes `plan_artifact_run_id`, resolves the same artifact bucket split used by release artifacts (`dev` stays on `dev`, non-`dev` uses `ci`) inside its `metadata` job, configures artifact-account AWS credentials once for that job, derives the matching plan-artifact S3 prefix via `justfile.tg`, downloads `infra-plan-metadata` from that location via `justfile.ci`, reads the frozen graph inputs back out, and then calls `shared_infra.yml` with `tg_action: apply_plan` plus that same resolved prefix. + Apply-from-plan wrapper around `shared_infra.yml`. It takes `plan_artifact_run_id`, downloads the `infra-plan-metadata` GitHub artifact from that earlier workflow run, reads the frozen graph inputs back out, and then calls `shared_infra.yml` with `tg_action: apply_plan` plus `plan_run_id: ${{ inputs.plan_artifact_run_id }}`. Per-stack plan bundle download still happens inside the shared Terragrunt root `before_hook`. - `shared_infra.yml` - Pure ordered infra graph executor. It applies shared stacks first, then runtime stacks, then frontend infrastructure. Shared stacks now include the CloudWatch observability dashboard. It accepts `tg_action` so the same graph can run a normal apply, upload derived per-stack plan artifacts to the resolved code bucket under `terragrunt_plan/`, or apply from previously uploaded plan artifacts. The wrapper workflows resolve one `plan_artifact_s3_prefix` and set it in the workflow env once, while each Terragrunt job configures AWS credentials at job start and then reuses that ambient session in the repo-local Terragrunt action. That means each infra run has one shared `plan-metadata.json` file for the whole graph and one separate saved plan bundle per Terragrunt stack or module. In `apply_plan` mode, each stack job first downloads its own saved plan files via `justfile.tg` through the Terragrunt action, then the Terragrunt action validates and applies those local files. 
Its visible step labels now follow the high-level operation, so both direct apply and apply-from-plan render as `Apply` while plan still renders as `Plan`. The `security -> network` edge is a real bootstrap dependency because `network` reads security outputs like `vpc_endpoint_sg` from remote state; if those outputs do not exist yet, `network` fails with an upstream `Unsupported attribute` error rather than a networking-specific error. + Pure ordered infra graph executor. It applies shared stacks first, then runtime stacks, then frontend infrastructure. Shared stacks now include the CloudWatch observability dashboard. It accepts `tg_action` so the same graph can run a normal apply, upload derived per-stack plan artifacts to the dedicated plan bucket under `terragrunt_plan/`, or apply from previously uploaded plan artifacts. The wrapper workflows now pass a single `plan_run_id`, exported to Terragrunt jobs as `PLAN_ARTIFACT_RUN_ID`, while each Terragrunt job configures AWS credentials at job start and then reuses that ambient session in the repo-local Terragrunt action. That means each infra run has one shared run-level metadata artifact (`infra-plan-metadata`) for the whole graph and one separate saved plan bundle per Terragrunt stack or module. Saved-plan transfer is opt-in: the shared workflow sets `TG_ENABLE_PLAN_ARTIFACTS=true` only for `plan` and `apply_plan`. In `plan` mode, the shared Terragrunt root `after_hook` renders and uploads each per-stack plan bundle. In `apply_plan` mode, the shared Terragrunt root `before_hook` downloads the saved plan bundle before `terragrunt apply` runs and fails if the saved metadata says mocked outputs were used. Its visible step labels now follow the high-level operation, so both direct apply and apply-from-plan render as `Apply` while plan still renders as `Plan`. 
Bootstrap-sensitive edges such as `security -> network` should be modeled with Terragrunt `dependency` blocks plus constrained `mock_outputs` in the live stack so `plan` and `validate` can run before upstream state exists, while `apply` still resolves real outputs. - The shared infra wrappers must forward the permissions required by the nested reusable call chain. In practice that means `id-token: write` everywhere the Terragrunt action may assume AWS OIDC and `contents: read` for checkout. The shared plan/apply wrappers now rely on AWS access to the shared code bucket rather than GitHub artifact permissions for cross-run recovery. - `shared_deploy.yml` Rolls out Lambda code, optional migrations, optional reconciliation Lambdas, ECS task and service updates, and optional frontend deploys. Its multi-step AWS jobs now configure credentials once at job start and let the local `just` and Terragrunt actions reuse that ambient session. The reusable workflow renders its Lambda and ECS CodeDeploy AppSpec files from the shared templates under `config/deploy/`, and its mutating `just` steps should target `justfile.deploy` rather than the repo-root `justfile`. @@ -142,18 +142,17 @@ Run these checks on every CI, workflow, or deploy-contract change. 
- compare every caller `with:` block against the callee `workflow_call.inputs` - compare expected outputs against actual `jobs..outputs.*` - verify optional inputs are intentionally omitted, not accidentally missing -- the repo-local `./.github/actions/terragrunt` action supports `tg_action: plan` for producing the binary plan locally; it renders `terragrunt.plan.txt` and writes `terragrunt.plan.meta.json` via `justfile.tg` (`terragrunt-plan-render`) -- `./.github/actions/terragrunt` always uploads per-stack plan artifacts on `plan` and always downloads them on `apply_plan`, using the caller-provided `PLAN_ARTIFACT_S3_PREFIX` environment variable, so graph executors like `shared_infra.yml` do not need separate `./.github/actions/just` steps for those transfers +- the repo-local `./.github/actions/terragrunt` action supports `tg_action: plan` for producing the binary plan locally; the shared Terragrunt root `after_hook` then renders `terragrunt.plan.txt` and writes `terragrunt.plan.meta.json` +- shared Terragrunt root hooks now upload per-stack plan artifacts on `plan` and download them on `apply_plan` only when `TG_ENABLE_PLAN_ARTIFACTS=true`, using the caller-provided `PLAN_ARTIFACT_RUN_ID` plus the root-derived `plan_bucket`, so graph executors like `shared_infra.yml` do not need separate `./.github/actions/just` steps for those transfers - both repo-local composite actions, `./.github/actions/just` and `./.github/actions/terragrunt`, now assume AWS credentials are already configured in the current job when they need AWS access. 
The repo pattern is to run `aws-actions/configure-aws-credentials` at the top of each AWS-using job and then call the local actions without extra auth inputs - `./.github/actions/just` installs the requested `just` version through `extractions/setup-crate@v2` in the same minimal composite-action shape as `extractions/setup-just`, rather than depending on `extractions/setup-just` itself - `./.github/actions/terragrunt` installs the requested Terragrunt version through `jdx/mise-action@v4`, while Terraform stays pinned separately through `hashicorp/setup-terraform` - saved infra-plan storage is intentionally split into two levels: - - one run-level metadata file at `/infra-plan-metadata/plan-metadata.json` - - one per-stack plan bundle under `/terragrunt-plan-/` -- plan artifact storage follows the same artifact environment split as ECR and build outputs: `dev` uses the `dev` code bucket, while non-`dev` environments read and write `terragrunt_plan/` in the shared `ci` code bucket -- `./.github/actions/terragrunt` skips `apply_plan` with a warning when the saved `terragrunt.plan.meta.json` reports `has_changes: false` + - one run-level metadata artifact named `infra-plan-metadata` containing `plan-metadata.json` + - one per-stack plan bundle under `s3:///terragrunt_plan///terragrunt-plan-/` +- the dedicated plan bucket is repo-wide, derived as `---tfplan`, and plan uniqueness comes from `terragrunt_plan///...` - `./.github/actions/terragrunt` derives its plan artifact name from `tg_directory`, so callers do not need to pass artifact naming inputs -- if `apply_plan` is used across separate workflow runs, pass the earlier workflow `run_id` through `plan_artifact_run_id`; the shared wrappers recover both metadata and per-stack plan files by deriving the matching `plan_artifact_s3_prefix` and reading from the shared code bucket under `terragrunt_plan///...` +- if `apply_plan` is used across separate workflow runs, pass the earlier workflow `run_id` through 
`plan_artifact_run_id`; the shared wrappers recover the run-level metadata from that run's `infra-plan-metadata` GitHub artifact, while the shared Terragrunt root hooks fetch the per-stack plan files from the dedicated plan bucket under `terragrunt_plan///...` - if a cross-run apply should not ask the operator to re-enter versions or recompute artifact resolution, store both the input versions and the resolved reusable-workflow outputs in a metadata artifact during plan and recover them in the apply wrapper from the earlier `run_id` - keep `shared_infra.yml` as the pure graph executor and prefer handling metadata creation/recovery in the dedicated plan/apply wrappers - when using `./.github/actions/just`, check whether the caller needs the repo-root `justfile` or an explicit `justfile_path` @@ -161,7 +160,6 @@ Run these checks on every CI, workflow, or deploy-contract change. - keep the split `just` ownership clear: - repo-root `justfile` for local/developer commands - `justfile.ci` for read-only CI helpers - - `justfile.tg` for Terragrunt plan artifact helpers (render/upload/download) - `justfile.deploy` for mutating CI build and deploy steps - `justfile.destroy` for explicit teardown and post-destroy cleanup steps @@ -188,7 +186,7 @@ Run these checks on every CI, workflow, or deploy-contract change. 
### Dependency Safety - check apply, deploy, and destroy behavior, not just apply -- verify downstream consumers of remote state still exist and are ordered correctly +- verify Terragrunt dependencies and their downstream consumers still exist and are ordered correctly - confirm every `needs..outputs.*` reference is in scope - confirm matrix values still match the naming contract expected by workflows and modules - do not change CI ordering blindly; first check whether the real issue is avoidable cross-stack coupling diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index 8fde8de8..0ef17300 100644 --- a/.github/workflows/destroy.yml +++ b/.github/workflows/destroy.yml @@ -175,8 +175,8 @@ jobs: tg_directory: infra/live/${{ inputs.environment }}/aws/${{ matrix.value }} tg_action: destroy - worker-messaging: - name: Worker Messaging + messaging: + name: Messaging runs-on: ubuntu-latest needs: - lambdas @@ -190,10 +190,10 @@ jobs: role-to-assume: ${{ env.AWS_OIDC_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - - name: Destroy worker messaging infra + - name: Destroy messaging infra uses: ./.github/actions/terragrunt with: - tg_directory: infra/live/${{ inputs.environment }}/aws/worker_messaging + tg_directory: infra/live/${{ inputs.environment }}/aws/messaging tg_action: destroy database: diff --git a/.github/workflows/shared_infra.yml b/.github/workflows/shared_infra.yml index c0fd3f85..aead7d34 100644 --- a/.github/workflows/shared_infra.yml +++ b/.github/workflows/shared_infra.yml @@ -33,8 +33,8 @@ on: required: false type: string default: "apply" - plan_artifact_s3_prefix: - description: "Optional resolved S3 prefix used for saved plan artifacts" + plan_run_id: + description: "Optional unique run id used to derive saved plan artifact paths" required: false type: string default: "" @@ -53,7 +53,8 @@ env: AWS_OIDC_ROLE_ARN: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/${{ vars.PROJECT_NAME }}-${{ inputs.environment }}-github-oidc-role 
AWS_REGION: ${{ vars.AWS_REGION }} DOMAIN_NAME: ${{ vars.DOMAIN_NAME }} - PLAN_ARTIFACT_S3_PREFIX: ${{ inputs.plan_artifact_s3_prefix }} + TG_ENABLE_PLAN_ARTIFACTS: ${{ (inputs.tg_action == 'plan' || inputs.tg_action == 'apply_plan') && 'true' || 'false' }} + PLAN_ARTIFACT_RUN_ID: ${{ inputs.plan_run_id }} TG_ACTION_LABEL: ${{ (inputs.tg_action == 'apply' || inputs.tg_action == 'apply_plan') && 'Apply' || inputs.tg_action == 'plan' && 'Plan' || inputs.tg_action == 'destroy' && 'Destroy' || inputs.tg_action == 'init' && 'Init' || 'Run' }} jobs: @@ -71,11 +72,13 @@ jobs: - name: ${{ env.TG_ACTION_LABEL }} oidc role infra uses: ./.github/actions/terragrunt + env: + TG_RESET_PLAN_ARTIFACT_BUCKET: "true" # this ensures that the plan artifact bucket is reset on every run, preventing stale artifacts from being used with: tg_directory: infra/live/${{ inputs.environment }}/aws/oidc tg_action: ${{ inputs.tg_action }} - worker_messaging: + messaging: needs: oidc runs-on: ubuntu-latest steps: @@ -88,10 +91,10 @@ jobs: role-to-assume: ${{ env.AWS_OIDC_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - - name: ${{ env.TG_ACTION_LABEL }} worker messaging infra + - name: ${{ env.TG_ACTION_LABEL }} messaging infra uses: ./.github/actions/terragrunt with: - tg_directory: infra/live/${{ inputs.environment }}/aws/worker_messaging + tg_directory: infra/live/${{ inputs.environment }}/aws/messaging tg_action: ${{ inputs.tg_action }} observability: @@ -149,52 +152,10 @@ jobs: role-to-assume: ${{ env.AWS_OIDC_ROLE_ARN }} aws-region: ${{ env.AWS_REGION }} - - name: Get network infra - if: inputs.tg_action != 'apply_plan' - uses: ./.github/actions/terragrunt - id: get-network - with: - tg_directory: infra/live/${{ inputs.environment }}/aws/network - tg_action: init - - - name: Get cognito infra - if: inputs.tg_action != 'apply_plan' - uses: ./.github/actions/terragrunt - id: get-cognito - env: - TF_VAR_domain_name: ${{ env.DOMAIN_NAME }} - with: - tg_directory: infra/live/${{ inputs.environment 
}}/aws/cognito - tg_action: init - - - name: Get api invoke url - if: inputs.tg_action != 'apply_plan' - id: get_api_vars - env: - TG_OUTPUTS: ${{ steps.get-network.outputs.tg_outputs }} - run: | - echo "invoke_url=$(echo $TG_OUTPUTS | jq -r '.api_invoke_url.value')" >> $GITHUB_OUTPUT - - - name: Get cognito values - if: inputs.tg_action != 'apply_plan' - id: get_cognito_vars - env: - TG_OUTPUTS: ${{ steps.get-cognito.outputs.tg_outputs }} - run: | - echo "user_pool_id=$(echo "$TG_OUTPUTS" | jq -r '.user_pool_id.value')" >> "$GITHUB_OUTPUT" - echo "user_pool_client_id=$(echo "$TG_OUTPUTS" | jq -r '.user_pool_client_id.value')" >> "$GITHUB_OUTPUT" - echo "hosted_ui_url=$(echo "$TG_OUTPUTS" | jq -r '.hosted_ui_url.value')" >> "$GITHUB_OUTPUT" - echo "readonly_group_name=$(echo "$TG_OUTPUTS" | jq -r '.readonly_group_name.value')" >> "$GITHUB_OUTPUT" - - name: ${{ env.TG_ACTION_LABEL }} frontend infra uses: ./.github/actions/terragrunt env: - TF_VAR_api_invoke_url: ${{ inputs.tg_action == 'apply_plan' && '' || steps.get_api_vars.outputs.invoke_url }} TF_VAR_domain_name: ${{ env.DOMAIN_NAME }} - TF_VAR_auth_user_pool_id: ${{ inputs.tg_action == 'apply_plan' && '' || steps.get_cognito_vars.outputs.user_pool_id }} - TF_VAR_auth_user_pool_client_id: ${{ inputs.tg_action == 'apply_plan' && '' || steps.get_cognito_vars.outputs.user_pool_client_id }} - TF_VAR_auth_hosted_ui_url: ${{ inputs.tg_action == 'apply_plan' && '' || steps.get_cognito_vars.outputs.hosted_ui_url }} - TF_VAR_auth_readonly_group_name: ${{ inputs.tg_action == 'apply_plan' && '' || steps.get_cognito_vars.outputs.readonly_group_name }} with: tg_directory: infra/live/${{ inputs.environment }}/aws/frontend tg_action: ${{ inputs.tg_action }} @@ -221,8 +182,6 @@ jobs: security: needs: oidc runs-on: ubuntu-latest - outputs: - postgres_sg: ${{ steps.get_security_outputs.outputs.postgres_sg }} steps: - uses: actions/checkout@v6 with: @@ -240,14 +199,6 @@ jobs: tg_directory: infra/live/${{ inputs.environment 
}}/aws/security tg_action: ${{ inputs.tg_action }} - - name: Get security outputs - id: get_security_outputs - if: inputs.tg_action != 'destroy' - env: - TG_OUTPUTS: ${{ steps.deploy-security.outputs.tg_outputs }} - run: | - echo "postgres_sg=$(echo "$TG_OUTPUTS" | jq -r '.postgres_sg.value')" >> "$GITHUB_OUTPUT" - database: needs: - oidc @@ -265,8 +216,6 @@ jobs: - name: ${{ env.TG_ACTION_LABEL }} database infra uses: ./.github/actions/terragrunt - env: - TF_VAR_database_security_group_id: ${{ needs.security.outputs.postgres_sg }} with: tg_directory: infra/live/${{ inputs.environment }}/aws/database tg_action: ${{ inputs.tg_action }} @@ -298,7 +247,7 @@ jobs: - security - network - database - - worker_messaging + - messaging runs-on: ubuntu-latest strategy: fail-fast: false # this is to prevent terraform lock issues @@ -327,7 +276,7 @@ jobs: - cluster - network - database - - worker_messaging + - messaging runs-on: ubuntu-latest strategy: fail-fast: false diff --git a/.github/workflows/shared_infra_apply_from_plan.yml b/.github/workflows/shared_infra_apply_from_plan.yml index eca062ca..8b1a8bdb 100644 --- a/.github/workflows/shared_infra_apply_from_plan.yml +++ b/.github/workflows/shared_infra_apply_from_plan.yml @@ -20,8 +20,6 @@ permissions: env: AWS_OIDC_ROLE_ARN: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/${{ vars.PROJECT_NAME }}-${{ inputs.environment }}-github-oidc-role AWS_REGION: ${{ vars.AWS_REGION }} - ARTIFACT_ENVIRONMENT: ${{ inputs.environment == 'dev' && 'dev' || 'ci' }} - ARTIFACT_AWS_OIDC_ROLE_ARN: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/${{ vars.PROJECT_NAME }}-${{ inputs.environment == 'dev' && 'dev' || 'ci' }}-github-oidc-role jobs: metadata: @@ -31,54 +29,23 @@ jobs: code_bucket: ${{ steps.read_metadata.outputs.code_bucket }} lambda_matrix: ${{ steps.read_metadata.outputs.lambda_matrix }} bootstrap_image_uri: ${{ steps.read_metadata.outputs.bootstrap_image_uri }} - plan_artifact_s3_prefix: ${{ 
steps.plan_artifact_s3_prefix.outputs.just_outputs }} service_matrix: ${{ steps.read_metadata.outputs.service_matrix }} steps: - uses: actions/checkout@v6 - - uses: aws-actions/configure-aws-credentials@v6 + - name: Download plan metadata artifact + uses: actions/download-artifact@v7 with: - role-to-assume: ${{ env.ARTIFACT_AWS_OIDC_ROLE_ARN }} - aws-region: ${{ env.AWS_REGION }} - - - name: Get shared code bucket outputs - uses: ./.github/actions/terragrunt - id: code_action - with: - tg_directory: infra/live/${{ env.ARTIFACT_ENVIRONMENT }}/aws/code_bucket - tg_action: init - - - name: Get bucket name - id: get_bucket_name - env: - TG_OUTPUTS: ${{ steps.code_action.outputs.tg_outputs }} - run: | - echo "bucket=$(echo "$TG_OUTPUTS" | jq -r '.bucket.value // empty')" >> "$GITHUB_OUTPUT" - - - name: Get plan artifact S3 prefix - id: plan_artifact_s3_prefix - uses: ./.github/actions/just - env: - BUCKET_NAME: ${{ steps.get_bucket_name.outputs.bucket }} - ENVIRONMENT: ${{ inputs.environment }} - RUN_ID: ${{ inputs.plan_artifact_run_id }} - with: - justfile_path: justfile.tg - just_action: terragrunt-plan-base-s3-prefix - - - name: Download plan metadata from S3 - uses: ./.github/actions/just - env: - PLAN_ARTIFACT_S3_PREFIX: ${{ steps.plan_artifact_s3_prefix.outputs.just_outputs }} - with: - justfile_path: justfile.ci - just_action: infra-plan-metadata-download + name: infra-plan-metadata + github-token: ${{ github.token }} + run-id: ${{ inputs.plan_artifact_run_id }} + path: . - name: Check plan metadata artifact shell: bash run: | if [ ! -f plan-metadata.json ]; then - echo "::error title=Missing plan metadata artifact::Expected 'plan-metadata.json' at ${{ steps.plan_artifact_s3_prefix.outputs.just_outputs }}/infra-plan-metadata/plan-metadata.json, but it was not downloaded." 
+ echo "::error title=Missing plan metadata artifact::Expected artifact 'infra-plan-metadata' containing 'plan-metadata.json' from workflow run '${{ inputs.plan_artifact_run_id }}', but it was not downloaded." exit 1 fi @@ -104,4 +71,4 @@ jobs: bootstrap_image_uri: ${{ needs.metadata.outputs.bootstrap_image_uri }} service_matrix: ${{ needs.metadata.outputs.service_matrix }} tg_action: apply_plan - plan_artifact_s3_prefix: ${{ needs.metadata.outputs.plan_artifact_s3_prefix }} + plan_run_id: ${{ inputs.plan_artifact_run_id }} diff --git a/.github/workflows/shared_infra_plan.yml b/.github/workflows/shared_infra_plan.yml index 6ad67abf..1a31a7d0 100644 --- a/.github/workflows/shared_infra_plan.yml +++ b/.github/workflows/shared_infra_plan.yml @@ -40,34 +40,15 @@ permissions: env: AWS_OIDC_ROLE_ARN: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/${{ vars.PROJECT_NAME }}-${{ inputs.environment }}-github-oidc-role - AWS_REGION: ${{ vars.AWS_REGION }} jobs: metadata: runs-on: ubuntu-latest - outputs: - plan_artifact_s3_prefix: ${{ steps.plan_artifact_s3_prefix.outputs.just_outputs }} steps: - uses: actions/checkout@v6 with: ref: ${{ inputs.infra_version }} - - uses: aws-actions/configure-aws-credentials@v6 - with: - role-to-assume: ${{ env.AWS_OIDC_ROLE_ARN }} - aws-region: ${{ env.AWS_REGION }} - - - name: Get plan artifact S3 prefix - id: plan_artifact_s3_prefix - uses: ./.github/actions/just - env: - BUCKET_NAME: ${{ inputs.code_bucket }} - ENVIRONMENT: ${{ inputs.environment }} - RUN_ID: ${{ github.run_id }} - with: - justfile_path: justfile.tg - just_action: terragrunt-plan-base-s3-prefix - - name: Write plan metadata from workflow inputs shell: bash run: | @@ -81,13 +62,12 @@ jobs: } EOF - - name: Upload plan metadata to S3 - uses: ./.github/actions/just - env: - PLAN_ARTIFACT_S3_PREFIX: ${{ steps.plan_artifact_s3_prefix.outputs.just_outputs }} + - name: Upload plan metadata artifact + uses: actions/upload-artifact@v7 with: - justfile_path: justfile.ci - just_action: 
infra-plan-metadata-upload + name: infra-plan-metadata + path: plan-metadata.json + retention-days: 14 infra: needs: @@ -101,7 +81,7 @@ jobs: bootstrap_image_uri: ${{ inputs.bootstrap_image_uri }} service_matrix: ${{ inputs.service_matrix }} tg_action: plan - plan_artifact_s3_prefix: ${{ needs.metadata.outputs.plan_artifact_s3_prefix }} + plan_run_id: ${{ github.run_id }} plan_context: name: Plan Context diff --git a/README.md b/README.md index b36e94e5..948b90d8 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,10 @@ Lambda + ECS with CodeDeploy rollouts, plus provisioned concurrency controls for - shared deployment patterns for Lambda and ECS, with repo-local `just` commands for local and CI operations - runtime and infrastructure layouts designed to be extended without having to rediscover the whole repo each time +## Bootstrap-Friendly Plans + +For cross-stack contracts that often block CI plans before upstream stacks exist, this repo prefers Terragrunt `dependency` wiring in the live stack plus `mock_outputs` for non-mutating commands such as `plan` and `validate`. The Terraform modules should consume explicit inputs rather than reaching back into sibling stack state directly when the contract needs bootstrap-friendly plan behavior. + Use [CONTRIBUTING.md](CONTRIBUTING.md) for expectations when changing the repo itself. ## For AI Agents @@ -160,7 +164,7 @@ just frontend That Vite server is also started automatically by `just start`. It proxies `/api/*` to the local Lambda API and `/api/ecs/*` to the local ECS API with the same prefix stripping the deployed CloudFront distribution performs. It also serves `auth-config.json` with no-cache headers locally so frontend auth config changes are picked up immediately. When `frontend/public/auth-config.json` has `"enabled": false`, the frontend runs in a local unauthenticated mode instead of redirecting to Cognito. 
-The local ElasticMQ config now mirrors the shared AWS worker-messaging contract by exposing: +The local ElasticMQ config now mirrors the shared AWS messaging contract by exposing: - `lambda-worker-queue` for the Lambda worker consumer - `ecs-worker-queue` for the ECS worker consumer diff --git a/REPO_INSTRUCTIONS.md b/REPO_INSTRUCTIONS.md index 26f58f6c..aa002a83 100644 --- a/REPO_INSTRUCTIONS.md +++ b/REPO_INSTRUCTIONS.md @@ -98,17 +98,26 @@ These instructions apply to the entire repository. - verify runtime type (Lambda/ECS), deploy mode, and (for ECS) connection type and load-balancer shape - verify required infra resources exist (CodeDeploy app/deployment group, listeners/target groups, alarms, VPC link if applicable) - when changing reusable workflow contracts, compare every caller `with:` block to the callee `workflow_call.inputs` +- when adding or renaming Terraform module `output` values that are intended for Terragrunt `dependency..outputs` passthrough, verify every downstream consumer wrapper declares a `variable` with the exact same name +- if that same-name output-to-variable contract does not hold yet, do not leave it implicit: either add the matching variables, or call out the mismatch explicitly before closing the task - check apply/deploy/destroy, and avoid unnecessary `terraform_remote_state` coupling (especially for fast-changing outputs) +- for bootstrap-sensitive or plan-sensitive cross-stack contracts, prefer Terragrunt `dependency` inputs in the live stack and `mock_outputs` for non-mutating commands rather than reading upstream state directly inside Terraform modules +- if CI plan failures are caused by missing upstream state, fix the contract shape first instead of papering over the issue with more direct `terraform_remote_state` reads +- when the same Terragrunt dependency wiring or mocks are needed across environments, centralize that shared config under `infra/live/dependencies/` in a capability-scoped helper such as `network.hcl` and
have environment stacks read it rather than duplicating the same blocks in `dev`, `prod`, or `ci` +- keep this approach visible to users as well: when you introduce or expand this pattern, update the top-level `README.md` so the bootstrap-friendly mock strategy is documented outside agent-only instructions +- if you intentionally add a Terraform `data "terraform_remote_state"` block, add a `# remote_state_reason: ...` comment immediately above it explaining why Terragrunt `dependency` plus `mock_outputs` is not practical for that case ## Terragrunt Plan Expectation - when a change touches `*.hcl`, Terraform modules, live Terragrunt stacks, or downstream dependencies that can affect Terraform evaluation or plan output, run the relevant `just tg plan` command before closing the task when feasible - choose the smallest relevant plan surface rather than defaulting to `run-all`; for example, plan only the affected `dev`, `ci`, or `prod` stack(s) - when shared modules or remote-state contracts change, consider the downstream consumer stacks too and run plans for the affected dependents, not just the module wrapper you edited +- treat saved plans as apply-intent artifacts, not as general previews: only keep a `plan` you expect to apply, because Terraform reuses the exact planned variable values during `apply_plan` +- be especially careful on first deploys or bootstrap-sensitive stacks that use Terragrunt `mock_outputs` for planability; if a saved plan captured mock values, discard it and create a fresh plan after the upstream real outputs exist - if a plan is not feasible because credentials, network, permissions, or state access are unavailable, say that explicitly in the final response and name the plan command that should be run manually ##
High-Signal Edit Warnings -- before editing `justfile.destroy` or `justfile.tg`, print an explicit terminal warning in commentary (destroy/terragrunt command ownership boundary) +- before editing `justfile.destroy`, print an explicit terminal warning in commentary (destroy command ownership boundary) - before editing `.github/workflows/shared_*.yml`, print an explicit terminal warning in commentary (shared CI workflow blast radius) - before editing `infra/modules/aws/_shared/**`, print an explicit terminal warning in commentary (shared-contract blast radius) diff --git a/infra/README.md b/infra/README.md index a7f407e9..5c9264c0 100644 --- a/infra/README.md +++ b/infra/README.md @@ -28,13 +28,16 @@ The root Terragrunt file derives state paths from the live stack path: - bucket: `---tfstate` - key: `///terraform.tfstate` -Shared artifact names also follow environment-aware conventions from `infra/root.hcl`: +Shared artifact names also follow naming conventions from `infra/root.hcl`: - shared artifact base: `dev -> ...-dev`, otherwise `...-ci` +- dedicated saved-plan bucket: `---tfplan` - code bucket: `-code` - ECS ECR repository: `-ecr` -- saved Terragrunt plan artifacts: `s3:///terragrunt_plan///...` -- code-bucket lifecycle inputs: `code_artifact_expiration_days` for deployable code artifacts and `infra_plan_artifact_expiration_days` for `terragrunt_plan/` +- saved Terragrunt plan artifacts: `s3:///terragrunt_plan///...` +- plan-bucket retention: `infra_plan_artifact_expiration_days` applies an S3 lifecycle rule to `terragrunt_plan/` in the dedicated saved-plan bucket +- during `terragrunt init` and saved-plan `plan`, the root hook ensures the dedicated saved-plan bucket exists; interactive runs prompt before creation and non-interactive runs fail if no prompt is possible +- to reapply the configured `infra_plan_artifact_expiration_days` lifecycle rule locally for an existing bucket, rerun with `TG_RESET_PLAN_ARTIFACT_BUCKET=true` So a stack at: @@ -77,7 +80,7 @@ 
stores state at: Owns the VPC-attached Lambda used to run schema migrations against the shared Aurora PostgreSQL stack. - `rds_reader_tagger` Owns the EventBridge rule and Lambda that sync cluster tags onto new Aurora reader instances created later by scale-out. -- `worker_messaging` +- `messaging` Owns the shared worker SNS topic plus the Lambda-worker and ECS-worker SQS queues used for fanout. - `task_*` Register ECS task definitions. @@ -92,10 +95,10 @@ Current examples include: Shared CloudWatch dashboard shape for recent Lambda logs, ECS app logs, and ECS OTEL logs. - `rds_reader_tagger` Event-driven Aurora reader tag-sync shape: catch the RDS instance-created event, derive the parent cluster, and copy the cluster's non-AWS tags onto the new reader. -- `worker_messaging` +- `messaging` Shared worker fanout shape: one SNS topic publishes to two independent worker queues so Lambda and ECS consumers each receive the same event. - `task_worker` / `service_worker` - Internal ECS worker service shape, with the ECS worker queue owned by `worker_messaging` and a container health check based on a local worker heartbeat file. + Internal ECS worker service shape, with the ECS worker queue owned by `messaging` and a container health check based on a local worker heartbeat file. - `task_api` / `service_api` ECS API service shape exposed on the shared API Gateway at `/ecs` using `vpc_link` and `blue_green`, backed by a dedicated listener on the shared ALB. Through the frontend distribution it is reached at `/api/ecs/*`, while the Lambda API is reached at `/api/*`. 
@@ -104,16 +107,48 @@ That `containers/lib` directory is helper code only and is not treated as a depl ## Dependency Notes -- many modules use `data.terraform_remote_state` to read outputs from other stacks -- prefer using `data.terraform_remote_state` only for outputs that are expected to stay stable or change rarely; avoid using it as the normal handoff for values that change as part of the same rollout, because downstream plans can then drift from the upstream state they were planned against -- because of that, workflow ordering matters for apply, deploy, and destroy -- `service_api` consumes the shared JWT authorizer output from `network`, so `cognito` and `network` must exist before that ECS API service stack applies, and the service must destroy before `network` is torn down -- on destroy, `network` can tear down once downstream consumers such as `frontend`, `service_*`, `task_*`, and `database` are gone -- on destroy, `cluster` can tear down in parallel with `network` once `service_*`, `task_*`, and other real cluster consumers are gone; `frontend` is not a cluster dependency -- on destroy, `security` must wait for VPC-attached lambdas such as `migrations` as well as `network`, otherwise the shared runtime security group can still be attached during Lambda ENI cleanup -- avoid making one runtime depend on another runtime's state ownership unnecessarily; for example, shared worker fanout state is owned by `worker_messaging` rather than by `lambda_worker` or `task_worker` -- some shared infrastructure, such as the landing-zone VPC and tagged private subnets, is discovered with `data` lookups and must already exist -- frontend custom-domain deploys also require the matching Route53 hosted zone to already exist +- modules use Terragrunt `dependency` blocks to consume outputs from other stacks instead of `data.terraform_remote_state` +- this allows Terragrunt to understand the dependency graph explicitly and manage ordering for apply and destroy operations + +### 
Dependency Strategy + +- prefer `dependency` blocks for all cross-stack communication +- use `mock_outputs` for dependencies during `plan`, `validate`, and other non-apply commands to allow independent iteration without requiring upstream stacks to be deployed +- restrict mocks using `mock_outputs_allowed_terraform_commands` to ensure real outputs are always used during `apply` +- treat saved `plan` artifacts as apply-intent only: Terraform will reuse the exact variable values captured in the plan file during `apply_plan` +- for first deploys or other bootstrap-sensitive stacks, do not reuse a saved plan that captured `mock_outputs`; re-plan after the upstream real outputs exist before running `apply_plan` + +### When to Use Remote State + +- avoid using `data.terraform_remote_state` as the default mechanism for passing values between stacks +- it may still be used for: + - infrastructure that is managed outside of Terragrunt + - globally stable/shared resources that rarely change + - cross-account or external dependencies where Terragrunt `dependency` is not practical + +### Workflow and Ordering + +- Terragrunt dependencies define ordering implicitly, but logical constraints still apply: + + - `service_api` consumes the shared JWT authorizer output from `network`, so `cognito` and `network` must exist before the ECS API service stack applies + - the API service must be destroyed before `network` is torn down + +- on destroy: + + - `network` can tear down once downstream consumers such as `frontend`, `service_*`, `task_*`, and `database` are gone + - `cluster` can tear down in parallel with `network` once `service_*`, `task_*`, and other cluster consumers are gone; `frontend` is not a cluster dependency + - `security` must wait for VPC-attached lambdas such as `migrations` as well as `network`, otherwise the shared runtime security group may still be attached during Lambda ENI cleanup + +### Design Guidelines + +- avoid making one runtime depend on another 
runtime's state ownership unnecessarily + - for example, shared worker fanout state is owned by `messaging` rather than by `lambda_worker` or `task_worker` + +- prefer explicit ownership boundaries between stacks + +- some shared infrastructure, such as the landing-zone VPC and tagged private subnets, is discovered via `data` lookups and must already exist + +- frontend custom-domain deploys require the matching Route53 hosted zone to already exist ## Deployment Model @@ -121,7 +156,7 @@ That `containers/lib` directory is helper code only and is not treated as a depl - build workflows produce Lambda zips and container images - `*_infra` wrappers need the inputs required to apply infra safely, such as directory-derived stack matrices and any artifact-derived bootstrap references - in `prod`, the `*_infra` wrappers read shared artifact resources from `ci` but only apply service and task stacks in `prod` -- saved `plan` / `apply_plan` artifacts live in the shared code bucket under `terragrunt_plan///...`; `dev` uses the `dev` code bucket, while non-`dev` environments reuse the shared `ci` code bucket +- saved `plan` / `apply_plan` artifacts live in the dedicated plan bucket under `terragrunt_plan///...` - deploy workflows: - publish Lambda versions and use Lambda CodeDeploy - optionally invoke the `migrations` Lambda when it is part of the Lambda deploy matrix @@ -163,6 +198,26 @@ just --justfile justfile.deploy lambda-get-version just --justfile justfile.deploy frontend-build ``` +For a local saved-plan run that can upload plan artifacts through the normal repo wrapper, enable artifact mode, provide a unique run id, and pass the Terragrunt operation as one quoted argument: + +```sh +TG_ENABLE_PLAN_ARTIFACTS=true \ +PLAN_ARTIFACT_RUN_ID="local-example-run" \ +just tg dev aws/oidc 'plan -out=terragrunt.tfplan' +``` + +The `tg` recipe treats the final argument as the Terragrunt operation string, so quoting lets you pass flags such as `-out=...` through the wrapper. 
The current saved-plan hook expects the binary plan filename to be `terragrunt.tfplan`; if you choose a different `-out` filename, the upload hook will not find it. + +Per-stack saved-plan bundles in S3 use the live stack identity rather than your full local filesystem path, for example `terragrunt-plan-dev-aws-oidc`. + +To apply that same saved plan later, reuse the same run id: + +```sh +TG_ENABLE_PLAN_ARTIFACTS=true \ +PLAN_ARTIFACT_RUN_ID="local-example-run" \ +just tg dev aws/oidc 'apply terragrunt.tfplan' +``` + ## Naming Conventions - `task_` diff --git a/infra/live/dependencies/cluster.hcl b/infra/live/dependencies/cluster.hcl new file mode 100644 index 00000000..2bbd9e09 --- /dev/null +++ b/infra/live/dependencies/cluster.hcl @@ -0,0 +1,15 @@ +dependency "cluster" { + config_path = "${get_original_terragrunt_dir()}/../cluster" + + mock_outputs = { + cluster_id = "mock-cluster-id" + cluster_name = "mock-cluster" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = { + cluster_id = dependency.cluster.outputs.cluster_id + cluster_name = dependency.cluster.outputs.cluster_name +} diff --git a/infra/live/dependencies/database.hcl b/infra/live/dependencies/database.hcl new file mode 100644 index 00000000..02dd17c6 --- /dev/null +++ b/infra/live/dependencies/database.hcl @@ -0,0 +1,21 @@ +dependency "database" { + config_path = "${get_original_terragrunt_dir()}/../database" + + mock_outputs = { + database_credentials_secret_arn = "arn:aws:secretsmanager:eu-west-2:111111111111:secret:mock-database-credentials" + database_readwrite_endpoint = "mock-database.cluster-abcdefghijkl.eu-west-2.rds.amazonaws.com" + database_name = "app" + database_port = 5432 + database_cluster_identifier = "mock-database-cluster" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = { + database_credentials_secret_arn = 
dependency.database.outputs.database_credentials_secret_arn + database_readwrite_endpoint = dependency.database.outputs.database_readwrite_endpoint + database_name = dependency.database.outputs.database_name + database_port = dependency.database.outputs.database_port + database_cluster_identifier = dependency.database.outputs.database_cluster_identifier +} diff --git a/infra/live/dependencies/frontend.hcl b/infra/live/dependencies/frontend.hcl new file mode 100644 index 00000000..01dba858 --- /dev/null +++ b/infra/live/dependencies/frontend.hcl @@ -0,0 +1,30 @@ +dependency "network" { + config_path = "${get_original_terragrunt_dir()}/../network" + + mock_outputs = { + api_invoke_url = "https://mockapi123.execute-api.eu-west-2.amazonaws.com" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +dependency "cognito" { + config_path = "${get_original_terragrunt_dir()}/../cognito" + + mock_outputs = { + auth_user_pool_id = "eu-west-2_mock" + auth_user_pool_client_id = "mock-user-pool-client-id" + auth_hosted_ui_url = "https://mock-domain.auth.eu-west-2.amazoncognito.com" + auth_readonly_group_name = "readonly" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = { + api_invoke_url = dependency.network.outputs.api_invoke_url + auth_user_pool_id = dependency.cognito.outputs.auth_user_pool_id + auth_user_pool_client_id = dependency.cognito.outputs.auth_user_pool_client_id + auth_hosted_ui_url = dependency.cognito.outputs.auth_hosted_ui_url + auth_readonly_group_name = dependency.cognito.outputs.auth_readonly_group_name +} diff --git a/infra/live/dependencies/messaging.hcl b/infra/live/dependencies/messaging.hcl new file mode 100644 index 00000000..12399468 --- /dev/null +++ b/infra/live/dependencies/messaging.hcl @@ -0,0 +1,22 @@ +dependency "messaging" { + config_path = "${get_original_terragrunt_dir()}/../messaging" + + mock_outputs = { + worker_topic_name = 
"mock-worker-events" + worker_topic_arn = "arn:aws:sns:eu-west-2:111111111111:mock-worker-events" + worker_topic_publish_policy_arn = "arn:aws:iam::111111111111:policy/mock-worker-topic-publish" + lambda_worker_queue_name = "mock-lambda-worker-queue" + lambda_worker_queue_arn = "arn:aws:sqs:eu-west-2:111111111111:mock-lambda-worker-queue" + lambda_worker_queue_url = "https://sqs.eu-west-2.amazonaws.com/111111111111/mock-lambda-worker-queue" + lambda_worker_queue_read_policy_arn = "arn:aws:iam::111111111111:policy/mock-lambda-worker-queue-read" + lambda_worker_dead_letter_queue_name = "mock-lambda-worker-dlq" + lambda_worker_dead_letter_queue_url = "https://sqs.eu-west-2.amazonaws.com/111111111111/mock-lambda-worker-dlq" + ecs_worker_queue_name = "mock-ecs-worker-queue" + ecs_worker_queue_url = "https://sqs.eu-west-2.amazonaws.com/111111111111/mock-ecs-worker-queue" + ecs_worker_queue_read_policy_arn = "arn:aws:iam::111111111111:policy/mock-ecs-worker-queue-read" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = dependency.messaging.outputs diff --git a/infra/live/dependencies/network.hcl b/infra/live/dependencies/network.hcl new file mode 100644 index 00000000..de2a1de5 --- /dev/null +++ b/infra/live/dependencies/network.hcl @@ -0,0 +1,35 @@ +dependency "network" { + config_path = "${get_original_terragrunt_dir()}/../network" + + mock_outputs = { + default_target_group_arn = "arn:aws:elasticloadbalancing:eu-west-2:111111111111:targetgroup/mock-default/1234567890abcdef" + load_balancer_arn = "arn:aws:elasticloadbalancing:eu-west-2:111111111111:loadbalancer/app/mock-internal/1234567890abcdef" + default_http_listener_arn = "arn:aws:elasticloadbalancing:eu-west-2:111111111111:listener/app/mock-internal/1234567890abcdef/abcdef1234567890" + load_balancer_arn_suffix = "app/mock-internal/1234567890abcdef" + target_group_arn_suffix = "targetgroup/mock-default/1234567890abcdef" + internal_invoke_url = 
"http://mock-internal-123456.eu-west-2.elb.amazonaws.com" + api_id = "mockapi123" + api_invoke_url = "https://mockapi123.execute-api.eu-west-2.amazonaws.com" + api_execution_arn = "arn:aws:execute-api:eu-west-2:111111111111:mockapi123" + api_stage_name = "$default" + vpc_link_id = "vpclink-mock123" + http_api_authorizer_id = "auth-mock123" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = { + network_default_target_group_arn = dependency.network.outputs.default_target_group_arn + network_load_balancer_arn = dependency.network.outputs.load_balancer_arn + network_default_http_listener_arn = dependency.network.outputs.default_http_listener_arn + network_load_balancer_arn_suffix = dependency.network.outputs.load_balancer_arn_suffix + network_target_group_arn_suffix = dependency.network.outputs.target_group_arn_suffix + network_internal_invoke_url = dependency.network.outputs.internal_invoke_url + network_api_id = dependency.network.outputs.api_id + network_api_invoke_url = dependency.network.outputs.api_invoke_url + network_api_execution_arn = dependency.network.outputs.api_execution_arn + network_api_stage_name = dependency.network.outputs.api_stage_name + network_vpc_link_id = dependency.network.outputs.vpc_link_id + network_http_api_authorizer_id = dependency.network.outputs.http_api_authorizer_id +} diff --git a/infra/live/dependencies/security.hcl b/infra/live/dependencies/security.hcl new file mode 100644 index 00000000..07ac0a9e --- /dev/null +++ b/infra/live/dependencies/security.hcl @@ -0,0 +1,14 @@ +dependency "security" { + config_path = "${get_original_terragrunt_dir()}/../security" + + mock_outputs = { + load_balancer_sg = "sg-00000000000000001" + api_vpc_link_sg = "sg-00000000000000002" + vpc_endpoint_sg = "sg-00000000000000003" + ecs_sg = "sg-00000000000000004" + runtime_sg = "sg-00000000000000005" + postgres_sg = "sg-00000000000000006" + } + + mock_outputs_allowed_terraform_commands = 
["validate", "plan", "destroy", "init", "show"] +} diff --git a/infra/live/dev/aws/database/terragrunt.hcl b/infra/live/dev/aws/database/terragrunt.hcl index 87094184..c6ee9ddc 100644 --- a/infra/live/dev/aws/database/terragrunt.hcl +++ b/infra/live/dev/aws/database/terragrunt.hcl @@ -2,15 +2,24 @@ include "root" { path = find_in_parent_folders("root.hcl") } -inputs = { - database_name = "app" - backup_retention_period = 1 - rds_min_capacity = 0.5 - rds_max_capacity = 1.0 - rds_max_reader_count = 0 - performance_insights_enabled = false +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") } terraform { source = "../../../../modules//aws//database" } + +inputs = merge( + { + database_security_group_id = dependency.security.outputs.postgres_sg + }, + { + database_name = "app" + backup_retention_period = 1 + rds_min_capacity = 0.5 + rds_max_capacity = 1.0 + rds_max_reader_count = 0 + performance_insights_enabled = false + }, +) diff --git a/infra/live/dev/aws/frontend/terragrunt.hcl b/infra/live/dev/aws/frontend/terragrunt.hcl index 3086bb63..fa22a838 100644 --- a/infra/live/dev/aws/frontend/terragrunt.hcl +++ b/infra/live/dev/aws/frontend/terragrunt.hcl @@ -2,6 +2,12 @@ include "root" { path = find_in_parent_folders("root.hcl") } +locals { + frontend = read_terragrunt_config(find_in_parent_folders("dependencies/frontend.hcl")) +} + terraform { source = "../../../../modules//aws//frontend" } + +inputs = local.frontend.inputs diff --git a/infra/live/dev/aws/lambda_api/terragrunt.hcl b/infra/live/dev/aws/lambda_api/terragrunt.hcl index 2b12aa0c..957503d9 100644 --- a/infra/live/dev/aws/lambda_api/terragrunt.hcl +++ b/infra/live/dev/aws/lambda_api/terragrunt.hcl @@ -2,30 +2,39 @@ include "root" { path = find_in_parent_folders("root.hcl") } -inputs = { - api_5xx_alarm_threshold = 20.0 - api_5xx_alarm_evaluation_periods = 1 - api_5xx_alarm_datapoints_to_alarm = 1 - - deployment_config = { - strategy = "canary" - percentage = 10 - 
interval_minutes = 3 - } - - provisioned_config = { - auto_scale = { - max = 2 - min = 1 - trigger_percent = 20 - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - - reserved_concurrency = 10 - } +locals { + network = read_terragrunt_config(find_in_parent_folders("dependencies/network.hcl")) + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) } terraform { source = "../../../../modules//aws//lambda_api" } + +inputs = merge( + local.network.inputs, + local.messaging.inputs, + { + api_5xx_alarm_threshold = 20.0 + api_5xx_alarm_evaluation_periods = 1 + api_5xx_alarm_datapoints_to_alarm = 1 + + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 3 + } + + provisioned_config = { + auto_scale = { + max = 2 + min = 1 + trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + + reserved_concurrency = 10 + } + }, +) diff --git a/infra/live/dev/aws/lambda_worker/terragrunt.hcl b/infra/live/dev/aws/lambda_worker/terragrunt.hcl index e25cfa13..bdb94fdd 100644 --- a/infra/live/dev/aws/lambda_worker/terragrunt.hcl +++ b/infra/live/dev/aws/lambda_worker/terragrunt.hcl @@ -2,28 +2,35 @@ include "root" { path = find_in_parent_folders("root.hcl") } -inputs = { - sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing) - sqs_dlq_alarm_evaluation_periods = 1 - sqs_dlq_alarm_datapoints_to_alarm = 1 - - deployment_config = { - strategy = "canary" - percentage = 50 - interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers - } - - provisioned_config = { - sqs_scale = { - min = 1 - max = 5 - visible_messages = 10 - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - } +locals { + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) } terraform { source = "../../../../modules//aws//lambda_worker" } + +inputs = 
merge( + local.messaging.inputs, + { + sqs_dlq_alarm_threshold = 1 # fail when any messages are in the DLQ (quick fail for testing) + sqs_dlq_alarm_evaluation_periods = 1 + sqs_dlq_alarm_datapoints_to_alarm = 1 + + deployment_config = { + strategy = "canary" + percentage = 50 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } + + provisioned_config = { + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + } + }, +) diff --git a/infra/live/dev/aws/worker_messaging/terragrunt.hcl b/infra/live/dev/aws/messaging/terragrunt.hcl similarity index 58% rename from infra/live/dev/aws/worker_messaging/terragrunt.hcl rename to infra/live/dev/aws/messaging/terragrunt.hcl index fba5f066..b85cf717 100644 --- a/infra/live/dev/aws/worker_messaging/terragrunt.hcl +++ b/infra/live/dev/aws/messaging/terragrunt.hcl @@ -3,5 +3,5 @@ include "root" { } terraform { - source = "../../../../modules//aws//worker_messaging" + source = "../../../../modules//aws//messaging" } diff --git a/infra/live/dev/aws/migrations/terragrunt.hcl b/infra/live/dev/aws/migrations/terragrunt.hcl index 0856befd..edc1b7f2 100644 --- a/infra/live/dev/aws/migrations/terragrunt.hcl +++ b/infra/live/dev/aws/migrations/terragrunt.hcl @@ -2,6 +2,21 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + +locals { + database = read_terragrunt_config(find_in_parent_folders("dependencies/database.hcl")) +} + terraform { source = "../../../../modules//aws//migrations" } + +inputs = merge( + { + runtime_security_group_id = dependency.security.outputs.runtime_sg + }, + local.database.inputs, +) diff --git a/infra/live/dev/aws/network/terragrunt.hcl b/infra/live/dev/aws/network/terragrunt.hcl index 92b17cab..93443bf3 100644 --- a/infra/live/dev/aws/network/terragrunt.hcl +++ 
b/infra/live/dev/aws/network/terragrunt.hcl @@ -2,6 +2,29 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + terraform { source = "../../../../modules//aws//network" } + +dependency "cognito" { + config_path = "${get_original_terragrunt_dir()}/../cognito" + + mock_outputs = { + auth_user_pool_client_id = "mock-user-pool-client-id" + auth_issuer_url = "https://cognito-idp.eu-west-2.amazonaws.com/eu-west-2_mock" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = { + load_balancer_sg = dependency.security.outputs.load_balancer_sg + api_vpc_link_sg = dependency.security.outputs.api_vpc_link_sg + vpc_endpoint_sg = dependency.security.outputs.vpc_endpoint_sg + auth_user_pool_client_id = dependency.cognito.outputs.auth_user_pool_client_id + auth_issuer_url = dependency.cognito.outputs.auth_issuer_url +} diff --git a/infra/live/dev/aws/rds_reader_tagger/terragrunt.hcl b/infra/live/dev/aws/rds_reader_tagger/terragrunt.hcl index 6d8d6e98..bc618a77 100644 --- a/infra/live/dev/aws/rds_reader_tagger/terragrunt.hcl +++ b/infra/live/dev/aws/rds_reader_tagger/terragrunt.hcl @@ -2,6 +2,12 @@ include "root" { path = find_in_parent_folders("root.hcl") } +locals { + database = read_terragrunt_config(find_in_parent_folders("dependencies/database.hcl")) +} + terraform { source = "../../../../modules//aws//rds_reader_tagger" } + +inputs = local.database.inputs diff --git a/infra/live/dev/aws/service_api/terragrunt.hcl b/infra/live/dev/aws/service_api/terragrunt.hcl index 97be2f29..501e7a44 100644 --- a/infra/live/dev/aws/service_api/terragrunt.hcl +++ b/infra/live/dev/aws/service_api/terragrunt.hcl @@ -2,6 +2,23 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + +locals { + cluster = 
read_terragrunt_config(find_in_parent_folders("dependencies/cluster.hcl")) + network = read_terragrunt_config(find_in_parent_folders("dependencies/network.hcl")) +} + terraform { source = "../../../../modules//aws//service_api" } + +inputs = merge( + { + ecs_security_group_id = dependency.security.outputs.ecs_sg + }, + local.cluster.inputs, + local.network.inputs, +) diff --git a/infra/live/dev/aws/service_worker/terragrunt.hcl b/infra/live/dev/aws/service_worker/terragrunt.hcl index 8e44b264..7b23580e 100644 --- a/infra/live/dev/aws/service_worker/terragrunt.hcl +++ b/infra/live/dev/aws/service_worker/terragrunt.hcl @@ -2,6 +2,25 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + +locals { + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) + cluster = read_terragrunt_config(find_in_parent_folders("dependencies/cluster.hcl")) + network = read_terragrunt_config(find_in_parent_folders("dependencies/network.hcl")) +} + terraform { source = "../../../../modules//aws//service_worker" } + +inputs = merge( + { + ecs_security_group_id = dependency.security.outputs.ecs_sg + }, + local.messaging.inputs, + local.cluster.inputs, + local.network.inputs, +) diff --git a/infra/live/dev/aws/task_worker/terragrunt.hcl b/infra/live/dev/aws/task_worker/terragrunt.hcl index b0a81635..b28cbca9 100644 --- a/infra/live/dev/aws/task_worker/terragrunt.hcl +++ b/infra/live/dev/aws/task_worker/terragrunt.hcl @@ -2,6 +2,13 @@ include "root" { path = find_in_parent_folders("root.hcl") } +locals { + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) + database = read_terragrunt_config(find_in_parent_folders("dependencies/database.hcl")) +} + terraform { source = "../../../../modules//aws//task_worker" } + +inputs = merge(local.messaging.inputs, local.database.inputs) diff --git a/infra/live/global_vars.hcl 
b/infra/live/global_vars.hcl index fac34513..4c04287b 100644 --- a/infra/live/global_vars.hcl +++ b/infra/live/global_vars.hcl @@ -24,7 +24,7 @@ locals { "secretsmanager:*", "kms:*", "acm:*", - "route53:*", + "route53:**", "cognito-idp:*", "tag:GetResources", ] diff --git a/infra/live/prod/aws/database/terragrunt.hcl b/infra/live/prod/aws/database/terragrunt.hcl index 4772e5da..5901ba70 100644 --- a/infra/live/prod/aws/database/terragrunt.hcl +++ b/infra/live/prod/aws/database/terragrunt.hcl @@ -2,16 +2,25 @@ include "root" { path = find_in_parent_folders("root.hcl") } -inputs = { - database_name = "app" - backup_retention_period = 7 - rds_min_capacity = 0.5 - rds_max_capacity = 2.0 - rds_max_reader_count = 1 - performance_insights_enabled = true - performance_insights_retention_period = 7 +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") } terraform { source = "../../../../modules//aws//database" } + +inputs = merge( + { + database_security_group_id = dependency.security.outputs.postgres_sg + }, + { + database_name = "app" + backup_retention_period = 7 + rds_min_capacity = 0.5 + rds_max_capacity = 2.0 + rds_max_reader_count = 1 + performance_insights_enabled = true + performance_insights_retention_period = 7 + }, +) diff --git a/infra/live/prod/aws/frontend/terragrunt.hcl b/infra/live/prod/aws/frontend/terragrunt.hcl index 3086bb63..fa22a838 100644 --- a/infra/live/prod/aws/frontend/terragrunt.hcl +++ b/infra/live/prod/aws/frontend/terragrunt.hcl @@ -2,6 +2,12 @@ include "root" { path = find_in_parent_folders("root.hcl") } +locals { + frontend = read_terragrunt_config(find_in_parent_folders("dependencies/frontend.hcl")) +} + terraform { source = "../../../../modules//aws//frontend" } + +inputs = local.frontend.inputs diff --git a/infra/live/prod/aws/lambda_api/terragrunt.hcl b/infra/live/prod/aws/lambda_api/terragrunt.hcl index f17ea0ab..8e29bf96 100644 --- a/infra/live/prod/aws/lambda_api/terragrunt.hcl +++ 
b/infra/live/prod/aws/lambda_api/terragrunt.hcl @@ -2,30 +2,39 @@ include "root" { path = find_in_parent_folders("root.hcl") } -inputs = { - api_5xx_alarm_threshold = 5.0 - api_5xx_alarm_evaluation_periods = 3 - api_5xx_alarm_datapoints_to_alarm = 3 - - deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 5 - } - - provisioned_config = { - auto_scale = { - max = 2 - min = 1 - trigger_percent = 20 - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - - reserved_concurrency = 10 - } +locals { + network = read_terragrunt_config(find_in_parent_folders("dependencies/network.hcl")) + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) } terraform { source = "../../../../modules//aws//lambda_api" } + +inputs = merge( + local.network.inputs, + local.messaging.inputs, + { + api_5xx_alarm_threshold = 5.0 + api_5xx_alarm_evaluation_periods = 3 + api_5xx_alarm_datapoints_to_alarm = 3 + + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 5 + } + + provisioned_config = { + auto_scale = { + max = 2 + min = 1 + trigger_percent = 20 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + + reserved_concurrency = 10 + } + }, +) diff --git a/infra/live/prod/aws/lambda_worker/terragrunt.hcl b/infra/live/prod/aws/lambda_worker/terragrunt.hcl index 21cdef71..294342f8 100644 --- a/infra/live/prod/aws/lambda_worker/terragrunt.hcl +++ b/infra/live/prod/aws/lambda_worker/terragrunt.hcl @@ -2,28 +2,35 @@ include "root" { path = find_in_parent_folders("root.hcl") } -inputs = { - sqs_dlq_alarm_threshold = 5 # fail when there are 5 messages in the DLQ - sqs_dlq_alarm_evaluation_periods = 3 - sqs_dlq_alarm_datapoints_to_alarm = 3 - - deployment_config = { - strategy = "canary" - percentage = 10 - interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers - } - - provisioned_config = { - sqs_scale = 
{ - min = 1 - max = 5 - visible_messages = 10 - scale_in_cooldown_seconds = 60 - scale_out_cooldown_seconds = 60 - } - } +locals { + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) } terraform { source = "../../../../modules//aws//lambda_worker" } + +inputs = merge( + local.messaging.inputs, + { + sqs_dlq_alarm_threshold = 5 # fail when there are 5 messages in the DLQ + sqs_dlq_alarm_evaluation_periods = 3 + sqs_dlq_alarm_datapoints_to_alarm = 3 + + deployment_config = { + strategy = "canary" + percentage = 10 + interval_minutes = 3 # this should be > the CloudWatch alarm evaluation period to ensure we catch the alarm if it triggers + } + + provisioned_config = { + sqs_scale = { + min = 1 + max = 5 + visible_messages = 10 + scale_in_cooldown_seconds = 60 + scale_out_cooldown_seconds = 60 + } + } + }, +) diff --git a/infra/live/prod/aws/worker_messaging/terragrunt.hcl b/infra/live/prod/aws/messaging/terragrunt.hcl similarity index 58% rename from infra/live/prod/aws/worker_messaging/terragrunt.hcl rename to infra/live/prod/aws/messaging/terragrunt.hcl index fba5f066..b85cf717 100644 --- a/infra/live/prod/aws/worker_messaging/terragrunt.hcl +++ b/infra/live/prod/aws/messaging/terragrunt.hcl @@ -3,5 +3,5 @@ include "root" { } terraform { - source = "../../../../modules//aws//worker_messaging" + source = "../../../../modules//aws//messaging" } diff --git a/infra/live/prod/aws/migrations/terragrunt.hcl b/infra/live/prod/aws/migrations/terragrunt.hcl index 0856befd..edc1b7f2 100644 --- a/infra/live/prod/aws/migrations/terragrunt.hcl +++ b/infra/live/prod/aws/migrations/terragrunt.hcl @@ -2,6 +2,21 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + +locals { + database = read_terragrunt_config(find_in_parent_folders("dependencies/database.hcl")) +} + terraform { source = "../../../../modules//aws//migrations" } + +inputs = merge( + 
{ + runtime_security_group_id = dependency.security.outputs.runtime_sg + }, + local.database.inputs, +) diff --git a/infra/live/prod/aws/network/terragrunt.hcl b/infra/live/prod/aws/network/terragrunt.hcl index 92b17cab..93443bf3 100644 --- a/infra/live/prod/aws/network/terragrunt.hcl +++ b/infra/live/prod/aws/network/terragrunt.hcl @@ -2,6 +2,29 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + terraform { source = "../../../../modules//aws//network" } + +dependency "cognito" { + config_path = "${get_original_terragrunt_dir()}/../cognito" + + mock_outputs = { + auth_user_pool_client_id = "mock-user-pool-client-id" + auth_issuer_url = "https://cognito-idp.eu-west-2.amazonaws.com/eu-west-2_mock" + } + + mock_outputs_allowed_terraform_commands = ["validate", "plan", "destroy", "init", "show"] +} + +inputs = { + load_balancer_sg = dependency.security.outputs.load_balancer_sg + api_vpc_link_sg = dependency.security.outputs.api_vpc_link_sg + vpc_endpoint_sg = dependency.security.outputs.vpc_endpoint_sg + auth_user_pool_client_id = dependency.cognito.outputs.auth_user_pool_client_id + auth_issuer_url = dependency.cognito.outputs.auth_issuer_url +} diff --git a/infra/live/prod/aws/rds_reader_tagger/terragrunt.hcl b/infra/live/prod/aws/rds_reader_tagger/terragrunt.hcl index 6d8d6e98..bc618a77 100644 --- a/infra/live/prod/aws/rds_reader_tagger/terragrunt.hcl +++ b/infra/live/prod/aws/rds_reader_tagger/terragrunt.hcl @@ -2,6 +2,12 @@ include "root" { path = find_in_parent_folders("root.hcl") } +locals { + database = read_terragrunt_config(find_in_parent_folders("dependencies/database.hcl")) +} + terraform { source = "../../../../modules//aws//rds_reader_tagger" } + +inputs = local.database.inputs diff --git a/infra/live/prod/aws/service_api/terragrunt.hcl b/infra/live/prod/aws/service_api/terragrunt.hcl index 97be2f29..501e7a44 100644 --- 
a/infra/live/prod/aws/service_api/terragrunt.hcl +++ b/infra/live/prod/aws/service_api/terragrunt.hcl @@ -2,6 +2,23 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + +locals { + cluster = read_terragrunt_config(find_in_parent_folders("dependencies/cluster.hcl")) + network = read_terragrunt_config(find_in_parent_folders("dependencies/network.hcl")) +} + terraform { source = "../../../../modules//aws//service_api" } + +inputs = merge( + { + ecs_security_group_id = dependency.security.outputs.ecs_sg + }, + local.cluster.inputs, + local.network.inputs, +) diff --git a/infra/live/prod/aws/service_worker/terragrunt.hcl b/infra/live/prod/aws/service_worker/terragrunt.hcl index 8e44b264..7b23580e 100644 --- a/infra/live/prod/aws/service_worker/terragrunt.hcl +++ b/infra/live/prod/aws/service_worker/terragrunt.hcl @@ -2,6 +2,25 @@ include "root" { path = find_in_parent_folders("root.hcl") } +include "security" { + path = find_in_parent_folders("dependencies/security.hcl") +} + +locals { + messaging = read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) + cluster = read_terragrunt_config(find_in_parent_folders("dependencies/cluster.hcl")) + network = read_terragrunt_config(find_in_parent_folders("dependencies/network.hcl")) +} + terraform { source = "../../../../modules//aws//service_worker" } + +inputs = merge( + { + ecs_security_group_id = dependency.security.outputs.ecs_sg + }, + local.messaging.inputs, + local.cluster.inputs, + local.network.inputs, +) diff --git a/infra/live/prod/aws/task_worker/terragrunt.hcl b/infra/live/prod/aws/task_worker/terragrunt.hcl index b0a81635..b28cbca9 100644 --- a/infra/live/prod/aws/task_worker/terragrunt.hcl +++ b/infra/live/prod/aws/task_worker/terragrunt.hcl @@ -2,6 +2,13 @@ include "root" { path = find_in_parent_folders("root.hcl") } +locals { + messaging = 
read_terragrunt_config(find_in_parent_folders("dependencies/messaging.hcl")) + database = read_terragrunt_config(find_in_parent_folders("dependencies/database.hcl")) +} + terraform { source = "../../../../modules//aws//task_worker" } + +inputs = merge(local.messaging.inputs, local.database.inputs) diff --git a/infra/live/prod/environment_vars.hcl b/infra/live/prod/environment_vars.hcl index e2f272bf..be4a6c71 100644 --- a/infra/live/prod/environment_vars.hcl +++ b/infra/live/prod/environment_vars.hcl @@ -1,14 +1,16 @@ locals { - log_retention_days = 14 - deploy_branches = ["main"] - cognito_callback_urls = ["http://localhost:5173"] - cognito_logout_urls = ["http://localhost:5173"] + log_retention_days = 14 + deploy_branches = ["main"] + cognito_callback_urls = ["http://localhost:5173"] + cognito_logout_urls = ["http://localhost:5173"] + infra_plan_artifact_expiration_days = 30 } inputs = { - log_retention_days = local.log_retention_days - deploy_branches = local.deploy_branches - otel_sample_rate = 0.1 # 10% of traces sampled - callback_urls = local.cognito_callback_urls - logout_urls = local.cognito_logout_urls + log_retention_days = local.log_retention_days + deploy_branches = local.deploy_branches + otel_sample_rate = 0.1 # 10% of traces sampled + callback_urls = local.cognito_callback_urls + logout_urls = local.cognito_logout_urls + infra_plan_artifact_expiration_days = local.infra_plan_artifact_expiration_days } diff --git a/infra/modules/aws/cognito/README.md b/infra/modules/aws/cognito/README.md index 8447be65..1f635eb9 100644 --- a/infra/modules/aws/cognito/README.md +++ b/infra/modules/aws/cognito/README.md @@ -19,13 +19,13 @@ Concrete Cognito user-auth module for the frontend and HTTP API. 
## Key outputs -- `user_pool_id` -- `user_pool_arn` -- `user_pool_client_id` -- `issuer_url` -- `hosted_ui_url` -- `hosted_ui_domain` -- `readonly_group_name` +- `auth_user_pool_id` +- `auth_user_pool_arn` +- `auth_user_pool_client_id` +- `auth_issuer_url` +- `auth_hosted_ui_url` +- `auth_hosted_ui_domain` +- `auth_readonly_group_name` This module intentionally creates infrastructure, not individual users. In this repo, user seeding is expected to happen operationally with AWS CLI or `just` recipes so access can be granted explicitly to a small allowlist such as the initial `readonly` user. The module derives the deployed frontend URL as `https://..` and adds it to the Hosted UI callback and logout URLs alongside any local development URLs. diff --git a/infra/modules/aws/cognito/outputs.tf b/infra/modules/aws/cognito/outputs.tf index 8ddbc2ef..d7d31e2f 100644 --- a/infra/modules/aws/cognito/outputs.tf +++ b/infra/modules/aws/cognito/outputs.tf @@ -1,27 +1,27 @@ -output "user_pool_id" { +output "auth_user_pool_id" { value = aws_cognito_user_pool.this.id } -output "user_pool_arn" { +output "auth_user_pool_arn" { value = aws_cognito_user_pool.this.arn } -output "user_pool_client_id" { +output "auth_user_pool_client_id" { value = aws_cognito_user_pool_client.frontend.id } -output "issuer_url" { +output "auth_issuer_url" { value = local.issuer_url } -output "hosted_ui_url" { +output "auth_hosted_ui_url" { value = local.hosted_ui_url } -output "hosted_ui_domain" { +output "auth_hosted_ui_domain" { value = aws_cognito_user_pool_domain.this.domain } -output "readonly_group_name" { +output "auth_readonly_group_name" { value = aws_cognito_user_group.readonly.name } diff --git a/infra/modules/aws/database/README.md b/infra/modules/aws/database/README.md index 3f4cf9c0..1d968481 100644 --- a/infra/modules/aws/database/README.md +++ b/infra/modules/aws/database/README.md @@ -27,15 +27,15 @@ Concrete Aurora PostgreSQL wrapper. 
## Key outputs -- `cluster_identifier` -- `security_group_id` -- `credentials_secret_arn` -- `readonly_endpoint_ssm_name` -- `readwrite_endpoint_ssm_name` +- `database_cluster_identifier` +- `database_security_group_id` +- `database_credentials_secret_arn` +- `database_readonly_endpoint_ssm_name` +- `database_readwrite_endpoint_ssm_name` - `database_name` - `database_port` -- `readonly_endpoint` -- `readwrite_endpoint` +- `database_readonly_endpoint` +- `database_readwrite_endpoint` This module keeps repo-specific network lookup logic out of `_shared/database`. It selects public or private subnets by `tag:Name` based on `publicly_accessible` and passes the resulting subnet ids into the shared Aurora module. The database credentials outputs point at the Aurora-managed master secret rather than a repo-created fixed-name secret. diff --git a/infra/modules/aws/database/outputs.tf b/infra/modules/aws/database/outputs.tf index 93afbf47..df108b78 100644 --- a/infra/modules/aws/database/outputs.tf +++ b/infra/modules/aws/database/outputs.tf @@ -1,20 +1,20 @@ -output "credentials_secret_arn" { +output "database_credentials_secret_arn" { value = module.database.credentials_secret_arn } -output "readonly_endpoint_ssm_name" { +output "database_readonly_endpoint_ssm_name" { value = module.database.readonly_endpoint_ssm_name } -output "readwrite_endpoint_ssm_name" { +output "database_readwrite_endpoint_ssm_name" { value = module.database.readwrite_endpoint_ssm_name } -output "cluster_identifier" { +output "database_cluster_identifier" { value = module.database.cluster_identifier } -output "security_group_id" { +output "database_security_group_id" { value = module.database.security_group_id } @@ -26,10 +26,10 @@ output "database_port" { value = module.database.database_port } -output "readonly_endpoint" { +output "database_readonly_endpoint" { value = module.database.readonly_endpoint } -output "readwrite_endpoint" { +output "database_readwrite_endpoint" { value = 
module.database.readwrite_endpoint } diff --git a/infra/modules/aws/lambda_api/README.md b/infra/modules/aws/lambda_api/README.md index ca353d15..212c0a50 100644 --- a/infra/modules/aws/lambda_api/README.md +++ b/infra/modules/aws/lambda_api/README.md @@ -13,7 +13,7 @@ Lambda-backed public HTTP API module. ## Dependencies - shared API Gateway HTTP API, VPC link, and JWT authorizer from `network` -- shared worker SNS topic from `worker_messaging` +- shared worker SNS topic from `messaging` ## Key outputs @@ -25,3 +25,4 @@ Lambda-backed public HTTP API module. This module is Lambda-specific. The shared API surface and shared JWT authorizer now live in `network`. When accessed through the frontend CloudFront distribution, the public Lambda path is `/api/*` because CloudFront strips the leading `/api` prefix before forwarding to API Gateway. The packaged runtime can publish JSON payloads to the shared worker SNS topic via `POST /messages`, which fans the message out to both the Lambda and ECS worker queues. +The public `GET /health` route is intentionally left unauthenticated so external uptime checks do not need a JWT, while the catch-all API routes remain JWT-protected. 
diff --git a/infra/modules/aws/lambda_api/data.tf b/infra/modules/aws/lambda_api/data.tf index 1a52292a..02351f14 100644 --- a/infra/modules/aws/lambda_api/data.tf +++ b/infra/modules/aws/lambda_api/data.tf @@ -1,23 +1,3 @@ -data "terraform_remote_state" "network" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/network/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "worker_messaging" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/worker_messaging/terraform.tfstate" - region = var.aws_region - } -} - data "aws_iam_policy_document" "worker_topic_publish" { statement { actions = [ @@ -25,7 +5,7 @@ data "aws_iam_policy_document" "worker_topic_publish" { ] resources = [ - data.terraform_remote_state.worker_messaging.outputs.sns_topic_arn, + var.worker_topic_arn, ] } } diff --git a/infra/modules/aws/lambda_api/main.tf b/infra/modules/aws/lambda_api/main.tf index cbdd57fb..ef453167 100644 --- a/infra/modules/aws/lambda_api/main.tf +++ b/infra/modules/aws/lambda_api/main.tf @@ -15,8 +15,8 @@ module "lambda_api" { environment_variables = { DEBUG_DELAY_MS = 500 - WORKER_TOPIC_ARN = data.terraform_remote_state.worker_messaging.outputs.sns_topic_arn - WORKER_TOPIC_NAME = data.terraform_remote_state.worker_messaging.outputs.sns_topic_name + WORKER_TOPIC_ARN = var.worker_topic_arn + WORKER_TOPIC_NAME = var.worker_topic_name } additional_policy_arns = [ @@ -33,26 +33,32 @@ module "lambda_api" { } resource "aws_apigatewayv2_integration" "lambda_proxy" { - api_id = data.terraform_remote_state.network.outputs.api_id + api_id = var.network_api_id integration_type = "AWS_PROXY" integration_uri = module.lambda_api.alias_arn payload_format_version = "2.0" } resource "aws_apigatewayv2_route" "root" { - api_id = data.terraform_remote_state.network.outputs.api_id + api_id = var.network_api_id route_key = "ANY /" target = 
"integrations/${aws_apigatewayv2_integration.lambda_proxy.id}" authorization_type = "JWT" - authorizer_id = data.terraform_remote_state.network.outputs.http_api_authorizer_id + authorizer_id = var.network_http_api_authorizer_id +} + +resource "aws_apigatewayv2_route" "health" { + api_id = var.network_api_id + route_key = "GET /health" + target = "integrations/${aws_apigatewayv2_integration.lambda_proxy.id}" } resource "aws_apigatewayv2_route" "proxy" { - api_id = data.terraform_remote_state.network.outputs.api_id + api_id = var.network_api_id route_key = "ANY /{proxy+}" target = "integrations/${aws_apigatewayv2_integration.lambda_proxy.id}" authorization_type = "JWT" - authorizer_id = data.terraform_remote_state.network.outputs.http_api_authorizer_id + authorizer_id = var.network_http_api_authorizer_id } resource "aws_lambda_permission" "allow_invoke" { @@ -60,7 +66,7 @@ resource "aws_lambda_permission" "allow_invoke" { action = "lambda:InvokeFunction" function_name = module.lambda_api.alias_arn principal = "apigateway.amazonaws.com" - source_arn = "${data.terraform_remote_state.network.outputs.api_execution_arn}/*/*" # all routes/stages + source_arn = "${var.network_api_execution_arn}/*/*" # all routes/stages } resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { @@ -97,8 +103,8 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { period = 60 # most aws metrics are emitted at 1-minute intervals, so using a shorter period can lead to more volatile alarms dimensions = { - ApiId = data.terraform_remote_state.network.outputs.api_id - Stage = data.terraform_remote_state.network.outputs.api_stage_name + ApiId = var.network_api_id + Stage = var.network_api_stage_name } } } @@ -115,8 +121,8 @@ resource "aws_cloudwatch_metric_alarm" "api_5xx_rate" { period = 60 dimensions = { - ApiId = data.terraform_remote_state.network.outputs.api_id - Stage = data.terraform_remote_state.network.outputs.api_stage_name + ApiId = var.network_api_id + Stage = 
var.network_api_stage_name } } } diff --git a/infra/modules/aws/lambda_api/outputs.tf b/infra/modules/aws/lambda_api/outputs.tf index 770a3861..3be5dfba 100644 --- a/infra/modules/aws/lambda_api/outputs.tf +++ b/infra/modules/aws/lambda_api/outputs.tf @@ -1,13 +1,13 @@ output "invoke_url" { - value = data.terraform_remote_state.network.outputs.api_invoke_url + value = var.network_api_invoke_url } output "api_id" { - value = data.terraform_remote_state.network.outputs.api_id + value = var.network_api_id } output "vpc_link_id" { - value = data.terraform_remote_state.network.outputs.vpc_link_id + value = var.network_vpc_link_id } output "cloudwatch_log_group" { diff --git a/infra/modules/aws/lambda_api/variables.tf b/infra/modules/aws/lambda_api/variables.tf index dba57dda..209139c2 100644 --- a/infra/modules/aws/lambda_api/variables.tf +++ b/infra/modules/aws/lambda_api/variables.tf @@ -11,7 +11,7 @@ variable "environment" { variable "aws_region" { type = string - description = "AWS region used for remote state lookups" + description = "AWS region used for provider resources" } variable "state_bucket" { @@ -79,3 +79,35 @@ variable "vpc_name" { type = string description = "VPC name tag used to look up private subnets for the shared API Gateway VPC link" } + +variable "network_api_id" { + type = string +} + +variable "network_api_invoke_url" { + type = string +} + +variable "network_api_execution_arn" { + type = string +} + +variable "network_api_stage_name" { + type = string +} + +variable "network_vpc_link_id" { + type = string +} + +variable "network_http_api_authorizer_id" { + type = string +} + +variable "worker_topic_arn" { + type = string +} + +variable "worker_topic_name" { + type = string +} diff --git a/infra/modules/aws/lambda_worker/README.md b/infra/modules/aws/lambda_worker/README.md index 5fa82303..9009bd15 100644 --- a/infra/modules/aws/lambda_worker/README.md +++ b/infra/modules/aws/lambda_worker/README.md @@ -5,7 +5,7 @@ Worker Lambda wrapper module. 
## Owns - worker Lambda via `_shared/lambda` -- Lambda worker event-source mapping onto the shared worker messaging queue +- Lambda worker event-source mapping onto the shared messaging queue - DLQ alarming for the Lambda worker queue ## Key outputs @@ -15,4 +15,4 @@ Worker Lambda wrapper module. - SQS read policy ARN - log group -This is the concrete worker implementation on top of the shared Lambda primitives. It reads the Lambda worker queue from the `worker_messaging` stack so the same SNS event can fan out to both the Lambda and ECS worker consumers. +This is the concrete worker implementation on top of the shared Lambda primitives. It reads the Lambda worker queue from the `messaging` stack so the same SNS event can fan out to both the Lambda and ECS worker consumers. diff --git a/infra/modules/aws/lambda_worker/data.tf b/infra/modules/aws/lambda_worker/data.tf deleted file mode 100644 index 5c9e2ae6..00000000 --- a/infra/modules/aws/lambda_worker/data.tf +++ /dev/null @@ -1,9 +0,0 @@ -data "terraform_remote_state" "worker_messaging" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/worker_messaging/terraform.tfstate" - region = var.aws_region - } -} diff --git a/infra/modules/aws/lambda_worker/main.tf b/infra/modules/aws/lambda_worker/main.tf index 7d2df03f..a340afa9 100644 --- a/infra/modules/aws/lambda_worker/main.tf +++ b/infra/modules/aws/lambda_worker/main.tf @@ -14,7 +14,7 @@ module "lambda_worker" { } additional_policy_arns = [ - data.terraform_remote_state.worker_messaging.outputs.lambda_worker_queue_read_policy_arn + var.lambda_worker_queue_read_policy_arn ] deployment_config = var.deployment_config @@ -29,7 +29,7 @@ module "lambda_worker" { sqs_scale = merge( var.provisioned_config.sqs_scale, { - queue_name = data.terraform_remote_state.worker_messaging.outputs.lambda_worker_queue_name + queue_name = var.lambda_worker_queue_name } ) } @@ -37,7 +37,7 @@ module "lambda_worker" { } resource 
"aws_lambda_event_source_mapping" "sqs" { - event_source_arn = data.terraform_remote_state.worker_messaging.outputs.lambda_worker_queue_arn + event_source_arn = var.lambda_worker_queue_arn function_name = module.lambda_worker.function_name batch_size = local.sqs_chunk_size @@ -47,8 +47,8 @@ resource "aws_lambda_event_source_mapping" "sqs" { } resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" { - alarm_name = "${data.terraform_remote_state.worker_messaging.outputs.lambda_worker_dead_letter_queue_name}-new-messages" - alarm_description = "New messages sent to DLQ ${data.terraform_remote_state.worker_messaging.outputs.lambda_worker_dead_letter_queue_name}" + alarm_name = "${var.lambda_worker_dead_letter_queue_name}-new-messages" + alarm_description = "New messages sent to DLQ ${var.lambda_worker_dead_letter_queue_name}" actions_enabled = true namespace = "AWS/SQS" @@ -64,6 +64,6 @@ resource "aws_cloudwatch_metric_alarm" "dlq_new_messages" { treat_missing_data = "notBreaching" dimensions = { - QueueName = data.terraform_remote_state.worker_messaging.outputs.lambda_worker_dead_letter_queue_name + QueueName = var.lambda_worker_dead_letter_queue_name } } diff --git a/infra/modules/aws/lambda_worker/outputs.tf b/infra/modules/aws/lambda_worker/outputs.tf index 9a93388c..aadade6d 100644 --- a/infra/modules/aws/lambda_worker/outputs.tf +++ b/infra/modules/aws/lambda_worker/outputs.tf @@ -15,17 +15,17 @@ output "lambda_alias_name" { } output "sqs_queue_url" { - value = data.terraform_remote_state.worker_messaging.outputs.lambda_worker_queue_url + value = var.lambda_worker_queue_url } output "sqs_queue_name" { - value = data.terraform_remote_state.worker_messaging.outputs.lambda_worker_queue_name + value = var.lambda_worker_queue_name } output "sqs_queue_read_policy_arn" { - value = data.terraform_remote_state.worker_messaging.outputs.lambda_worker_queue_read_policy_arn + value = var.lambda_worker_queue_read_policy_arn } output "dead_letter_queue_url" { - value = 
data.terraform_remote_state.worker_messaging.outputs.lambda_worker_dead_letter_queue_url + value = var.lambda_worker_dead_letter_queue_url } diff --git a/infra/modules/aws/lambda_worker/variables.tf b/infra/modules/aws/lambda_worker/variables.tf index fa8be37f..d3550685 100644 --- a/infra/modules/aws/lambda_worker/variables.tf +++ b/infra/modules/aws/lambda_worker/variables.tf @@ -11,12 +11,12 @@ variable "environment" { variable "state_bucket" { type = string - description = "Remote state bucket used to read shared stack outputs" + description = "Terraform state bucket" } variable "aws_region" { type = string - description = "AWS region for remote state and provider resources" + description = "AWS region for provider resources" } variable "code_bucket" { @@ -75,3 +75,27 @@ variable "sqs_dlq_alarm_datapoints_to_alarm" { type = number description = "The number of evaluated periods that must be breaching to trigger ALARM" } + +variable "lambda_worker_queue_name" { + type = string +} + +variable "lambda_worker_queue_arn" { + type = string +} + +variable "lambda_worker_queue_url" { + type = string +} + +variable "lambda_worker_queue_read_policy_arn" { + type = string +} + +variable "lambda_worker_dead_letter_queue_name" { + type = string +} + +variable "lambda_worker_dead_letter_queue_url" { + type = string +} diff --git a/infra/modules/aws/worker_messaging/README.md b/infra/modules/aws/messaging/README.md similarity index 82% rename from infra/modules/aws/worker_messaging/README.md rename to infra/modules/aws/messaging/README.md index fa2a55d5..8790208d 100644 --- a/infra/modules/aws/worker_messaging/README.md +++ b/infra/modules/aws/messaging/README.md @@ -1,6 +1,6 @@ -# `worker_messaging` +# `messaging` -Shared worker messaging stack. +Shared messaging stack. ## Owns @@ -11,8 +11,9 @@ Shared worker messaging stack. 
## Key outputs -- `sns_topic_arn` -- `sns_topic_publish_policy_arn` +- `worker_topic_name` +- `worker_topic_arn` +- `worker_topic_publish_policy_arn` - `lambda_worker_queue_name` - `lambda_worker_queue_url` - `lambda_worker_queue_read_policy_arn` diff --git a/infra/modules/aws/worker_messaging/data.tf b/infra/modules/aws/messaging/data.tf similarity index 100% rename from infra/modules/aws/worker_messaging/data.tf rename to infra/modules/aws/messaging/data.tf diff --git a/infra/modules/aws/worker_messaging/local.tf b/infra/modules/aws/messaging/local.tf similarity index 100% rename from infra/modules/aws/worker_messaging/local.tf rename to infra/modules/aws/messaging/local.tf diff --git a/infra/modules/aws/worker_messaging/main.tf b/infra/modules/aws/messaging/main.tf similarity index 100% rename from infra/modules/aws/worker_messaging/main.tf rename to infra/modules/aws/messaging/main.tf diff --git a/infra/modules/aws/worker_messaging/outputs.tf b/infra/modules/aws/messaging/outputs.tf similarity index 94% rename from infra/modules/aws/worker_messaging/outputs.tf rename to infra/modules/aws/messaging/outputs.tf index a365faf1..fe513772 100644 --- a/infra/modules/aws/worker_messaging/outputs.tf +++ b/infra/modules/aws/messaging/outputs.tf @@ -1,12 +1,12 @@ -output "sns_topic_name" { +output "worker_topic_name" { value = aws_sns_topic.worker_events.name } -output "sns_topic_arn" { +output "worker_topic_arn" { value = aws_sns_topic.worker_events.arn } -output "sns_topic_publish_policy_arn" { +output "worker_topic_publish_policy_arn" { value = aws_iam_policy.topic_publish.arn } diff --git a/infra/modules/aws/worker_messaging/variables.tf b/infra/modules/aws/messaging/variables.tf similarity index 100% rename from infra/modules/aws/worker_messaging/variables.tf rename to infra/modules/aws/messaging/variables.tf diff --git a/infra/modules/aws/worker_messaging/versions.tf b/infra/modules/aws/messaging/versions.tf similarity index 100% rename from 
infra/modules/aws/worker_messaging/versions.tf rename to infra/modules/aws/messaging/versions.tf diff --git a/infra/modules/aws/migrations/README.md b/infra/modules/aws/migrations/README.md index 1fc35b48..35ec2507 100644 --- a/infra/modules/aws/migrations/README.md +++ b/infra/modules/aws/migrations/README.md @@ -17,5 +17,6 @@ Lambda wrapper for database migrations using packaged SQLAlchemy models. - `cloudwatch_log_group` This module is intended for manual or pipeline-triggered schema migrations against the shared Aurora PostgreSQL database. It runs inside the VPC and reuses the shared runtime security group from `security` so it can reach the database without introducing a second database-ingress rule pattern. +The live Terragrunt stack is expected to pass that runtime security group id as an explicit input. For bootstrap-friendly plan and validate flows, prefer Terragrunt dependency mocks in the live stack instead of direct `security` remote-state reads in the module. The current handler loads the packaged SQLAlchemy models, checks whether its owned table already exists, and creates the declared table metadata directly in the default schema when needed for the worker runtime. In this repo's reusable code deploy workflow, the function is also invoked automatically when `migrations` is part of the Lambda deployment matrix. 
diff --git a/infra/modules/aws/migrations/data.tf b/infra/modules/aws/migrations/data.tf index 2c4aca5e..9aa389d5 100644 --- a/infra/modules/aws/migrations/data.tf +++ b/infra/modules/aws/migrations/data.tf @@ -1,23 +1,3 @@ -data "terraform_remote_state" "database" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/database/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "security" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/security/terraform.tfstate" - region = var.aws_region - } -} - data "aws_vpc" "this" { filter { name = "tag:Name" @@ -44,7 +24,7 @@ data "aws_iam_policy_document" "database_secret_read" { ] resources = [ - data.terraform_remote_state.database.outputs.credentials_secret_arn, + var.database_credentials_secret_arn, ] } } diff --git a/infra/modules/aws/migrations/main.tf b/infra/modules/aws/migrations/main.tf index 283970b2..376baf73 100644 --- a/infra/modules/aws/migrations/main.tf +++ b/infra/modules/aws/migrations/main.tf @@ -15,10 +15,10 @@ module "migrations" { lambda_name = local.lambda_name environment_variables = { - DB_HOST = data.terraform_remote_state.database.outputs.readwrite_endpoint - DB_NAME = data.terraform_remote_state.database.outputs.database_name - DB_PORT = tostring(data.terraform_remote_state.database.outputs.database_port) - DB_SECRET_ARN = data.terraform_remote_state.database.outputs.credentials_secret_arn + DB_HOST = var.database_readwrite_endpoint + DB_NAME = var.database_name + DB_PORT = tostring(var.database_port) + DB_SECRET_ARN = var.database_credentials_secret_arn } additional_policy_arns = [ @@ -27,6 +27,6 @@ module "migrations" { vpc_subnet_ids = data.aws_subnets.private.ids vpc_security_group_ids = [ - data.terraform_remote_state.security.outputs.runtime_sg, + var.runtime_security_group_id, ] } diff --git a/infra/modules/aws/migrations/variables.tf 
b/infra/modules/aws/migrations/variables.tf index 4bdb1076..75be630a 100644 --- a/infra/modules/aws/migrations/variables.tf +++ b/infra/modules/aws/migrations/variables.tf @@ -26,3 +26,23 @@ variable "otel_sample_rate" { variable "vpc_name" { type = string } + +variable "runtime_security_group_id" { + type = string +} + +variable "database_readwrite_endpoint" { + type = string +} + +variable "database_name" { + type = string +} + +variable "database_port" { + type = number +} + +variable "database_credentials_secret_arn" { + type = string +} diff --git a/infra/modules/aws/network/README.md b/infra/modules/aws/network/README.md index 02d76f72..e7c58a19 100644 --- a/infra/modules/aws/network/README.md +++ b/infra/modules/aws/network/README.md @@ -31,28 +31,20 @@ In the common ECS API shape used here: ## Dependencies - pre-existing tagged VPC and private subnets discovered with `data` lookups -- shared security groups from `security` -- `cognito` remote state for the shared JWT issuer and audience +- shared security-group outputs from the `security` live stack +- shared Cognito outputs from the `cognito` live stack for the JWT issuer and audience + +The live Terragrunt stack is expected to provide those upstream values as explicit module inputs. For plan and validate flows before upstream stacks exist, prefer Terragrunt `dependency` mocks in the live stack instead of reading cross-stack state directly inside the Terraform module. ## Bootstrap Notes -This module is not bootstrap-independent. It reads multiple outputs from the `security` stack through remote state, including `vpc_endpoint_sg` for the interface VPC endpoints and `api_vpc_link_sg` for the shared API Gateway VPC link. +This module still depends on upstream `security` and `cognito` stacks at apply time, but the bootstrap-sensitive contract should live in the Terragrunt wrapper rather than in Terraform `terraform_remote_state` blocks inside the module. 
That means: -- `security` must be applied successfully before `network` -- the `security` state file must contain the current outputs, not just an empty or partially initialized state -- a failed or stale bootstrap of `security` can surface here as an `Unsupported attribute` error when Terraform tries to read `data.terraform_remote_state.security.outputs.*` - -If you see an error like: - -```text -Error: Unsupported attribute -data.terraform_remote_state.security.outputs is object with no attributes -This object does not have an attribute named "vpc_endpoint_sg". -``` - -then the problem is usually not the `network` module itself. It means the upstream `security` stack has not produced readable outputs yet. In that case, apply `security` first and confirm its state includes `vpc_endpoint_sg`, `api_vpc_link_sg`, and the other expected outputs before retrying `network`. +- `security` and `cognito` still need to exist for real applies +- plan and validate flows can use Terragrunt `dependency` mocks when those upstream stacks are not available yet +- if apply-time values are missing, fix the upstream stack or the live-stack dependency wiring rather than adding direct cross-stack remote-state reads back into the module ## Feasibility Constraints diff --git a/infra/modules/aws/network/data.tf b/infra/modules/aws/network/data.tf index 918deda7..a346ce5a 100644 --- a/infra/modules/aws/network/data.tf +++ b/infra/modules/aws/network/data.tf @@ -23,23 +23,3 @@ data "aws_route_tables" "private" { values = data.aws_subnets.private.ids } } - -data "terraform_remote_state" "security" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/security/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "cognito" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/cognito/terraform.tfstate" - region = var.aws_region - } -} diff --git a/infra/modules/aws/network/main.tf 
b/infra/modules/aws/network/main.tf index d8fd0272..5c18afd5 100644 --- a/infra/modules/aws/network/main.tf +++ b/infra/modules/aws/network/main.tf @@ -2,7 +2,7 @@ resource "aws_lb" "this" { name = local.load_balancer_name internal = true load_balancer_type = "application" - security_groups = [data.terraform_remote_state.security.outputs.load_balancer_sg] + security_groups = [var.load_balancer_sg] subnets = data.aws_subnets.private.ids } @@ -14,7 +14,7 @@ resource "aws_apigatewayv2_api" "http_api" { resource "aws_apigatewayv2_vpc_link" "http_api" { name = "${var.project_name}-${var.environment}-http-vpc-link" subnet_ids = data.aws_subnets.private.ids - security_group_ids = [data.terraform_remote_state.security.outputs.api_vpc_link_sg] + security_group_ids = [var.api_vpc_link_sg] } resource "aws_apigatewayv2_stage" "default" { @@ -30,8 +30,8 @@ resource "aws_apigatewayv2_authorizer" "cognito_jwt" { identity_sources = ["$request.header.Authorization"] jwt_configuration { - audience = [data.terraform_remote_state.cognito.outputs.user_pool_client_id] - issuer = data.terraform_remote_state.cognito.outputs.issuer_url + audience = [var.auth_user_pool_client_id] + issuer = var.auth_issuer_url } } @@ -41,7 +41,7 @@ resource "aws_vpc_endpoint" "interface_endpoints" { vpc_id = data.aws_vpc.this.id service_name = "com.amazonaws.${var.aws_region}.${each.value}" vpc_endpoint_type = "Interface" - security_group_ids = [data.terraform_remote_state.security.outputs.vpc_endpoint_sg] + security_group_ids = [var.vpc_endpoint_sg] subnet_ids = data.aws_subnets.private.ids private_dns_enabled = true } diff --git a/infra/modules/aws/network/variables.tf b/infra/modules/aws/network/variables.tf index 6242bad8..f8830886 100644 --- a/infra/modules/aws/network/variables.tf +++ b/infra/modules/aws/network/variables.tf @@ -16,6 +16,26 @@ variable "state_bucket" { } ### end of static vars set in root.hcl ### +variable "load_balancer_sg" { + type = string +} + +variable "api_vpc_link_sg" { + type 
= string +} + +variable "vpc_endpoint_sg" { + type = string +} + +variable "auth_user_pool_client_id" { + type = string +} + +variable "auth_issuer_url" { + type = string +} + variable "vpc_name" { type = string } diff --git a/infra/modules/aws/rds_reader_tagger/README.md b/infra/modules/aws/rds_reader_tagger/README.md index 6f4c0719..5f23e197 100644 --- a/infra/modules/aws/rds_reader_tagger/README.md +++ b/infra/modules/aws/rds_reader_tagger/README.md @@ -30,7 +30,7 @@ EventBridge-triggered and directly invokable Lambda that syncs cluster tags onto ## Dependency Notes -- reads the shared `database` remote state to get the expected Aurora cluster identifier +- expects the live Terragrunt stack to pass the shared `database` cluster identifier through a `dependency` block - relies on the shared Lambda build and deploy flow for shipping the tagging code - when `rds_reader_tagger` is present in the Lambda deploy matrix, the reusable `deploy.yml` workflow invokes it once after Lambda rollout so existing readers are reconciled too - uses a shortened AWS resource-name prefix (`rds-tag-sync`) so the Lambda, IAM, CodeDeploy, and EventBridge resources stay within AWS name limits while the stack directory remains `rds_reader_tagger` diff --git a/infra/modules/aws/rds_reader_tagger/data.tf b/infra/modules/aws/rds_reader_tagger/data.tf index 941ad6ca..23579557 100644 --- a/infra/modules/aws/rds_reader_tagger/data.tf +++ b/infra/modules/aws/rds_reader_tagger/data.tf @@ -1,13 +1,3 @@ -data "terraform_remote_state" "database" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/database/terraform.tfstate" - region = var.aws_region - } -} - data "aws_iam_policy_document" "reader_tag_sync" { statement { actions = [ diff --git a/infra/modules/aws/rds_reader_tagger/main.tf b/infra/modules/aws/rds_reader_tagger/main.tf index fc365f2f..ea68c86a 100644 --- a/infra/modules/aws/rds_reader_tagger/main.tf +++ 
b/infra/modules/aws/rds_reader_tagger/main.tf @@ -15,7 +15,7 @@ module "rds_reader_tagger" { lambda_name = local.lambda_name environment_variables = { - EXPECTED_CLUSTER_IDENTIFIER = data.terraform_remote_state.database.outputs.cluster_identifier + EXPECTED_CLUSTER_IDENTIFIER = var.database_cluster_identifier } additional_policy_arns = [ diff --git a/infra/modules/aws/rds_reader_tagger/variables.tf b/infra/modules/aws/rds_reader_tagger/variables.tf index f9fcabc7..fdcc7224 100644 --- a/infra/modules/aws/rds_reader_tagger/variables.tf +++ b/infra/modules/aws/rds_reader_tagger/variables.tf @@ -22,3 +22,7 @@ variable "otel_sample_rate" { type = number default = 1.0 } + +variable "database_cluster_identifier" { + type = string +} diff --git a/infra/modules/aws/service_api/README.md b/infra/modules/aws/service_api/README.md index de5584d3..73608c66 100644 --- a/infra/modules/aws/service_api/README.md +++ b/infra/modules/aws/service_api/README.md @@ -39,9 +39,11 @@ Concrete ECS API service wrapper for the sample API service. 
## Dependency Notes -- reads `task_api` remote state for the task definition -- reads `cluster`, `network`, and `security` remote state +- expects the live Terragrunt stack to pass the `task_api` task definition through a `dependency` block +- expects the live Terragrunt stack to pass the shared `cluster` and `network` outputs as explicit inputs +- expects the live Terragrunt stack to pass the ECS runtime security group id as an explicit input - depends on the `network` stack owning the shared VPC link, ALB listener path, and JWT authorizer inputs +- for bootstrap-friendly plan and validate flows, prefer Terragrunt dependency mocks in the live stack rather than sibling state reads inside the module ## Inherits Behavior From diff --git a/infra/modules/aws/service_api/data.tf b/infra/modules/aws/service_api/data.tf index 4e912710..93d02b4c 100644 --- a/infra/modules/aws/service_api/data.tf +++ b/infra/modules/aws/service_api/data.tf @@ -1,44 +1,3 @@ -data "terraform_remote_state" "task_api" { - count = var.bootstrap ? 
0 : 1 - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/task_api/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "network" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/network/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "security" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/security/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "cluster" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/cluster/terraform.tfstate" - region = var.aws_region - } -} - data "aws_vpc" "this" { filter { name = "tag:Name" diff --git a/infra/modules/aws/service_api/main.tf b/infra/modules/aws/service_api/main.tf index 5442302c..a7ad4e89 100644 --- a/infra/modules/aws/service_api/main.tf +++ b/infra/modules/aws/service_api/main.tf @@ -2,7 +2,7 @@ module "service_api" { source = "../_shared/service" service_name = var.service_name - task_definition_arn = var.bootstrap ? "" : data.terraform_remote_state.task_api[0].outputs.task_definition_arn + task_definition_arn = var.bootstrap ? 
"" : var.task_definition_arn container_port = var.container_port root_path = var.root_path connection_type = var.connection_type @@ -11,22 +11,22 @@ module "service_api" { vpc_id = data.aws_vpc.this.id private_subnet_ids = data.aws_subnets.private.ids - cluster_id = data.terraform_remote_state.cluster.outputs.cluster_id - cluster_name = data.terraform_remote_state.cluster.outputs.cluster_name - ecs_security_group_id = data.terraform_remote_state.security.outputs.ecs_sg + cluster_id = var.cluster_id + cluster_name = var.cluster_name + ecs_security_group_id = var.ecs_security_group_id - default_target_group_arn = data.terraform_remote_state.network.outputs.default_target_group_arn - load_balancer_arn = data.terraform_remote_state.network.outputs.load_balancer_arn - default_http_listener_arn = data.terraform_remote_state.network.outputs.default_http_listener_arn - load_balancer_arn_suffix = data.terraform_remote_state.network.outputs.load_balancer_arn_suffix - target_group_arn_suffix = data.terraform_remote_state.network.outputs.target_group_arn_suffix + default_target_group_arn = var.network_default_target_group_arn + load_balancer_arn = var.network_load_balancer_arn + default_http_listener_arn = var.network_default_http_listener_arn + load_balancer_arn_suffix = var.network_load_balancer_arn_suffix + target_group_arn_suffix = var.network_target_group_arn_suffix - api_id = data.terraform_remote_state.network.outputs.api_id - vpc_link_id = data.terraform_remote_state.network.outputs.vpc_link_id - internal_invoke_url = data.terraform_remote_state.network.outputs.internal_invoke_url - api_invoke_url = data.terraform_remote_state.network.outputs.api_invoke_url + api_id = var.network_api_id + vpc_link_id = var.network_vpc_link_id + internal_invoke_url = var.network_internal_invoke_url + api_invoke_url = var.network_api_invoke_url authorization_type = "JWT" - authorizer_id = data.terraform_remote_state.network.outputs.http_api_authorizer_id + authorizer_id = 
var.network_http_api_authorizer_id bootstrap = var.bootstrap bootstrap_image_uri = var.bootstrap_image_uri diff --git a/infra/modules/aws/service_api/outputs.tf b/infra/modules/aws/service_api/outputs.tf index d2be3728..6981573c 100644 --- a/infra/modules/aws/service_api/outputs.tf +++ b/infra/modules/aws/service_api/outputs.tf @@ -3,7 +3,7 @@ output "service_name" { } output "cluster_name" { - value = data.terraform_remote_state.cluster.outputs.cluster_name + value = var.cluster_name } output "codedeploy_app_name" { diff --git a/infra/modules/aws/service_api/variables.tf b/infra/modules/aws/service_api/variables.tf index 7facdd20..38f61457 100644 --- a/infra/modules/aws/service_api/variables.tf +++ b/infra/modules/aws/service_api/variables.tf @@ -75,3 +75,60 @@ variable "bootstrap_image_uri" { error_message = "bootstrap_image_uri must be set when bootstrap is true." } } + +variable "ecs_security_group_id" { + type = string +} + +variable "task_definition_arn" { + type = string + default = "arn:aws:ecs:eu-west-2:111111111111:task-definition/mock-task-api:1" +} + +variable "cluster_id" { + type = string +} + +variable "cluster_name" { + type = string +} + +variable "network_default_target_group_arn" { + type = string +} + +variable "network_load_balancer_arn" { + type = string +} + +variable "network_default_http_listener_arn" { + type = string +} + +variable "network_load_balancer_arn_suffix" { + type = string +} + +variable "network_target_group_arn_suffix" { + type = string +} + +variable "network_api_id" { + type = string +} + +variable "network_vpc_link_id" { + type = string +} + +variable "network_internal_invoke_url" { + type = string +} + +variable "network_api_invoke_url" { + type = string +} + +variable "network_http_api_authorizer_id" { + type = string +} diff --git a/infra/modules/aws/service_worker/README.md b/infra/modules/aws/service_worker/README.md index 820a4d3b..13447734 100644 --- a/infra/modules/aws/service_worker/README.md +++ 
b/infra/modules/aws/service_worker/README.md @@ -15,7 +15,7 @@ Concrete ECS worker service wrapper. ## Inputs That Change Behavior - uses the worker task revision exported by `task_worker` -- uses autoscaling inputs derived from the shared ECS worker queue owned by `worker_messaging` +- uses autoscaling inputs derived from the shared ECS worker queue owned by `messaging` - uses placeholder values during bootstrap applies so the first service apply does not require pre-existing task state ## Outputs Consumers Rely On @@ -35,12 +35,14 @@ Concrete ECS worker service wrapper. ## Dependency Notes -- reads `task_worker` remote state -- reads `worker_messaging` remote state -- reads `cluster`, `network`, and `security` remote state -- relies on `worker_messaging` owning the queue contract rather than duplicating queue state locally +- expects the live Terragrunt stack to pass the `task_worker` task definition through a `dependency` block +- expects the live Terragrunt stack to pass the shared ECS worker queue name through a `dependency` block to drive autoscaling +- expects the live Terragrunt stack to pass the shared `cluster` and `network` outputs as explicit inputs +- expects the live Terragrunt stack to pass the ECS runtime security group id as an explicit input +- relies on `messaging` owning the queue contract rather than duplicating queue state locally +- for bootstrap-friendly plan and validate flows, prefer Terragrunt dependency mocks in the live stack rather than sibling state reads inside the module -It uses the shared ECS worker queue name exported by `worker_messaging` for service autoscaling. +It uses the shared ECS worker queue name exported by `messaging` for service autoscaling. During bootstrap applies, it uses placeholder values instead of reading task outputs directly so the bootstrap path does not need a pre-existing task state file. 
## Inherits Behavior From diff --git a/infra/modules/aws/service_worker/data.tf b/infra/modules/aws/service_worker/data.tf index ae6015ee..93d02b4c 100644 --- a/infra/modules/aws/service_worker/data.tf +++ b/infra/modules/aws/service_worker/data.tf @@ -1,55 +1,3 @@ -data "terraform_remote_state" "task_worker" { - count = var.bootstrap ? 0 : 1 - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/task_worker/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "worker_messaging" { - count = var.bootstrap ? 0 : 1 - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/worker_messaging/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "network" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/network/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "security" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/security/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "cluster" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/cluster/terraform.tfstate" - region = var.aws_region - } -} - data "aws_vpc" "this" { filter { name = "tag:Name" diff --git a/infra/modules/aws/service_worker/locals.tf b/infra/modules/aws/service_worker/locals.tf index 7d4d849e..a1c86e06 100644 --- a/infra/modules/aws/service_worker/locals.tf +++ b/infra/modules/aws/service_worker/locals.tf @@ -1,7 +1,4 @@ locals { - task_worker_outputs = var.bootstrap ? null : one(data.terraform_remote_state.task_worker[*].outputs) - worker_messaging_outputs = var.bootstrap ? null : one(data.terraform_remote_state.worker_messaging[*].outputs) - - task_definition_arn = var.bootstrap ? 
"" : local.task_worker_outputs.task_definition_arn - autoscaling_queue_name = var.bootstrap ? "not_set" : local.worker_messaging_outputs.ecs_worker_queue_name + task_definition_arn = var.bootstrap ? "" : var.task_definition_arn + autoscaling_queue_name = var.bootstrap ? "not_set" : var.ecs_worker_queue_name } diff --git a/infra/modules/aws/service_worker/main.tf b/infra/modules/aws/service_worker/main.tf index 626599dc..6aef2011 100644 --- a/infra/modules/aws/service_worker/main.tf +++ b/infra/modules/aws/service_worker/main.tf @@ -11,19 +11,19 @@ module "service_worker" { vpc_id = data.aws_vpc.this.id private_subnet_ids = data.aws_subnets.private.ids - cluster_id = data.terraform_remote_state.cluster.outputs.cluster_id - cluster_name = data.terraform_remote_state.cluster.outputs.cluster_name - ecs_security_group_id = data.terraform_remote_state.security.outputs.ecs_sg + cluster_id = var.cluster_id + cluster_name = var.cluster_name + ecs_security_group_id = var.ecs_security_group_id - default_target_group_arn = data.terraform_remote_state.network.outputs.default_target_group_arn - default_http_listener_arn = data.terraform_remote_state.network.outputs.default_http_listener_arn - load_balancer_arn_suffix = data.terraform_remote_state.network.outputs.load_balancer_arn_suffix - target_group_arn_suffix = data.terraform_remote_state.network.outputs.target_group_arn_suffix + default_target_group_arn = var.network_default_target_group_arn + default_http_listener_arn = var.network_default_http_listener_arn + load_balancer_arn_suffix = var.network_load_balancer_arn_suffix + target_group_arn_suffix = var.network_target_group_arn_suffix - api_id = data.terraform_remote_state.network.outputs.api_id - vpc_link_id = data.terraform_remote_state.network.outputs.vpc_link_id - internal_invoke_url = data.terraform_remote_state.network.outputs.internal_invoke_url - api_invoke_url = data.terraform_remote_state.network.outputs.api_invoke_url + api_id = var.network_api_id + vpc_link_id = 
var.network_vpc_link_id + internal_invoke_url = var.network_internal_invoke_url + api_invoke_url = var.network_api_invoke_url bootstrap = var.bootstrap bootstrap_image_uri = var.bootstrap_image_uri diff --git a/infra/modules/aws/service_worker/outputs.tf b/infra/modules/aws/service_worker/outputs.tf index 38a97a9d..67794a66 100644 --- a/infra/modules/aws/service_worker/outputs.tf +++ b/infra/modules/aws/service_worker/outputs.tf @@ -3,7 +3,7 @@ output "service_name" { } output "cluster_name" { - value = data.terraform_remote_state.cluster.outputs.cluster_name + value = var.cluster_name } output "codedeploy_app_name" { diff --git a/infra/modules/aws/service_worker/variables.tf b/infra/modules/aws/service_worker/variables.tf index 4b065255..a2c58837 100644 --- a/infra/modules/aws/service_worker/variables.tf +++ b/infra/modules/aws/service_worker/variables.tf @@ -75,3 +75,56 @@ variable "bootstrap_image_uri" { error_message = "bootstrap_image_uri must be set when bootstrap is true." } } + +variable "ecs_security_group_id" { + type = string +} + +variable "task_definition_arn" { + type = string + default = "arn:aws:ecs:eu-west-2:111111111111:task-definition/mock-task-worker:1" +} + +variable "ecs_worker_queue_name" { + type = string +} + +variable "cluster_id" { + type = string +} + +variable "cluster_name" { + type = string +} + +variable "network_default_target_group_arn" { + type = string +} + +variable "network_default_http_listener_arn" { + type = string +} + +variable "network_load_balancer_arn_suffix" { + type = string +} + +variable "network_target_group_arn_suffix" { + type = string +} + +variable "network_api_id" { + type = string +} + +variable "network_vpc_link_id" { + type = string +} + +variable "network_internal_invoke_url" { + type = string +} + +variable "network_api_invoke_url" { + type = string +} diff --git a/infra/modules/aws/task_worker/README.md b/infra/modules/aws/task_worker/README.md index e132b56b..25b6b622 100644 --- 
a/infra/modules/aws/task_worker/README.md +++ b/infra/modules/aws/task_worker/README.md @@ -40,8 +40,8 @@ Concrete ECS worker task wrapper. ## Dependency Notes -- reads queue details from `worker_messaging` remote state -- reads database connection details from the shared `database` stack +- expects the live Terragrunt stack to pass queue details from `messaging` through a `dependency` block +- expects the live Terragrunt stack to pass shared `database` connection details as explicit inputs - publishes the task definition consumed by `service_worker` -This module is the image-driven deployment unit for the ECS worker. It reads the ECS worker queue from the `worker_messaging` stack so the task definition and service can consume the same fanout event stream as the Lambda worker, and it reads the shared `database` stack so the worker can persist consumed messages to Aurora PostgreSQL. +This module is the image-driven deployment unit for the ECS worker. It consumes the ECS worker queue contract owned by `messaging` and the shared `database` contract passed in from the live Terragrunt stack so the task definition and service can use the same fanout event stream and Aurora PostgreSQL connection details without the module reading sibling stack state directly. 
diff --git a/infra/modules/aws/task_worker/data.tf b/infra/modules/aws/task_worker/data.tf index a67310da..4499a1c9 100644 --- a/infra/modules/aws/task_worker/data.tf +++ b/infra/modules/aws/task_worker/data.tf @@ -1,23 +1,3 @@ -data "terraform_remote_state" "worker_messaging" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/worker_messaging/terraform.tfstate" - region = var.aws_region - } -} - -data "terraform_remote_state" "database" { - backend = "s3" - - config = { - bucket = var.state_bucket - key = "${var.environment}/aws/database/terraform.tfstate" - region = var.aws_region - } -} - data "aws_iam_policy_document" "database_secret_read" { statement { actions = [ @@ -25,7 +5,7 @@ data "aws_iam_policy_document" "database_secret_read" { ] resources = [ - data.terraform_remote_state.database.outputs.credentials_secret_arn, + var.database_credentials_secret_arn, ] } } diff --git a/infra/modules/aws/task_worker/main.tf b/infra/modules/aws/task_worker/main.tf index 6b964062..04f3073b 100644 --- a/infra/modules/aws/task_worker/main.tf +++ b/infra/modules/aws/task_worker/main.tf @@ -24,23 +24,23 @@ module "task_worker" { additional_env_vars = [ { name = "AWS_SQS_QUEUE_URL" - value = data.terraform_remote_state.worker_messaging.outputs.ecs_worker_queue_url + value = var.ecs_worker_queue_url }, { name = "DB_HOST" - value = data.terraform_remote_state.database.outputs.readwrite_endpoint + value = var.database_readwrite_endpoint }, { name = "DB_NAME" - value = data.terraform_remote_state.database.outputs.database_name + value = var.database_name }, { name = "DB_PORT" - value = tostring(data.terraform_remote_state.database.outputs.database_port) + value = tostring(var.database_port) }, { name = "DB_SECRET_ARN" - value = data.terraform_remote_state.database.outputs.credentials_secret_arn + value = var.database_credentials_secret_arn }, { name = "HEARTBEAT_FILE" @@ -48,7 +48,7 @@ module "task_worker" { } ] 
additional_runtime_policy_arns = [ - data.terraform_remote_state.worker_messaging.outputs.ecs_worker_queue_read_policy_arn, + var.ecs_worker_queue_read_policy_arn, aws_iam_policy.database_secret_read.arn, ] diff --git a/infra/modules/aws/task_worker/outputs.tf b/infra/modules/aws/task_worker/outputs.tf index 3f353262..0e6dc60a 100644 --- a/infra/modules/aws/task_worker/outputs.tf +++ b/infra/modules/aws/task_worker/outputs.tf @@ -15,13 +15,13 @@ output "service_name" { } output "sqs_queue_name" { - value = data.terraform_remote_state.worker_messaging.outputs.ecs_worker_queue_name + value = var.ecs_worker_queue_name } output "sqs_queue_url" { - value = data.terraform_remote_state.worker_messaging.outputs.ecs_worker_queue_url + value = var.ecs_worker_queue_url } output "sqs_queue_read_policy_arn" { - value = data.terraform_remote_state.worker_messaging.outputs.ecs_worker_queue_read_policy_arn + value = var.ecs_worker_queue_read_policy_arn } diff --git a/infra/modules/aws/task_worker/variables.tf b/infra/modules/aws/task_worker/variables.tf index c4bd09a5..a629b8b1 100644 --- a/infra/modules/aws/task_worker/variables.tf +++ b/infra/modules/aws/task_worker/variables.tf @@ -62,3 +62,31 @@ variable "xray_enabled" { type = bool default = false } + +variable "ecs_worker_queue_name" { + type = string +} + +variable "ecs_worker_queue_url" { + type = string +} + +variable "ecs_worker_queue_read_policy_arn" { + type = string +} + +variable "database_readwrite_endpoint" { + type = string +} + +variable "database_name" { + type = string +} + +variable "database_port" { + type = number +} + +variable "database_credentials_secret_arn" { + type = string +} diff --git a/infra/root.hcl b/infra/root.hcl index 6e713ae9..40d8460d 100644 --- a/infra/root.hcl +++ b/infra/root.hcl @@ -12,16 +12,23 @@ locals { global_vars = read_terragrunt_config(find_in_parent_folders("global_vars.hcl")) environment_vars = read_terragrunt_config(find_in_parent_folders("environment_vars.hcl")) + 
infra_root_dir = abspath(dirname(find_in_parent_folders("root.hcl"))) project_name = element(split("/", local.github_repo), 1) - aws_region = local.global_vars.inputs.aws_region - base_reference = "${local.aws_account_id}-${local.aws_region}-${local.project_name}" - deploy_role_name = "${local.project_name}-${local.environment}-github-oidc-role" - deploy_role_arn = "arn:aws:iam::${local.aws_account_id}:role/${local.deploy_role_name}" - state_bucket = "${local.base_reference}-tfstate" - state_key = "${local.environment}/${local.provider}/${local.module}/terraform.tfstate" - state_lock_table = "${local.project_name}-tf-lockid" + aws_region = local.global_vars.inputs.aws_region + base_reference = "${local.aws_account_id}-${local.aws_region}-${local.project_name}" + deploy_role_name = "${local.project_name}-${local.environment}-github-oidc-role" + deploy_role_arn = "arn:aws:iam::${local.aws_account_id}:role/${local.deploy_role_name}" + state_bucket = "${local.base_reference}-tfstate" + plan_bucket = "${local.base_reference}-tfplan" + state_key = "${local.environment}/${local.provider}/${local.module}/terraform.tfstate" + plan_artifact_stack_key = "${local.environment}/${local.provider}/${local.module}" + state_lock_table = "${local.project_name}-tf-lockid" + plan_artifact_retention_days = try( + local.environment_vars.inputs.infra_plan_artifact_expiration_days, + 1, + ) # separate shared artifact resources when dev, otherwise ci artifact_base = local.environment == "dev" ? 
"${local.base_reference}-${local.environment}" : "${local.base_reference}-ci" code_bucket = "${local.artifact_base}-code" @@ -35,6 +42,41 @@ terraform { "bash", "-c", "echo STATE:${local.state_bucket}/${local.state_key} TABLE:${local.state_lock_table}" ] } + + before_hook "ensure_plan_artifact_bucket" { + commands = ["init", "plan"] + execute = [ + "bash", + "${local.infra_root_dir}/scripts/ensure-plan-artifact-bucket.sh", + local.plan_bucket, + local.aws_region, + tostring(local.plan_artifact_retention_days), + ] + } + + before_hook "download_saved_plan" { + commands = ["apply"] + execute = [ + "bash", + "${local.infra_root_dir}/scripts/handle-plan-artifact.sh", + "download", + local.plan_artifact_stack_key, + local.plan_bucket, + local.environment, + ] + } + + after_hook "upload_saved_plan" { + commands = ["plan"] + execute = [ + "bash", + "${local.infra_root_dir}/scripts/handle-plan-artifact.sh", + "upload", + local.plan_artifact_stack_key, + local.plan_bucket, + local.environment, + ] + } } remote_state { @@ -101,6 +143,7 @@ inputs = merge( deploy_role_name = local.deploy_role_name deploy_role_arn = local.deploy_role_arn state_bucket = local.state_bucket + plan_bucket = local.plan_bucket state_lock_table = local.state_lock_table code_bucket = local.code_bucket ecr_repository_name = local.ecr_repository_name diff --git a/infra/scripts/ensure-plan-artifact-bucket.sh b/infra/scripts/ensure-plan-artifact-bucket.sh new file mode 100644 index 00000000..b1596b3e --- /dev/null +++ b/infra/scripts/ensure-plan-artifact-bucket.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -euo pipefail + +bucket_name="${1:?bucket name is required}" +aws_region="${2:?aws region is required}" +retention_days="${3:-0}" +plan_prefix="${INFRA_PLAN_DIR:-terragrunt_plan/}" +reset_flag="${TG_RESET_PLAN_ARTIFACT_BUCKET:-false}" + +if [[ "$plan_prefix" != */ ]]; then + plan_prefix="${plan_prefix}/" +fi + +ensure_lifecycle() { + if [[ "$retention_days" =~ ^[0-9]+$ ]] && [ "$retention_days" -gt 0 ]; 
then + aws s3api put-bucket-lifecycle-configuration \ + --bucket "$bucket_name" \ + --lifecycle-configuration "{ + \"Rules\": [ + { + \"ID\": \"expire-plan-artifacts\", + \"Status\": \"Enabled\", + \"Filter\": {\"Prefix\": \"$plan_prefix\"}, + \"Expiration\": {\"Days\": $retention_days} + } + ] + }" >/dev/null + echo "Ensured plan artifact retention of ${retention_days} days on s3://${bucket_name}/${plan_prefix}" + fi +} + +if aws s3api head-bucket --bucket "$bucket_name" >/dev/null 2>&1; then + if [ "$reset_flag" = "true" ]; then + ensure_lifecycle + fi + exit 0 +fi + +if [ -r /dev/tty ] && [ -w /dev/tty ]; then + printf "Plan bucket '%s' does not exist. Create it in %s? [y/N] " "$bucket_name" "$aws_region" > /dev/tty + read -r response < /dev/tty + case "$response" in + [yY]|[yY][eE][sS]) ;; + *) + echo "Plan bucket creation declined." >&2 + exit 1 + ;; + esac +else + echo "Plan bucket '$bucket_name' does not exist and no interactive terminal is available for confirmation." >&2 + echo "Create it manually or rerun from a terminal where Terragrunt hooks can prompt." 
>&2 + exit 1 +fi + +if [ "$aws_region" = "us-east-1" ]; then + aws s3api create-bucket --bucket "$bucket_name" >/dev/null +else + aws s3api create-bucket --bucket "$bucket_name" --create-bucket-configuration "LocationConstraint=$aws_region" >/dev/null +fi + +ensure_lifecycle diff --git a/infra/scripts/handle-plan-artifact.sh b/infra/scripts/handle-plan-artifact.sh new file mode 100644 index 00000000..4cee9200 --- /dev/null +++ b/infra/scripts/handle-plan-artifact.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +set -euo pipefail + +mode="${1:?mode is required}" +logical_tg_dir="${2:?terragrunt directory is required}" +plan_bucket="${3:?plan bucket is required}" +environment="${4:?environment is required}" +infra_plan_dir="${INFRA_PLAN_DIR:-terragrunt_plan}" + +plan_path="${PWD}/terragrunt.tfplan" +plan_text_path="${PWD}/terragrunt.plan.txt" +plan_meta_path="${PWD}/terragrunt.plan.meta.json" +plan_json_path="${PWD}/terragrunt.plan.json" +plan_log_path="${PWD}/${TG_PLAN_LOG_FILENAME:-terragrunt.plan.log}" +fallback_plan_log_path="${TG_PLAN_LOG_ABS_PATH:-}" + +if [[ "${TG_ENABLE_PLAN_ARTIFACTS:-false}" != "true" ]]; then + echo "TG_ENABLE_PLAN_ARTIFACTS=false, skipping plan artifact ${mode}." >&2 + exit 0 +fi + +if [[ -z "${PLAN_ARTIFACT_RUN_ID:-}" ]]; then + echo "PLAN_ARTIFACT_RUN_ID is required when TG_ENABLE_PLAN_ARTIFACTS=true." >&2 + exit 1 +fi + +sanitized_dir="$(echo "$logical_tg_dir" | tr '/.' '--')" +artifact_s3_prefix="s3://${plan_bucket}/${infra_plan_dir}/${environment}/${PLAN_ARTIFACT_RUN_ID}/terragrunt-plan-${sanitized_dir}" + +case "$mode" in + download) + echo "Downloading plan artifacts from ${artifact_s3_prefix}" >&2 + if ! aws s3 ls "${artifact_s3_prefix}/terragrunt.tfplan" >/dev/null 2>&1; then + echo "Saved plan artifact not found for ${logical_tg_dir} and PLAN_ARTIFACT_RUN_ID=${PLAN_ARTIFACT_RUN_ID}." 
>&2 + echo "Expected plan bundle at ${artifact_s3_prefix}" >&2 + exit 1 + fi + + aws s3 cp "${artifact_s3_prefix}/terragrunt.tfplan" "$plan_path" + aws s3 cp "${artifact_s3_prefix}/terragrunt.plan.txt" "$plan_text_path" + aws s3 cp "${artifact_s3_prefix}/terragrunt.plan.meta.json" "$plan_meta_path" + echo "Downloaded plan artifacts for ${logical_tg_dir}" >&2 + + if [[ "$(jq -r '.contains_mocked_outputs // false' "$plan_meta_path")" == "true" ]]; then + echo "Saved plan for '${logical_tg_dir}' contains mocked outputs. Regenerate it after upstream real outputs exist." >&2 + exit 1 + fi + ;; + upload) + if [[ ! -f "$plan_path" ]]; then + exit 0 + fi + + terraform show -no-color "$plan_path" > "$plan_text_path" + terraform show -json "$plan_path" > "$plan_json_path" + + contains_mocked_outputs=false + if [[ -f "$plan_log_path" ]] && grep -Fq "mock outputs provided and returning those in dependency output" "$plan_log_path"; then + contains_mocked_outputs=true + elif [[ -n "$fallback_plan_log_path" ]] && [[ -f "$fallback_plan_log_path" ]] && grep -Fq "mock outputs provided and returning those in dependency output" "$fallback_plan_log_path"; then + contains_mocked_outputs=true + fi + + jq -n \ + --arg tg_directory "$logical_tg_dir" \ + --argjson has_changes "$(jq -r '([(.resource_changes // [])[]?.change.actions[]?] | any(. 
!= "no-op")) or ((.output_changes // {}) | length > 0)' "$plan_json_path")" \ + --argjson contains_mocked_outputs "$contains_mocked_outputs" \ + '{tg_directory: $tg_directory, has_changes: $has_changes, contains_mocked_outputs: $contains_mocked_outputs}' \ + > "$plan_meta_path" + + echo "Uploading plan artifacts for ${logical_tg_dir} to ${artifact_s3_prefix}" >&2 + aws s3 cp "$plan_path" "${artifact_s3_prefix}/terragrunt.tfplan" + aws s3 cp "$plan_text_path" "${artifact_s3_prefix}/terragrunt.plan.txt" + aws s3 cp "$plan_meta_path" "${artifact_s3_prefix}/terragrunt.plan.meta.json" + echo "Uploaded plan artifacts for ${logical_tg_dir}" >&2 + rm -f "$plan_json_path" + ;; + *) + echo "Unknown mode '$mode'." >&2 + exit 2 + ;; +esac diff --git a/justfile b/justfile index 6a50aa06..f29abaa9 100644 --- a/justfile +++ b/justfile @@ -3,8 +3,6 @@ _default: @just --list @printf '\nCI recipes (`just --justfile justfile.ci --list`):\n' @just --justfile justfile.ci --list - @printf '\nTerragrunt recipes (`just --justfile justfile.tg --list`):\n' - @just --justfile justfile.tg --list @printf '\nDeploy recipes (`just --justfile justfile.deploy --list`):\n' @just --justfile justfile.deploy --list @printf '\nDestroy recipes (`just --justfile justfile.destroy --list`):\n' diff --git a/justfile.ci b/justfile.ci index 1e402741..c0ec334c 100644 --- a/justfile.ci +++ b/justfile.ci @@ -108,36 +108,6 @@ get-version-file-keys: | jq -s -c . -# Upload shared infra plan metadata to the shared code bucket. -infra-plan-metadata-upload: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -z "${PLAN_ARTIFACT_S3_PREFIX:-}" ]]; then - echo "❌ PLAN_ARTIFACT_S3_PREFIX environment variable is not set." - exit 1 - fi - - artifact_s3_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-base-s3-prefix)" - - aws s3 cp "plan-metadata.json" "${artifact_s3_prefix}/infra-plan-metadata/plan-metadata.json" - - -# Download shared infra plan metadata from the shared code bucket. 
-infra-plan-metadata-download: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -z "${PLAN_ARTIFACT_S3_PREFIX:-}" ]]; then - echo "❌ PLAN_ARTIFACT_S3_PREFIX environment variable is not set." - exit 1 - fi - - artifact_s3_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-base-s3-prefix)" - - aws s3 cp "${artifact_s3_prefix}/infra-plan-metadata/plan-metadata.json" "plan-metadata.json" - - # Return the Lambda artifact directory name from the repo-root justfile. code-bucket-get-lambda-artifact-dir: @just --justfile "{{PROJECT_DIR}}/justfile" code-bucket-get-lambda-artifact-dir diff --git a/justfile.deploy b/justfile.deploy index f58f03e9..cd9cd0d3 100644 --- a/justfile.deploy +++ b/justfile.deploy @@ -122,8 +122,8 @@ lambda-build: exit 1 fi - python3 -m venv venv - source venv/bin/activate + python3 -m venv .venv + source .venv/bin/activate LAMBDA_BUILD_DIR="{{PROJECT_DIR}}/{{LAMBDA_DIR}}/build" diff --git a/justfile.tg b/justfile.tg deleted file mode 100644 index 9491885b..00000000 --- a/justfile.tg +++ /dev/null @@ -1,167 +0,0 @@ -# Terragrunt plan artifact helpers. -# This file is for producing, downloading, and uploading saved Terragrunt plan files. - -PROJECT_DIR := `just --justfile justfile --evaluate PROJECT_DIR` -INFRA_PLAN_DIR := `just --justfile justfile --evaluate INFRA_PLAN_DIR` - -PLAN_FILE := "terragrunt.tfplan" -PLAN_TEXT_FILE := "terragrunt.plan.txt" -PLAN_META_FILE := "terragrunt.plan.meta.json" - - -# Render Terragrunt plan sidecars (plan text + metadata) for an existing binary plan. -# -# Expected environment variables: -# - TG_DIRECTORY: directory containing the saved plan file -# - TG_PLAN_EXIT_CODE: detailed-exitcode from `terragrunt plan` (0 or 2) -terragrunt-plan-render: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -z "${TG_DIRECTORY:-}" ]]; then - echo "❌ TG_DIRECTORY environment variable is not set." 
- exit 1 - fi - - if [[ -z "${TG_PLAN_EXIT_CODE:-}" ]]; then - echo "❌ TG_PLAN_EXIT_CODE environment variable is not set." - exit 1 - fi - - cd "$TG_DIRECTORY" - - PLAN_PATH="$(pwd)/{{PLAN_FILE}}" - PLAN_TEXT_PATH="$(pwd)/{{PLAN_TEXT_FILE}}" - PLAN_META_PATH="$(pwd)/{{PLAN_META_FILE}}" - - if [[ ! -f "$PLAN_PATH" ]]; then - echo "❌ Expected plan file '$PLAN_PATH' was not found." - exit 1 - fi - - terragrunt show -no-color "$PLAN_PATH" > "$PLAN_TEXT_PATH" - - jq -n \ - --arg tg_directory "$TG_DIRECTORY" \ - --argjson exit_code "$TG_PLAN_EXIT_CODE" \ - --argjson has_changes "$([ "$TG_PLAN_EXIT_CODE" -eq 2 ] && echo true || echo false)" \ - '{tg_directory: $tg_directory, exit_code: $exit_code, has_changes: $has_changes}' \ - > "$PLAN_META_PATH" - - echo "Terragrunt binary plan path: $PLAN_PATH" - ls -l "$PLAN_PATH" - echo "Terragrunt rendered plan path: $PLAN_TEXT_PATH" - cat "$PLAN_TEXT_PATH" - echo "Terragrunt plan metadata path: $PLAN_META_PATH" - cat "$PLAN_META_PATH" - - -# Derive the shared S3 base prefix for Terragrunt plan artifacts. -terragrunt-plan-base-prefix: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -n "${PLAN_ARTIFACT_S3_PREFIX:-}" ]]; then - echo "${PLAN_ARTIFACT_S3_PREFIX#s3://*/}" - exit 0 - fi - - if [[ -z "${ENVIRONMENT:-}" ]]; then - echo "❌ ENVIRONMENT environment variable is not set." - exit 1 - fi - - if [[ -z "${RUN_ID:-}" ]]; then - echo "❌ RUN_ID environment variable is not set." - exit 1 - fi - - infra_plan_dir="${TF_VAR_infra_plan_dir:-{{INFRA_PLAN_DIR}}}" - - echo "${infra_plan_dir}/${ENVIRONMENT}/${RUN_ID}" - - -# Derive the shared S3 URI prefix for Terragrunt plan artifacts. -terragrunt-plan-base-s3-prefix: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -n "${PLAN_ARTIFACT_S3_PREFIX:-}" ]]; then - echo "${PLAN_ARTIFACT_S3_PREFIX}" - exit 0 - fi - - if [[ -z "${BUCKET_NAME:-}" ]]; then - echo "❌ BUCKET_NAME environment variable is not set." 
- exit 1 - fi - - artifact_base_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-base-prefix)" - - echo "s3://${BUCKET_NAME}/${artifact_base_prefix}" - - -# Derive the shared S3 prefix for Terragrunt plan artifacts. -terragrunt-plan-prefix: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -z "${TG_DIRECTORY:-}" ]]; then - echo "❌ TG_DIRECTORY environment variable is not set." - exit 1 - fi - - artifact_base_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-base-prefix)" - sanitized_dir="$(echo "$TG_DIRECTORY" | tr '/.' '--')" - artifact_name="terragrunt-plan-${sanitized_dir}" - artifact_prefix="${artifact_base_prefix}/${artifact_name}" - - echo "$artifact_prefix" - - -# Download saved Terragrunt plan files for a stack from the shared code bucket. -terragrunt-plan-download: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -z "${PLAN_ARTIFACT_S3_PREFIX:-}" && -z "${BUCKET_NAME:-}" ]]; then - echo "❌ PLAN_ARTIFACT_S3_PREFIX or BUCKET_NAME environment variable is required." - exit 1 - fi - - if [[ -z "${TG_DIRECTORY:-}" ]]; then - echo "❌ TG_DIRECTORY environment variable is not set." - exit 1 - fi - - artifact_s3_base_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-base-s3-prefix)" - artifact_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-prefix)" - - mkdir -p "$TG_DIRECTORY" - - aws s3 cp "${artifact_s3_base_prefix}/${artifact_prefix##*/}/{{PLAN_FILE}}" "${TG_DIRECTORY}/{{PLAN_FILE}}" - aws s3 cp "${artifact_s3_base_prefix}/${artifact_prefix##*/}/{{PLAN_TEXT_FILE}}" "${TG_DIRECTORY}/{{PLAN_TEXT_FILE}}" - aws s3 cp "${artifact_s3_base_prefix}/${artifact_prefix##*/}/{{PLAN_META_FILE}}" "${TG_DIRECTORY}/{{PLAN_META_FILE}}" - - -# Upload saved Terragrunt plan files for a stack to the shared code bucket. 
-terragrunt-plan-upload: - #!/usr/bin/env bash - set -euo pipefail - - if [[ -z "${PLAN_ARTIFACT_S3_PREFIX:-}" && -z "${BUCKET_NAME:-}" ]]; then - echo "❌ PLAN_ARTIFACT_S3_PREFIX or BUCKET_NAME environment variable is required." - exit 1 - fi - - if [[ -z "${TG_DIRECTORY:-}" ]]; then - echo "❌ TG_DIRECTORY environment variable is not set." - exit 1 - fi - - artifact_s3_base_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-base-s3-prefix)" - artifact_prefix="$(just --justfile "{{PROJECT_DIR}}/justfile.tg" terragrunt-plan-prefix)" - - aws s3 cp "${TG_DIRECTORY}/{{PLAN_FILE}}" "${artifact_s3_base_prefix}/${artifact_prefix##*/}/{{PLAN_FILE}}" - aws s3 cp "${TG_DIRECTORY}/{{PLAN_TEXT_FILE}}" "${artifact_s3_base_prefix}/${artifact_prefix##*/}/{{PLAN_TEXT_FILE}}" - aws s3 cp "${TG_DIRECTORY}/{{PLAN_META_FILE}}" "${artifact_s3_base_prefix}/${artifact_prefix##*/}/{{PLAN_META_FILE}}" diff --git a/lambdas/lambda_api/README.md b/lambdas/lambda_api/README.md index 0b7d16c4..2e1b42f6 100644 --- a/lambdas/lambda_api/README.md +++ b/lambdas/lambda_api/README.md @@ -14,7 +14,7 @@ Public Lambda-backed HTTP API. - `GET /` Basic success response - `GET /health` - Health response + Health response. This route is intentionally unauthenticated at the API Gateway layer. - `GET /fail` - `GET /error` Forced 500 response for alarm and rollback testing