diff --git a/compute/ecs_cluster/README.md b/compute/ecs_cluster/README.md index 02f0368..b82fd37 100644 --- a/compute/ecs_cluster/README.md +++ b/compute/ecs_cluster/README.md @@ -100,19 +100,19 @@ module "ecs" { private_subnet_ids = ["subnet-private-1", "subnet-private-2"] public_subnet_ids = ["subnet-public-1", "subnet-public-2"] - # Enable all capacity providers + # Attach all capacity providers. AWS does not allow mixing Fargate and EC2 + # providers in the cluster's default strategy, so the default strategy + # commits to a single family — EC2 here, since EC2 wins when enabled + # (override with capacity_provider_default). Services can still + # target FARGATE/FARGATE_SPOT via their own capacity_provider_strategies. fargate_enabled = true fargate_spot_enabled = true - fargate_weight = 1 - fargate_spot_weight = 2 # EC2 for baseline capacity ec2_instance_type = "t3.large" ec2_min_size = 2 ec2_max_size = 20 ec2_desired_capacity = 2 - ec2_weight = 1 - ec2_base = 2 # Always run 2 tasks on EC2 # Both ALBs public_alb_enabled = true @@ -210,6 +210,7 @@ module "api_service" { | Name | Description | Type | Default | Required | |------|-------------|------|---------|----------| | container_insights_enabled | Enable CloudWatch Container Insights | `bool` | `true` | no | +| capacity_provider_default | Family for the cluster default strategy: `ec2`, `fargate` (includes Fargate Spot when enabled), or `fargate_spot`. AWS forbids mixing Fargate and EC2 providers in one strategy. Defaults to `ec2` if EC2 is enabled, then `fargate`, then `fargate_spot` | `string` | `null` | no | ### Fargate Capacity Provider @@ -507,7 +508,7 @@ module "api_service" { ║ │ • ec2_capacity_provider_name = enable_ec2 ? 
"${var.name}-ec2" : null │ ║ ║ │ │ ║ ║ │ CAPACITY PROVIDER STRATEGY: │ ║ -║ │ • capacity_provider_strategy = concat(fargate_strategy, fargate_spot_strategy, ec2_strategy) │ ║ +║ │ • capacity_provider_strategy = single family via capacity_provider_default │ ║ ║ │ │ ║ ║ │ EC2 CONFIGURATION: │ ║ ║ │ • ecs_user_data = base64encode(ECS_CLUSTER config + custom user_data) │ ║ @@ -690,7 +691,7 @@ module "api_service" { ║ │ ▼ ▼ ║ ║ │ var.fargate_enabled ────►┌──────────────────────────────────────────────┐ ║ ║ │ var.fargate_spot_enabled►│ aws_ecs_cluster_capacity_providers.this │ ║ -║ │ local.enable_ec2 ──────►│ (FARGATE + FARGATE_SPOT + EC2 strategy) │ ║ +║ │ local.enable_ec2 ──────►│ (single-family default strategy) │ ║ ║ │ └──────────────────────────────────────────────┘ ║ ║ │ ║ ║ │ ┌────────────────────────────────────────────────────┐ ║ @@ -785,20 +786,33 @@ ECS supports three types of capacity providers, each with distinct trade-offs: - You need specific instance types or kernel configurations - You require persistent local storage -**Example: Cost-optimized mixed strategy** +**Example: Cost-optimized mixed cluster** + +AWS does not allow a single capacity provider strategy to mix Fargate and EC2 +(Auto Scaling group) providers, so the cluster's default strategy commits to +one family (`capacity_provider_default`). To mix families across +workloads, attach both to the cluster and pick the family per service: ```hcl -# Use EC2 for baseline, Fargate Spot for burst capacity +# EC2 is the cluster default; specific services opt into Fargate Spot module "ecs" { source = "..." 
fargate_enabled = false # Disable standard Fargate - fargate_spot_enabled = true # Use Fargate Spot for overflow - fargate_spot_weight = 1 + fargate_spot_enabled = true # Attached for services that want Spot - ec2_instance_type = "m5.large" - ec2_base = 5 # Always run 5 tasks on EC2 - ec2_weight = 1 + ec2_instance_type = "m5.large" # EC2 wins the default strategy when enabled +} + +module "batch_service" { + source = ".../compute/ecs_service" + + # ... service configuration ... + + # Override the cluster default for this service only + capacity_provider_strategies = [ + { capacity_provider = "FARGATE_SPOT", weight = 1 } + ] } ``` @@ -813,8 +827,8 @@ The **base** and **weight** parameters control how ECS distributes tasks across │ │ │ 1. First, satisfy BASE requirements (guaranteed tasks per provider) │ │ │ -│ Example: fargate_base=2, ec2_base=3 │ -│ → First 5 tasks: 2 on Fargate, 3 on EC2 │ +│ Example: fargate_base=2, fargate_spot_weight=1 │ +│ → First 2 tasks on Fargate, then split with Fargate Spot │ │ │ │ 2. Then, distribute remaining tasks by WEIGHT ratio │ │ │ @@ -830,7 +844,11 @@ The **base** and **weight** parameters control how ECS distributes tasks across |----------|---------------|--------| | Fargate only | `fargate_enabled=true` | All tasks on Fargate | | Cost savings | `fargate_weight=1, fargate_spot_weight=3` | 25% Fargate, 75% Fargate Spot | -| EC2 baseline | `ec2_base=5, ec2_weight=0, fargate_weight=1` | First 5 on EC2, rest on Fargate | +| EC2 default | `ec2_instance_type="m5.large"` | Default strategy is EC2; services may target Fargate via their own strategy | + +Note: base/weight only combine providers within the same family (Fargate + +Fargate Spot). A strategy cannot mix Fargate and EC2 providers — the cluster +default commits to one family via `capacity_provider_default`. ### How does EC2 managed scaling work? 
@@ -955,6 +973,7 @@ The module automatically creates a security group for EC2 instances that: ## Notes - The EC2 capacity provider is only created when `ec2_instance_type` is specified +- The cluster default capacity provider strategy commits to a single family (AWS forbids mixing Fargate and EC2 providers in one strategy); control it with `capacity_provider_default` - By default, uses the latest ECS-optimized Amazon Linux 2023 AMI - EC2 instances automatically register with the ECS cluster via user data - IMDSv2 is enforced by default for enhanced security diff --git a/compute/ecs_cluster/ec2.tf b/compute/ecs_cluster/ec2.tf index 5497200..b2f7c74 100644 --- a/compute/ecs_cluster/ec2.tf +++ b/compute/ecs_cluster/ec2.tf @@ -64,7 +64,7 @@ module "ecs_instance_security_group" { all_egress_enabled = true # For ip_protocol="-1" (all protocols), AWS requires from_port/to_port to - # be omitted; use -1 here for caller clarity. + # be -1; setting them to 0 causes update failures. ingress_rules = concat( # Allow inbound from public ALB if enabled var.public_alb_enabled ? [ diff --git a/compute/ecs_cluster/locals.tf b/compute/ecs_cluster/locals.tf index 0bee0d1..40c53ff 100644 --- a/compute/ecs_cluster/locals.tf +++ b/compute/ecs_cluster/locals.tf @@ -24,9 +24,22 @@ locals { # EC2 capacity provider name ec2_capacity_provider_name = local.enable_ec2 ? "${var.name}-ec2" : null - # Build capacity provider strategy based on enabled providers - capacity_provider_strategy = concat( - var.fargate_enabled ? [{ + # Family used for the cluster default strategy. AWS rejects default + # strategies that mix Fargate and EC2 (ASG) capacity providers, so the + # default strategy must commit to a single family. + capacity_provider_default = coalesce( + var.capacity_provider_default, + local.enable_ec2 ? "ec2" : var.fargate_enabled ? "fargate" : "fargate_spot" + ) + + # Build the default capacity provider strategy from the selected family. 
+ # FARGATE and FARGATE_SPOT may share a strategy; EC2 must stand alone. + capacity_provider_strategy = local.capacity_provider_default == "ec2" ? [{ + capacity_provider = aws_ecs_capacity_provider.ec2[0].name + weight = var.ec2_weight + base = var.ec2_base + }] : concat( + local.capacity_provider_default == "fargate" && var.fargate_enabled ? [{ capacity_provider = "FARGATE" weight = var.fargate_weight base = var.fargate_base @@ -35,11 +48,6 @@ locals { capacity_provider = "FARGATE_SPOT" weight = var.fargate_spot_weight base = var.fargate_spot_base - }] : [], - local.enable_ec2 ? [{ - capacity_provider = aws_ecs_capacity_provider.ec2[0].name - weight = var.ec2_weight - base = var.ec2_base }] : [] ) diff --git a/compute/ecs_cluster/rvn-ecs-cluster-definition.yml b/compute/ecs_cluster/rvn-ecs-cluster-definition.yml index 271cb4f..120e38b 100644 --- a/compute/ecs_cluster/rvn-ecs-cluster-definition.yml +++ b/compute/ecs_cluster/rvn-ecs-cluster-definition.yml @@ -3,8 +3,8 @@ definition: name: ECS Cluster description: Production-ready AWS ECS cluster with Fargate, Fargate Spot, optional EC2 capacity, and shared load balancers. release: - version: 0.1.3 - description: Fix all-protocol security group rule updates + version: 0.2.0 + description: Automatically derive the default capacity provider so the cluster default strategy commits to a single provider family module: inputs: - id: network diff --git a/compute/ecs_cluster/tests/basic.tftest.hcl b/compute/ecs_cluster/tests/basic.tftest.hcl index 5115990..87b25bb 100644 --- a/compute/ecs_cluster/tests/basic.tftest.hcl +++ b/compute/ecs_cluster/tests/basic.tftest.hcl @@ -879,9 +879,11 @@ run "ec2_custom_weights" { ec2_base = 0 } + # AWS rejects default strategies mixing Fargate and EC2 providers, so the + # default strategy commits to a single family (EC2 wins when enabled). 
assert { - condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 2 - error_message = "Should have 2 capacity provider strategies (Fargate + EC2)" + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 1 + error_message = "Default strategy should contain only the EC2 capacity provider when EC2 is enabled" } } diff --git a/compute/ecs_cluster/tests/capacity_provider_strategy.tftest.hcl b/compute/ecs_cluster/tests/capacity_provider_strategy.tftest.hcl new file mode 100644 index 0000000..d17cc17 --- /dev/null +++ b/compute/ecs_cluster/tests/capacity_provider_strategy.tftest.hcl @@ -0,0 +1,258 @@ +# Default Capacity Provider Strategy Tests +# +# AWS rejects default capacity provider strategies that mix Fargate and EC2 +# (Auto Scaling group) capacity providers. These tests verify that the default +# strategy always commits to a single family, controlled by +# capacity_provider_default. +# +# Run with: tofu test + +mock_provider "aws" { + override_data { + target = data.aws_caller_identity.current + values = { + account_id = "123456789012" + } + } + + override_data { + target = data.aws_region.current + values = { + id = "us-east-1" + name = "us-east-1" + } + } + + override_data { + target = data.aws_ssm_parameter.ecs_optimized_ami + values = { + value = "ami-0123456789abcdef0" + } + } + + override_data { + target = data.aws_elb_service_account.current + values = { + arn = "arn:aws:iam::127311923021:root" + } + } + + override_resource { + target = aws_iam_instance_profile.ecs_instance + values = { + arn = "arn:aws:iam::123456789012:instance-profile/test-cluster-ecs-instance" + } + } + + override_resource { + target = aws_launch_template.ecs + values = { + arn = "arn:aws:ec2:us-east-1:123456789012:launch-template/lt-0123456789abcdef" + id = "lt-0123456789abcdef" + } + } + + override_resource { + target = module.ecs_autoscaling.aws_autoscaling_group.this + values = { + arn = 
"arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:12345678-1234-1234-1234-123456789012:autoScalingGroupName/test-cluster-ecs" + } + } +} + +variables { + name = "test-cluster" + vpc_id = "vpc-12345678" + private_subnet_ids = ["subnet-private1", "subnet-private2"] +} + +################################################################################ +# Implicit family selection (capacity_provider_default = null) +################################################################################ + +# Fargate only (module defaults): default strategy is FARGATE only +run "defaults_to_fargate" { + command = plan + + assert { + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 1 + error_message = "Default strategy should contain exactly one entry" + } + + assert { + condition = anytrue([for s in aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy : s.capacity_provider == "FARGATE"]) + error_message = "Default strategy should contain FARGATE" + } +} + +# Fargate + Fargate Spot: both share the default strategy (same AWS family) +run "fargate_and_spot_share_default_strategy" { + command = plan + + variables { + fargate_spot_enabled = true + } + + assert { + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 2 + error_message = "Default strategy should contain FARGATE and FARGATE_SPOT" + } + + assert { + condition = anytrue([for s in aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy : s.capacity_provider == "FARGATE_SPOT"]) + error_message = "Default strategy should contain FARGATE_SPOT" + } +} + +# Fargate disabled, Spot enabled: falls back to FARGATE_SPOT +run "defaults_to_spot_when_fargate_disabled" { + command = plan + + variables { + fargate_enabled = false + fargate_spot_enabled = true + } + + assert { + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 1 + error_message = 
"Default strategy should contain exactly one entry" + } + + assert { + condition = anytrue([for s in aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy : s.capacity_provider == "FARGATE_SPOT"]) + error_message = "Default strategy should contain FARGATE_SPOT" + } +} + +# EC2 enabled alongside Fargate (the failed terratest run scenario): +# EC2 wins the default strategy; Fargate stays attached but out of the strategy +run "ec2_wins_default_strategy" { + command = plan + + variables { + ec2_instance_type = "t3.medium" + fargate_enabled = true + fargate_spot_enabled = true + } + + assert { + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 1 + error_message = "Default strategy must not mix Fargate and EC2 capacity providers" + } + + assert { + condition = anytrue([for s in aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy : s.capacity_provider == "test-cluster-ec2"]) + error_message = "Default strategy should contain the EC2 capacity provider" + } + + assert { + condition = contains(aws_ecs_cluster_capacity_providers.this.capacity_providers, "FARGATE") + error_message = "FARGATE should still be attached to the cluster" + } + + assert { + condition = contains(aws_ecs_cluster_capacity_providers.this.capacity_providers, "FARGATE_SPOT") + error_message = "FARGATE_SPOT should still be attached to the cluster" + } + + assert { + condition = contains(aws_ecs_cluster_capacity_providers.this.capacity_providers, "test-cluster-ec2") + error_message = "The EC2 capacity provider should be attached to the cluster" + } +} + +################################################################################ +# Explicit family selection +################################################################################ + +# EC2 enabled but Fargate explicitly chosen as the default family +run "explicit_fargate_family_with_ec2_enabled" { + command = plan + + variables { + ec2_instance_type = 
"t3.medium" + capacity_provider_default = "fargate" + } + + assert { + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 1 + error_message = "Default strategy should contain exactly one entry" + } + + assert { + condition = anytrue([for s in aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy : s.capacity_provider == "FARGATE"]) + error_message = "Default strategy should contain FARGATE when the fargate family is chosen explicitly" + } + + assert { + condition = contains(aws_ecs_cluster_capacity_providers.this.capacity_providers, "test-cluster-ec2") + error_message = "The EC2 capacity provider should still be attached to the cluster" + } +} + +# Fargate Spot explicitly chosen even though Fargate is enabled +run "explicit_spot_family" { + command = plan + + variables { + fargate_enabled = true + fargate_spot_enabled = true + capacity_provider_default = "fargate_spot" + } + + assert { + condition = length(aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy) == 1 + error_message = "Default strategy should contain exactly one entry" + } + + assert { + condition = anytrue([for s in aws_ecs_cluster_capacity_providers.this.default_capacity_provider_strategy : s.capacity_provider == "FARGATE_SPOT"]) + error_message = "Default strategy should contain only FARGATE_SPOT when the fargate_spot family is chosen explicitly" + } +} + +################################################################################ +# Validation +################################################################################ + +run "invalid_family_value" { + command = plan + + variables { + capacity_provider_default = "bogus" + } + + expect_failures = [var.capacity_provider_default] +} + +run "ec2_family_requires_ec2_enabled" { + command = plan + + variables { + capacity_provider_default = "ec2" + } + + expect_failures = [var.capacity_provider_default] +} + +run "fargate_family_requires_fargate_enabled" { 
+ command = plan + + variables { + fargate_enabled = false + fargate_spot_enabled = true + capacity_provider_default = "fargate" + } + + expect_failures = [var.capacity_provider_default] +} + +run "spot_family_requires_spot_enabled" { + command = plan + + variables { + capacity_provider_default = "fargate_spot" + } + + expect_failures = [var.capacity_provider_default] +} diff --git a/compute/ecs_cluster/variables.tf b/compute/ecs_cluster/variables.tf index 52b3f79..6125c0d 100644 --- a/compute/ecs_cluster/variables.tf +++ b/compute/ecs_cluster/variables.tf @@ -89,6 +89,36 @@ variable "container_insights_enabled" { default = true } +################################################################################ +# Default Capacity Provider Strategy +################################################################################ + +variable "capacity_provider_default" { + type = string + description = "Capacity provider family used for the cluster's default strategy. AWS rejects default strategies that mix Fargate and EC2 (Auto Scaling group) capacity providers, so the default strategy must commit to a single family; services can still target any attached capacity provider via their own strategy. Valid values: 'ec2', 'fargate' (also includes Fargate Spot when enabled), 'fargate_spot'. When null, defaults to 'ec2' if the EC2 capacity provider is enabled, then 'fargate' if enabled, and finally 'fargate_spot'." + default = null + + validation { + condition = var.capacity_provider_default == null || contains(["ec2", "fargate", "fargate_spot"], coalesce(var.capacity_provider_default, "null")) + error_message = "The capacity_provider_default must be 'ec2', 'fargate', or 'fargate_spot'." + } + + validation { + condition = var.capacity_provider_default != "ec2" || var.ec2_instance_type != null + error_message = "The capacity_provider_default 'ec2' requires ec2_instance_type to be set." 
+ } + + validation { + condition = var.capacity_provider_default != "fargate" || var.fargate_enabled + error_message = "The capacity_provider_default 'fargate' requires fargate_enabled to be true." + } + + validation { + condition = var.capacity_provider_default != "fargate_spot" || var.fargate_spot_enabled + error_message = "The capacity_provider_default 'fargate_spot' requires fargate_spot_enabled to be true." + } +} + ################################################################################ # Fargate Capacity Provider ################################################################################ diff --git a/compute/ecs_service/.terraform.lock.hcl b/compute/ecs_service/.terraform.lock.hcl index 28ffb2f..5c17aaf 100644 --- a/compute/ecs_service/.terraform.lock.hcl +++ b/compute/ecs_service/.terraform.lock.hcl @@ -1,25 +1,26 @@ -# This file is maintained automatically by "tofu init". +# This file is maintained automatically by "terraform init". # Manual edits may be lost in future updates. 
-provider "registry.opentofu.org/hashicorp/aws" { - version = "6.39.0" - constraints = ">= 5.0.0" +provider "registry.terraform.io/hashicorp/aws" { + version = "6.50.0" + constraints = ">= 6.0.0, >= 6.21.0" hashes = [ - "h1:c9SG8ZdYgzqpxORpTqeLFeXW4qQQ8GMGCcUkU+FAfQM=", - "zh:00a6c0d8b5b86833087e367b632e9ab73fb8db9c43569020ebd0489dc2c919ce", - "zh:05f2b56211f4c8a0b66a093d025187cbc7be086dedef62306f5a28290598ebdc", - "zh:24d97a31d5ab814c33ed32a5b7674f1a15544b2367a95bddd00cfdd8d6b82740", - "zh:258194e24ac07ee194d580ca25a25fa7bc48fa40fed4fd58352b0a64da0da4c9", - "zh:315337e5f0ccafeadf490f117151b52c6d66244bf652f4fee975eddda662af3b", - "zh:38573dd56cca8c0ffe33396cf17cc8bd13de1d27d3c4da4177e485d174f1eaf0", - "zh:4baa806c5eb8faae95cea3f1dfafb153b5e3e96c5b30a2102072da4f032d2d9b", - "zh:4f258106baca7e00a6904b2353579d283e4400a75cd0353a25e057921e8a8d96", - "zh:62e5d4628d03883a6c2a6e3c297eb54df9b5935e9e3a655dbb1c6c5ddaf7ea33", - "zh:8af5fae01c1cef65d149fa6fe47e94cf46ffa97d29e8f2dfe41aeae01da590ea", - "zh:a8240b40f7be408ac24897597a85dc4fe56f390224b11ecad2c1327e686fca58", - "zh:c549eee2a0cf0e2c4a676614d990121b685beab0047b1073407ee26247c4be13", - "zh:cfed074ba8948c75445c74c69722cb17c960024b1917b4f26905aa9c9ac4e667", - "zh:d6f4f4fa01e33d0d546705e2776f38d0b4f2847827b3f07ecde87cc02ef3d23e", - "zh:e7239b349c3149e4670750481b687c5c828908fd09f2196d7af1ac1b4d83e80b", + "h1:D8uNiOpl3UkAX4zI5T47ALMiRFXTa1XfdQC+TBu3RmE=", + "zh:0072806bb262c6d86bc25b4a75750e469881144c14818afdba7b82db840e1588", + "zh:1ebc2dae335dad7a8b16a1985b69a63a14954282bb44fdba7d5103f77551ac7b", + "zh:2dab48fe8f3193b8216d578ac1e3674fa566435cc7dbce2953d55b72e31d0241", + "zh:2fc3d3029c2b7429472391ef339672e1fca8e6ff32c8a519bf3acedafa7e24fe", + "zh:38a36e64e7212f6cedac861ea4d449cce07131b3378de601bf9d49a99e000208", + "zh:3ac70758ed251ce78b7f541a5a79cc6fe56474412783ae1decef719bdd0f30bf", + "zh:4385d3903e685bddb2b8005b4eb7db89f030267d4d03c7d792d2f5e739cc874a", + 
"zh:4cce0760b87fbafd51f30faec2a737f4183b7c615f4a86557f7d3c893a610dc5", + "zh:4feaeed18694239b896c6415d9a1e5ef89e1da4f4ad60924aa0522adeb1f6599", + "zh:502fca2be1c95f443c3e67d0555601d1de65b4ca82d197c059e9c868360e3a0a", + "zh:57d037f6fdd045f2660909c3bdface9622d81165ce647479cba98d1f353c5eab", + "zh:5dc5a0b915c2ac5256d909458f5c8e40b35f78b3a36ea893c86624eaf6c54e37", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:b84c87c58a320adbb2c74a4cad03ae5aac7f2eae21db26f00fdde98c8c4d4523", + "zh:c895f1d5cbcbeff77850ac99efd36bde0048d4e909b296882331b9b9ebf48cfa", + "zh:ead82831683619124597a1f170dd31e9b293e9cf22f558cb166d5e734fcd11e4", ] } diff --git a/compute/ecs_service/README.md b/compute/ecs_service/README.md index df5e542..b9ca61c 100644 --- a/compute/ecs_service/README.md +++ b/compute/ecs_service/README.md @@ -1,12 +1,14 @@ # ECS Service Module -This module creates an Amazon ECS service with a placeholder task definition, load balancer integration, auto scaling, and service discovery. It supports both rolling and blue/green deployment strategies. +This module creates an Amazon ECS service with a placeholder task definition, load balancer integration, auto scaling, and service discovery. It supports the native ECS deployment strategies: rolling, blue/green, linear, and canary. -**Note:** This module provisions infrastructure with a placeholder container (hello-world). An external deployment controller (e.g. CodeDeploy or another CI/CD tool) is expected to deploy the actual application by updating the task definition. +**Note:** This module provisions infrastructure with a placeholder container (hello-world). The Flightcontrol deploy manager deploys the actual application by registering task definitions and calling UpdateService with the authoritative `deploymentConfiguration` (strategy, bake times, pause lifecycle hooks) on every deploy. 
+ +When a load balancer is attached, the module always provisions the production + alternate target-group pair, the ECS infrastructure role, and the service's `load_balancer.advanced_configuration` — so the deployment strategy is a **per-deployment decision**: eligible services can switch between rolling / blue_green / linear / canary on a single deploy with no Terraform changes. ALB traffic-shift deployments require a single production listener rule. `deployment_type` only seeds the strategy at create time. ## Features -- ECS service with configurable deployment strategies (rolling or blue/green) +- ECS service with configurable native deployment strategies (rolling, blue/green, linear, canary) - Placeholder task definition (hello-world) - the external deployment controller updates with the actual application - IAM roles for task execution and task roles with optional ECS Exec support - Security group for ECS tasks with configurable ingress rules @@ -15,7 +17,7 @@ This module creates an Amazon ECS service with a placeholder task definition, lo - NLB listener creation with TLS support - Application Auto Scaling with target tracking and scheduled scaling - AWS Cloud Map service discovery integration -- Blue/green deployment infrastructure (managed by an external deployment controller) +- Native traffic-shift deployment infrastructure (production/alternate target groups, ECS infrastructure role, advanced_configuration) provisioned for every load-balanced service so the strategy can change per deployment - Support for EFS and Docker volume configurations - Capacity provider strategy support for mixed Fargate/EC2 deployments @@ -114,9 +116,10 @@ module "api_service" { } } -# Use the outputs to configure an external deployment controller -# module.api_service.blue_target_group_arn -# module.api_service.green_target_group_arn +# Target groups + ECS infrastructure role for the traffic shift: +# module.api_service.production_target_group_arn +# 
module.api_service.alternate_target_group_arn +# module.api_service.ecs_infrastructure_role_arn ``` ### With Service Discovery @@ -253,7 +256,7 @@ module "worker_service" { | Name | Version | |------|---------| | opentofu/terraform | >= 1.10.0 | -| aws | >= 5.0 | +| aws | >= 6.21 | ## Inputs @@ -314,7 +317,15 @@ module "worker_service" { | Name | Description | Type | Default | Required | |------|-------------|------|---------|----------| | desired_count | Desired number of tasks (0 for infrastructure-first) | `number` | `0` | no | -| deployment_type | Deployment type: rolling or blue_green | `string` | `"rolling"` | no | +| deployment_type | Initial deployment strategy for direct Terraform use; Ravion stacks use rolling and set blue_green/linear/canary per deploy via UpdateService | `string` | `"rolling"` | no | +| deployment_strategy_config | Initial bake/canary/linear tuning for direct Terraform use; Ravion stacks set this per deploy through the deploy manager | `object` | `{}` | no | +| test_listener_rule_arn | Optional ALB listener rule ARN for test traffic during blue/green validation when the module-created green listener rule is not enabled | `string` | `null` | no | +| green_alb_listener_rule_enabled | Create a dedicated ALB listener rule that routes test traffic to the green (alternate) target group during native traffic-shift deployments, so the new revision can be validated before production traffic shifts. ALB-only; no effect for NLB services | `bool` | `true` | no | +| test_traffic_condition_type | Which request attribute distinguishes test traffic for the green rule: `header` (test_header_name/value) or `query-string` (test_query_string_key/value). 
One type per service — ALB AND-combines conditions and ECS wires exactly one test rule, so genuine "header OR query-string" matching is not possible natively | `string` | `"query-string"` | no | +| test_header_name | HTTP header name that routes test traffic to the green target group when test_traffic_condition_type is `header` | `string` | `"X-Ravion-Test"` | no | +| test_header_value | Value paired with test_header_name when test_traffic_condition_type is `header` | `string` | `"1"` | no | +| test_query_string_key | Query-string key that routes test traffic to the green target group when test_traffic_condition_type is `query-string` (e.g. `?__x-rvn-test__=1`) | `string` | `"__x-rvn-test__"` | no | +| test_query_string_value | Value paired with test_query_string_key when test_traffic_condition_type is `query-string` | `string` | `"1"` | no | | deployment_minimum_healthy_percent | Minimum healthy percent during deployment | `number` | `100` | no | | deployment_maximum_percent | Maximum percent during deployment | `number` | `200` | no | | execute_command_enabled | Enable ECS Exec for debugging | `bool` | `false` | no | @@ -405,23 +416,20 @@ The `service_discovery` object includes: | security_group_id | The ID of the service security group | | security_group_arn | The ARN of the service security group | -### Target Groups - Rolling Deployment +### Target Groups -| Name | Description | -|------|-------------| -| target_group_arn | Target group ARN (null if LB disabled or blue/green) | -| target_group_arn_suffix | Target group ARN suffix for CloudWatch metrics | -| target_group_name | Target group name | - -### Target Groups - Blue/Green Deployment +A production (tg-1) + alternate (tg-2) pair always exists when a load balancer is attached. Rolling deployments only ever serve from the production target group; native traffic-shift deployments alternate between the two. 
| Name | Description | |------|-------------| -| blue_target_group_arn | Blue target group ARN | -| blue_target_group_name | Blue target group name | -| green_target_group_arn | Green target group ARN | -| green_target_group_name | Green target group name | -| target_group_arns | Map of all target group ARNs (primary for rolling, blue/green for blue_green) | +| production_target_group_arn | Production target group ARN (null if LB disabled) | +| production_target_group_name | Production target group name | +| alternate_target_group_arn | Alternate target group ARN ECS shifts traffic to during native deployments | +| alternate_target_group_name | Alternate target group name | +| target_group_arn | Alias of production_target_group_arn | +| target_group_arn_suffix | Production target group ARN suffix for CloudWatch metrics | +| target_group_arns | Map of all target group ARNs (production + alternate) | +| ecs_infrastructure_role_arn | IAM role ECS assumes to manage listener wiring during native traffic-shift deployments | ### NLB Listener @@ -525,7 +533,7 @@ The `service_discovery` object includes: ║ │ ┌───────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ ║ ║ │ │ • default_tags = { ManagedBy = "terraform", Module = "compute/ecs_service" } │ │ ║ ║ │ │ • tags = merge(default_tags, var.tags) │ │ ║ -║ │ │ • deployment_controller_type = var.deployment_type == "blue_green" ? 
"CODE_DEPLOY" : "ECS" │ │ ║ +║ │ │ • deployment_controller_type = "ECS" (always; strategy is per-deployment) │ │ ║ ║ │ │ • placeholder_container_name = "app" │ │ ║ ║ │ │ │ │ ║ ║ │ │ FEATURE FLAGS: │ │ ║ @@ -541,7 +549,7 @@ The `service_discovery` object includes: ║ ┌─────────────────────────────┐ ┌─────────────────────────────────┐ ┌─────────────────────────────────────────┐ ║ ║ │ TASK DEFINITION │ │ SERVICE CONFIG │ │ DEPLOYMENT │ ║ ║ ├─────────────────────────────┤ ├─────────────────────────────────┤ ├─────────────────────────────────────────┤ ║ -║ │ • task_cpu │ │ • desired_count │ │ • deployment_type (rolling/blue_green) │ ║ +║ │ • task_cpu │ │ • desired_count │ │ • deployment_type (strategy seed) │ ║ ║ │ • task_memory │ │ • execute_command_enabled │ │ • deployment_minimum_healthy_percent │ ║ ║ │ • container_port │ │ • new_deployment_forcing_enabled│ │ • deployment_maximum_percent │ ║ ║ │ • launch_type │ │ • steady_state_wait_enabled │ │ • deployment_circuit_breaker │ ║ @@ -634,7 +642,7 @@ The `service_discovery` object includes: ║ │ │ _breaker(dynamic)│ │ strategy (dynamic)│ │ ║ ║ │ └──────────────────┘ └───────────────────┘ │ ║ ║ │ │ ║ -║ │ deployment_controller.type = ECS | CODE_DEPLOY │ ║ +║ │ deployment_controller.type = ECS (always) │ ║ ║ └────────────────────────────────────┬─────────────────────────────────────┘ ║ ║ │ ║ ║ ┌─────────────────────────────────────────┬───────────────────────┼───────────────────────┬───────────────┐ ║ @@ -644,15 +652,15 @@ The `service_discovery` object includes: ║ │ TARGET GROUPS │ │ aws_lb_listener_rule.alb │ │ aws_lb_listener │ │aws_service_discovery │ ║ ║ │ (conditional) │ │ (for_each: listener_rules) │ │ .nlb[0] │ │ _service.this[0] │ ║ ║ ├───────────────────────┤ ├───────────────────────────────┤ │ (count: 0 or 1) │ │(count: 0 or 1) │ ║ -║ │ Rolling: │ │ • path-pattern condition │ ├──────────────────┤ ├────────────────────────┤ ║ +║ │ Always (when LB): │ │ • path-pattern condition │ ├──────────────────┤ 
├────────────────────────┤ ║ ║ │ aws_lb_target_group │ │ • host-header condition │ │ • TCP/TLS/UDP │ │ • Cloud Map DNS │ ║ -║ │ .this[0] │ │ • http-header condition │ │ • Certificate │ │ • A or SRV records │ ║ -║ │ │ │ • query-string condition │ │ • SSL policy │ │ • Custom health check │ ║ -║ │ Blue/Green: │ │ • source-ip condition │ └──────────────────┘ └────────────────────────┘ ║ -║ │ aws_lb_target_group │ │ lifecycle: ignore action │ ║ -║ │ .tg_1[0] (blue) │ │ (external controller swaps) │ ║ -║ │ aws_lb_target_group │ └───────────────────────────────┘ ║ -║ │ .tg_2[0] (green) │ ║ +║ │ .tg_1[0] (prod) │ │ • http-header condition │ │ • Certificate │ │ • A or SRV records │ ║ +║ │ aws_lb_target_group │ │ • query-string condition │ │ • SSL policy │ │ • Custom health check │ ║ +║ │ .tg_2[0] (alt) │ │ • source-ip condition │ └──────────────────┘ └────────────────────────┘ ║ +║ │ │ │ lifecycle: ignore action │ ║ +║ │ │ │ (ECS controller rewrites) │ ║ +║ │ │ └───────────────────────────────┘ ║ +║ │ │ ║ ║ └───────────────────────┘ ║ ║ ║ ║ ┌─────────────────────────────────────────────────────────────────────────────────────┐ ║ @@ -699,13 +707,13 @@ The `service_discovery` object includes: ║ └─────────────────────────────────────────┘ ║ ║ ║ ║ ┌─────────────────────────────────────────┐ ┌─────────────────────────────────────────┐ ║ -║ │ TARGET GROUPS (Rolling) │ │ TARGET GROUPS (Blue/Green) │ ║ +║ │ TARGET GROUPS (always w/ LB) │ │ TRAFFIC-SHIFT INFRA │ ║ ║ ├─────────────────────────────────────────┤ ├─────────────────────────────────────────┤ ║ -║ │ • target_group_arn │ │ • blue_target_group_arn │ ║ -║ │ • target_group_arn_suffix │ │ • blue_target_group_name │ ║ -║ │ • target_group_name │ │ • green_target_group_arn │ ║ -║ └─────────────────────────────────────────┘ │ • green_target_group_name │ ║ -║ │ • target_group_arns (map) │ ║ +║ │ • production_target_group_arn │ │ • alternate_target_group_arn │ ║ +║ │ • production_target_group_name │ │ • alternate_target_group_name │ 
║ +║ │ • target_group_arn (alias) │ │ • ecs_infrastructure_role_arn │ ║ +║ └─────────────────────────────────────────┘ │ • target_group_arns (map) │ ║ +║ │ │ ║ ║ └─────────────────────────────────────────┘ ║ ║ ║ ║ ┌─────────────────────────────────────────┐ ┌─────────────────────────────────────────┐ ║ @@ -777,8 +785,8 @@ The `service_discovery` object includes: ║ │ │ │ │ │ ║ ║ ▼ ▼ ▼ ▼ ▼ ║ ║ aws_lb_target_group aws_lb_listener_rule aws_lb_listener aws_appautoscaling_ aws_service_discovery_ ║ -║ .this[0] / .tg_1[0] .alb (for_each) .nlb[0] target.this[0] service.this[0] ║ -║ / .tg_2[0] │ ║ +║ .tg_1[0] + .tg_2[0] .alb (for_each) .nlb[0] target.this[0] service.this[0] ║ +║ │ ║ ║ │ ║ ║ ┌───────────────────────────────────────────┴───────────────────────────┐ ║ ║ │ │ ║ @@ -801,9 +809,9 @@ The `service_discovery` object includes: | `aws_ecs_task_definition` | 1 | Container configuration (placeholder) | | `aws_ecs_service` | 1 | Core ECS service resource | | `module.security_group` | 1 | Security group for tasks | -| `aws_lb_target_group.this` | 0 or 1 | Target group for rolling deployment | -| `aws_lb_target_group.tg_1` | 0 or 1 | Blue target group for blue/green | -| `aws_lb_target_group.tg_2` | 0 or 1 | Green target group for blue/green | +| `aws_lb_target_group.tg_1` | 0 or 1 | Production target group (created whenever a load balancer is attached) | +| `aws_lb_target_group.tg_2` | 0 or 1 | Alternate target group ECS shifts traffic to during native deployments | +| `aws_iam_role.ecs_infrastructure` | 0 or 1 | Role ECS assumes for load-balancer wiring during traffic shifts | | `aws_lb_listener_rule.alb` | for_each | ALB listener rules | | `aws_lb_listener.nlb` | 0 or 1 | NLB listener | | `aws_service_discovery_service` | 0 or 1 | Cloud Map service | @@ -823,25 +831,72 @@ The module deploys `public.ecr.aws/docker/library/hello-world:latest` as a place The placeholder container prints a message and exits, so load balancer health checks will fail until the actual 
application is deployed. This is expected behavior. -### When should I use rolling vs blue/green deployment? +### When should I use which deployment strategy? + +All four strategies run on the native ECS deployment controller — no +CodeDeploy and no external controller. + +The same infrastructure (2 target groups + infrastructure role) backs every load-balanced service, so eligible services can switch strategy on their next deployment. ALB traffic-shift deployments require a single production listener rule. + +| Feature | Rolling | Blue/Green | Linear | Canary | +|---------|---------|------------|--------|--------| +| **Traffic shift** | Task replacement (min/max healthy %) | All-at-once + bake | Equal % steps + per-step bake | Small % first, then the rest | +| **Rollback** | Circuit breaker | Instant (old revision kept through bake) | Instant | Instant | +| **Testing** | None | Test-listener validation before shift | Per-step validation | Canary validation | +| **Target groups used** | Production only | Both | Both | Both | + +**Use rolling when:** simple deployments with automatic rollback are sufficient. + +**Use blue/green when:** you want full validation of the new revision (optionally via a test listener rule) before shifting all production traffic at once, with instant rollback during the bake window. + +**Use linear/canary when:** you want production traffic to shift gradually with monitoring between steps. + +### How do I access the standby service during a traffic-shift deployment? + +For ALB-backed blue_green, linear, and canary deployments, the module creates a test listener rule that routes matching requests to the standby, or green, task set on the alternate target group. The request must match the same host/path conditions as the production listener rule and include the test selector. 
+ +By default, the selector is the query parameter `__x-rvn-test__=1`: + +```bash +curl "https://api.example.com/health?__x-rvn-test__=1" +``` + +The alternate target group only has registered targets while ECS is running a traffic-shift deployment. Outside that window, the standby route may have no healthy targets. -| Feature | Rolling (ECS) | Blue/Green (external controller) | -|---------|--------------|----------------------------------| -| **Complexity** | Simple | More complex (requires an external deployment controller) | -| **Rollback** | Automatic via circuit breaker | Instant traffic switch | -| **Traffic shift** | Gradual (min/max healthy %) | All-at-once or gradual | -| **Testing** | No pre-production testing | Test green before switching | -| **Infrastructure** | 1 target group | 2 target groups | +To override the query parameter in Terraform: -**Use rolling when:** -- Simple deployments with automatic rollback are sufficient -- You want minimal infrastructure complexity -- Built-in ECS deployment features meet your needs +```hcl +test_query_string_key = "preview" +test_query_string_value = "green" +``` -**Use blue/green when:** -- You need instant rollback capability -- You want to test in production before switching traffic -- You need advanced deployment strategies (canary, linear) +Then request `?preview=green`. + +To use an HTTP header instead of a query parameter: + +```hcl +test_traffic_condition_type = "header" +test_header_name = "X-Ravion-Test" +test_header_value = "1" +``` + +Then send the header with the request: + +```bash +curl -H "X-Ravion-Test: 1" "https://api.example.com/health" +``` + +When using the Ravion ECS Web Server module definition, set the same lower-level variables through Advanced Terraform variables. 
For example, to use a header selector: + +```json +{ + "test_traffic_condition_type": "header", + "test_header_name": "X-Ravion-Test", + "test_header_value": "1" +} +``` + +The ALB rule can use one selector type per service: either `query-string` or `header`. ### How do I use this module with an NLB instead of an ALB? @@ -1001,22 +1056,22 @@ Uses the ECS deployment controller for zero-downtime rolling updates: - Built-in circuit breaker with optional rollback - Simple and fully managed by ECS -### Blue/Green Deployment +### Native Traffic-Shift Strategies (blue_green / linear / canary) -Sets up infrastructure for blue/green deployments managed by an external controller: -- Creates two target groups (tg-1 and tg-2) -- Sets deployment controller to CODE_DEPLOY -- Outputs all ARNs needed to wire up the external controller -- The external deployment controller (application, deployment group, etc.) must be managed outside of this module +The infrastructure for the ECS deployment controller's built-in traffic shifting is provisioned for **every** load-balanced service — not just those created with a native `deployment_type` — so the strategy can change between deployments without Terraform changes: +- Two target groups (tg-1 = production, tg-2 = alternate); rolling deployments only ever use tg-1 +- An ECS infrastructure IAM role (AmazonECSInfrastructureRolePolicyForLoadBalancers) that ECS assumes to rewrite listener rules and (de)register targets during the shift +- The service's `load_balancer.advanced_configuration` (alternate target group, production listener rule, optional test listener rule, infrastructure role) +- `deployment_configuration` is seeded from `deployment_type` / `deployment_strategy_config` at create time only; the Flightcontrol deploy manager passes the authoritative configuration — including pause lifecycle hooks — on every UpdateService call, so the block is in `ignore_changes` ## Notes - The module creates a security group that allows inbound 
traffic from the VPC CIDR on the container port - For Fargate tasks in public subnets without NAT, set `public_ip_assignment_enabled = true` - The placeholder container uses hello-world from public ECR - no special permissions needed -- For blue/green deployments, the module only creates the infrastructure; the external deployment controller must be configured separately +- For blue_green/linear/canary deployments, ECS itself executes the traffic shift; the Flightcontrol deploy manager drives it via UpdateService and pause lifecycle hooks - The task definition has `lifecycle { ignore_changes = all }` since the external deployment controller manages updates -- Listener rules have `lifecycle { ignore_changes = [action] }` for blue/green deployments where the external controller switches target groups +- Listener rules have `lifecycle { ignore_changes = [action] }` — the ECS deployment controller rewrites the forward action (weighted target groups) during native traffic shifts - When using `ALBRequestCountPerTarget` metric for auto scaling, a load balancer must be configured - The `desired_count` defaults to 0 for infrastructure-first provisioning; the external controller will manage the actual count - Target group names are truncated to meet AWS naming requirements (max 32 characters) diff --git a/compute/ecs_service/auto_scaling.tf b/compute/ecs_service/auto_scaling.tf index ee49c10..b13c7e5 100644 --- a/compute/ecs_service/auto_scaling.tf +++ b/compute/ecs_service/auto_scaling.tf @@ -96,8 +96,6 @@ resource "aws_appautoscaling_scheduled_action" "this" { ################################################################################ locals { - primary_target_group_arn_suffix = local.enable_load_balancer ? ( - var.deployment_type == "rolling" ? aws_lb_target_group.this[0].arn_suffix : aws_lb_target_group.tg_1[0].arn_suffix - ) : "" + primary_target_group_arn_suffix = local.enable_load_balancer ? 
aws_lb_target_group.tg_1[0].arn_suffix : "" } diff --git a/compute/ecs_service/ecs_service.tf b/compute/ecs_service/ecs_service.tf index e719829..737295f 100644 --- a/compute/ecs_service/ecs_service.tf +++ b/compute/ecs_service/ecs_service.tf @@ -40,7 +40,8 @@ resource "aws_ecs_service" "this" { type = local.deployment_controller_type } - # Deployment circuit breaker (only for ECS deployment controller) + # Deployment circuit breaker (rolling strategy only — native + # traffic-shift strategies have their own rollback semantics) dynamic "deployment_circuit_breaker" { for_each = var.deployment_type == "rolling" && var.deployment_circuit_breaker.enable ? [1] : [] content { @@ -49,27 +50,63 @@ resource "aws_ecs_service" "this" { } } - # Deployment min/max healthy percent - deployment_minimum_healthy_percent = var.deployment_type == "rolling" ? var.deployment_minimum_healthy_percent : null - deployment_maximum_percent = var.deployment_type == "rolling" ? var.deployment_maximum_percent : null - - # Load balancer configuration - Rolling deployment - dynamic "load_balancer" { - for_each = local.enable_load_balancer && var.deployment_type == "rolling" ? [1] : [] + # Native deployment strategy. Seeds the strategy + traffic-shift + # tuning at create time; the Flightcontrol deploy manager passes the + # authoritative deploymentConfiguration (including pause lifecycle + # hooks) on every UpdateService call, so this block is in + # ignore_changes below. + dynamic "deployment_configuration" { + for_each = local.is_native_traffic_shift ? [1] : [] content { - target_group_arn = aws_lb_target_group.this[0].arn - container_name = local.lb_container_name - container_port = local.lb_container_port + strategy = local.deployment_strategy + bake_time_in_minutes = var.deployment_strategy_config.bake_time_in_minutes + + dynamic "canary_configuration" { + for_each = var.deployment_type == "canary" ? 
[1] : [] + content { + canary_percent = var.deployment_strategy_config.canary.canary_percent + canary_bake_time_in_minutes = var.deployment_strategy_config.canary.canary_bake_time_in_minutes + } + } + + dynamic "linear_configuration" { + for_each = var.deployment_type == "linear" ? [1] : [] + content { + step_percent = var.deployment_strategy_config.linear.step_percent + step_bake_time_in_minutes = var.deployment_strategy_config.linear.step_bake_time_in_minutes + } + } } } - # Load balancer configuration - Blue/Green deployment (attach to blue initially) + # Deployment min/max healthy percent + deployment_minimum_healthy_percent = var.deployment_type == "rolling" ? var.deployment_minimum_healthy_percent : null + deployment_maximum_percent = var.deployment_type == "rolling" ? var.deployment_maximum_percent : null + + # Load balancer configuration. advanced_configuration is always wired + # (production + alternate target groups, listener rule, infrastructure + # role) so the deployment strategy stays a per-deployment decision: + # rolling deployments serve from the production target group (tg-1) + # only, while native traffic-shift deployments alternate between tg-1 + # and tg-2, rewriting the production listener rule via the + # infrastructure role. dynamic "load_balancer" { - for_each = local.enable_load_balancer && var.deployment_type == "blue_green" ? [1] : [] + for_each = local.enable_load_balancer ? [1] : [] content { target_group_arn = aws_lb_target_group.tg_1[0].arn container_name = local.lb_container_name container_port = local.lb_container_port + + advanced_configuration { + alternate_target_group_arn = aws_lb_target_group.tg_2[0].arn + production_listener_rule = ( + local.enable_nlb_listener + ? 
aws_lb_listener.nlb[0].arn
+          : aws_lb_listener_rule.alb["0"].arn
+        )
+        test_listener_rule = local.test_listener_rule_arn
+        role_arn           = aws_iam_role.ecs_infrastructure[0].arn
+      }
     }
   }
 
@@ -106,16 +143,47 @@ resource "aws_ecs_service" "this" {
   # Dependencies
   depends_on = [
     aws_iam_role_policy_attachment.execution_base,
+    aws_iam_role_policy_attachment.ecs_infrastructure_elb,
     aws_lb_listener_rule.alb,
+    aws_lb_listener_rule.test,
   ]
 
-  # Lifecycle: desired_count is managed by autoscaling (or external controllers),
-  # so Terraform must not fight it on subsequent applies.
+  # Lifecycle: desired_count is managed by autoscaling, task_definition /
+  # load_balancer / deployment_configuration by the Flightcontrol deploy
+  # manager (UpdateService passes the authoritative strategy + pause
+  # lifecycle hooks on every deploy, and native traffic-shift deploys
+  # alternate the service between the production and alternate target
+  # groups), so Terraform must not fight them on subsequent applies.
   lifecycle {
     ignore_changes = [
       desired_count,
       task_definition,
       load_balancer,
+      deployment_configuration,
     ]
+
+    # try() keeps the check evaluable when load_balancer_attachment is null
+    # or listener_rules is unset, matching the traffic-shift precondition.
+    precondition {
+      condition = (
+        !local.enable_load_balancer
+        || local.enable_nlb_listener
+        || length(try(var.load_balancer_attachment.listener_rules, [])) > 0
+      )
+      error_message = "load_balancer_attachment requires either listener_rules (ALB) or nlb_listener so the production listener rule can be wired into advanced_configuration."
+    }
+
+    # The ECS advanced_configuration API accepts a single production
+    # listener rule, so during native traffic-shift deployments only the
+    # first rule is rewritten — any additional rules would keep
+    # forwarding to the old revision for the entire deployment.
+ precondition { + condition = ( + !local.is_native_traffic_shift + || local.enable_nlb_listener + || length(try(var.load_balancer_attachment.listener_rules, [])) <= 1 + ) + error_message = "Native traffic-shift strategies (blue_green/linear/canary) rewrite a single production listener rule; additional listener rules would keep serving the old revision throughout the deployment. Use at most one listener rule with these strategies." + } } } diff --git a/compute/ecs_service/iam_infrastructure.tf b/compute/ecs_service/iam_infrastructure.tf new file mode 100644 index 0000000..d16e831 --- /dev/null +++ b/compute/ecs_service/iam_infrastructure.tf @@ -0,0 +1,45 @@ +################################################################################ +# ECS Infrastructure Role +# +# Native traffic-shift deployments (blue_green / linear / canary) hand +# the load-balancer wiring to the ECS deployment controller: ECS assumes +# this role to register/deregister targets and rewrite the production / +# test listener rules while it shifts traffic between the production and +# alternate target groups. +# +# Created whenever a load balancer is attached (not just for native +# strategies) so the deploy manager can switch any service to a +# traffic-shift strategy on a per-deployment basis without a Terraform +# apply. Rolling deployments never cause ECS to assume it. +################################################################################ + +data "aws_iam_policy_document" "ecs_infrastructure_assume" { + count = local.enable_load_balancer ? 1 : 0 + + statement { + actions = ["sts:AssumeRole"] + + principals { + type = "Service" + identifiers = ["ecs.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "ecs_infrastructure" { + count = local.enable_load_balancer ? 
1 : 0 + + name_prefix = "${substr(var.name, 0, min(length(var.name), 26))}-infra-" + assume_role_policy = data.aws_iam_policy_document.ecs_infrastructure_assume[0].json + + tags = merge(local.tags, { + Name = "${var.name}-ecs-infrastructure" + }) +} + +resource "aws_iam_role_policy_attachment" "ecs_infrastructure_elb" { + count = local.enable_load_balancer ? 1 : 0 + + role = aws_iam_role.ecs_infrastructure[0].name + policy_arn = "arn:${data.aws_partition.current.partition}:iam::aws:policy/AmazonECSInfrastructureRolePolicyForLoadBalancers" +} diff --git a/compute/ecs_service/listener_rules.tf b/compute/ecs_service/listener_rules.tf index 7d238a1..46187a5 100644 --- a/compute/ecs_service/listener_rules.tf +++ b/compute/ecs_service/listener_rules.tf @@ -1,6 +1,10 @@ ################################################################################ # ALB Listener Rules -# For blue/green deployments, an external controller manages target group switching +# +# Rules initially forward to the production target group (tg-1). During +# native traffic-shift deployments (blue_green/linear/canary) the ECS +# deployment controller rewrites the rule's forward action between tg-1 +# and tg-2 via the infrastructure role, hence ignore_changes on action. ################################################################################ resource "aws_lb_listener_rule" "alb" { @@ -9,15 +13,35 @@ resource "aws_lb_listener_rule" "alb" { } : {} listener_arn = each.value.listener_arn - priority = each.value.priority + # The mirrored rule (index 0) takes the module-managed base priority when + # the green test rule is enabled so the test rule can sit one slot ahead; + # every other rule keeps its configured priority. + priority = ( + local.green_alb_listener_rule_enabled && each.key == "0" + ? 
local.green_production_priority + : each.value.priority + ) + # When the target groups have target-level stickiness the forward + # action must carry group-level stickiness or ELBv2 rejects the + # weighted forward ECS writes during traffic-shift deployments — see + # alb_group_stickiness_enabled in locals.tf. action { - type = "forward" - target_group_arn = ( - var.deployment_type == "blue_green" - ? aws_lb_target_group.tg_1[0].arn - : aws_lb_target_group.this[0].arn - ) + type = "forward" + target_group_arn = local.alb_group_stickiness_enabled ? null : aws_lb_target_group.tg_1[0].arn + + dynamic "forward" { + for_each = local.alb_group_stickiness_enabled ? [1] : [] + content { + target_group { + arn = aws_lb_target_group.tg_1[0].arn + } + stickiness { + enabled = true + duration = local.alb_group_stickiness_duration + } + } + } } dynamic "condition" { @@ -80,8 +104,147 @@ resource "aws_lb_listener_rule" "alb" { Name = "${var.name}-rule-${each.key}" }) - # Ignore changes to action as the external deployment controller manages target group switching for blue/green - # This is a no-op for rolling deployments (nothing external modifies the action) + # The ECS deployment controller rewrites the forward action during + # native traffic-shift deployments; a no-op for rolling deployments. + lifecycle { + ignore_changes = [action] + } +} + +################################################################################ +# ALB Test (Green) Listener Rule +# +# Dedicated rule, created by default for ALB services, that routes test +# traffic to the alternate (green) target group (tg-2) during native +# traffic-shift deployments. It reuses the production listener and +# routing conditions (listener_rules[0]) but forwards to the green target +# group; the ECS deployment controller rewrites its forward action through +# the TEST_TRAFFIC_SHIFT lifecycle stages so the green revision can be +# validated before production traffic shifts, hence ignore_changes on +# action. 
Outside a deployment tg-2 is empty, so the rule has no targets until +# a deployment registers the green revision. +################################################################################ + +resource "aws_lb_listener_rule" "test" { + count = local.green_alb_listener_rule_enabled ? 1 : 0 + + listener_arn = var.load_balancer_attachment.listener_rules[0].listener_arn + # One slot ahead of the production rule so a request carrying the test + # selector matches this rule first; ALB routes by priority order, not by + # specificity, so without this it would fall through to production. + priority = local.green_test_priority + + # Same group-stickiness requirement as the production rule: ECS + # rewrites this rule's forward action through the TEST_TRAFFIC_SHIFT + # stages, and ELBv2 rejects the rewrite when the sticky target groups + # are referenced without group-level stickiness on the action. + action { + type = "forward" + target_group_arn = local.alb_group_stickiness_enabled ? null : aws_lb_target_group.tg_2[0].arn + + dynamic "forward" { + for_each = local.alb_group_stickiness_enabled ? [1] : [] + content { + target_group { + arn = aws_lb_target_group.tg_2[0].arn + } + stickiness { + enabled = true + duration = local.alb_group_stickiness_duration + } + } + } + } + + # Distinguishing condition: only requests carrying the configured test + # selector reach the green target group. The selector is a header or a + # query string (test_traffic_condition_type) — ALB AND-combines all + # conditions on a rule and ECS native blue/green drives exactly one test + # rule, so it is one type per service, not both at once. Combined with the + # mirrored production conditions below, normal traffic still matches + # production. + dynamic "condition" { + for_each = var.test_traffic_condition_type == "header" ?
[1] : [] + content { + http_header { + http_header_name = var.test_header_name + values = [var.test_header_value] + } + } + } + + dynamic "condition" { + for_each = var.test_traffic_condition_type == "query-string" ? [1] : [] + content { + query_string { + key = var.test_query_string_key + value = var.test_query_string_value + } + } + } + + dynamic "condition" { + for_each = [for c in var.load_balancer_attachment.listener_rules[0].conditions : c if c.type == "path-pattern"] + content { + path_pattern { + values = condition.value.values + } + } + } + + dynamic "condition" { + for_each = [for c in var.load_balancer_attachment.listener_rules[0].conditions : c if c.type == "host-header"] + content { + host_header { + values = condition.value.values + } + } + } + + dynamic "condition" { + for_each = [for c in var.load_balancer_attachment.listener_rules[0].conditions : c if c.type == "http-header"] + content { + http_header { + http_header_name = condition.value.values[0] + values = slice(condition.value.values, 1, length(condition.value.values)) + } + } + } + + dynamic "condition" { + for_each = [for c in var.load_balancer_attachment.listener_rules[0].conditions : c if c.type == "http-request-method"] + content { + http_request_method { + values = condition.value.values + } + } + } + + dynamic "condition" { + for_each = [for c in var.load_balancer_attachment.listener_rules[0].conditions : c if c.type == "query-string"] + content { + query_string { + key = try(condition.value.values[0], null) + value = try(condition.value.values[1], condition.value.values[0]) + } + } + } + + dynamic "condition" { + for_each = [for c in var.load_balancer_attachment.listener_rules[0].conditions : c if c.type == "source-ip"] + content { + source_ip { + values = condition.value.values + } + } + } + + tags = merge(local.tags, { + Name = "${var.name}-test-rule" + }) + + # The ECS deployment controller rewrites the forward action during + # native traffic-shift deployments (TEST_TRAFFIC_SHIFT 
stages). lifecycle { ignore_changes = [action] } @@ -89,8 +252,9 @@ resource "aws_lb_listener_rule" "alb" { ################################################################################ # NLB Listeners -# For NLB, we create the listener directly (no listener rules in NLB) -# For blue/green deployments, an external controller manages target group switching +# For NLB, we create the listener directly (no listener rules in NLB). +# The ECS deployment controller rewrites the default action during +# native traffic-shift deployments. ################################################################################ resource "aws_lb_listener" "nlb" { @@ -106,20 +270,16 @@ resource "aws_lb_listener" "nlb" { alpn_policy = var.load_balancer_attachment.nlb_listener.protocol == "TLS" ? var.load_balancer_attachment.nlb_listener.alpn_policy : null default_action { - type = "forward" - target_group_arn = ( - var.deployment_type == "blue_green" - ? aws_lb_target_group.tg_1[0].arn - : aws_lb_target_group.this[0].arn - ) + type = "forward" + target_group_arn = aws_lb_target_group.tg_1[0].arn } tags = merge(local.tags, { Name = "${var.name}-nlb-listener" }) - # Ignore changes to default_action as the external deployment controller manages target group switching for blue/green - # This is a no-op for rolling deployments (nothing external modifies the action) + # The ECS deployment controller rewrites the default action during + # native traffic-shift deployments; a no-op for rolling deployments. lifecycle { ignore_changes = [default_action] } diff --git a/compute/ecs_service/locals.tf b/compute/ecs_service/locals.tf index 7e3debd..2afde47 100644 --- a/compute/ecs_service/locals.tf +++ b/compute/ecs_service/locals.tf @@ -15,8 +15,26 @@ locals { tags = merge(local.default_tags, var.tags) - # Determine deployment controller type - deployment_controller_type = var.deployment_type == "blue_green" ? 
"CODE_DEPLOY" : "ECS" + # Every strategy runs on the native ECS deployment controller — the + # blue_green / linear / canary traffic shifts are executed by ECS + # itself (deployment_configuration.strategy), not CodeDeploy. + deployment_controller_type = "ECS" + + # Strategies that run the ECS controller's traffic-shift state machine + # over two target groups (production + alternate). Only used to seed + # deployment_configuration at create time — the target-group pair, + # infrastructure role, and advanced_configuration are provisioned for + # every load-balanced service so the strategy can change per + # deployment without Terraform changes. + is_native_traffic_shift = contains(["blue_green", "linear", "canary"], var.deployment_type) + + # Map the module's strategy name to the AWS deploymentConfiguration enum. + deployment_strategy = { + rolling = "ROLLING" + blue_green = "BLUE_GREEN" + linear = "LINEAR" + canary = "CANARY" + }[var.deployment_type] # Determine if load balancer is configured enable_load_balancer = var.load_balancer_attachment != null && var.load_balancer_attachment.enabled @@ -24,6 +42,60 @@ locals { # Determine if NLB listener should be created (vs ALB listener rules) enable_nlb_listener = local.enable_load_balancer && var.load_balancer_attachment.nlb_listener != null + # Determine if a dedicated test (green) ALB listener rule should be + # created. Drives the advanced_configuration.test_listener_rule wiring + # and the TEST_TRAFFIC_SHIFT lifecycle stages on native traffic-shift + # deploys. ALB-only — requires a production listener rule to mirror; a + # no-op for NLB services. 
+ green_alb_listener_rule_enabled = ( + local.enable_load_balancer + && !local.enable_nlb_listener + && var.green_alb_listener_rule_enabled + && length(var.load_balancer_attachment.listener_rules) > 0 + ) + + # When the green rule is enabled the module owns both priorities so the + # test rule (production conditions + the configured test selector) is always + # evaluated before the production rule — otherwise ALB, which routes by + # priority order and not specificity, would match production first and a + # test request would never reach green. The production rule's + # priority becomes the base (its configured priority, else the default + # below) and the test rule sits one slot ahead at base - 1. Both numbers + # must be unique across all rules on a shared listener, and the base must be + # at least 2 so base - 1 stays a valid ALB priority (minimum 1). Set an explicit + # listener rule priority per service when several green services share a listener. + green_default_production_priority = 1000 + green_production_priority = local.green_alb_listener_rule_enabled ? coalesce( + var.load_balancer_attachment.listener_rules[0].priority, + local.green_default_production_priority, + ) : null + green_test_priority = local.green_alb_listener_rule_enabled ? local.green_production_priority - 1 : null + + # ARN passed to advanced_configuration.test_listener_rule and exported: + # the module-created rule when configured, else an externally-managed + # rule ARN supplied by the caller, else null. + test_listener_rule_arn = local.green_alb_listener_rule_enabled ?
aws_lb_listener_rule.test[0].arn : var.test_listener_rule_arn + + # ALB rules whose forward action ECS rewrites during native + # traffic-shift deployments must carry group-level stickiness when the + # target groups have target-level stickiness: ELBv2 rejects a + # multi-target-group forward referencing a sticky target group unless + # the action itself has TargetGroupStickinessConfig enabled ("You must + # enable group stickiness on a rule if you enabled target stickiness + # on one of its target groups"), which fails the deployment's + # PRE_SCALE_UP stage. ALB-only — NLB listeners forward to one target + # group at a time. + alb_group_stickiness_enabled = ( + local.enable_load_balancer + && !local.enable_nlb_listener + && var.load_balancer_attachment.target_group.stickiness != null + && var.load_balancer_attachment.target_group.stickiness.enabled + ) + # Reuse the target-group cookie duration so a client pinned to the + # blue or green group stays pinned for the same window as its + # in-group target pinning. + alb_group_stickiness_duration = local.alb_group_stickiness_enabled ? 
var.load_balancer_attachment.target_group.stickiness.cookie_duration : null + # Placeholder container name and port placeholder_container_name = "app" placeholder_container_port = var.container_port @@ -110,4 +182,3 @@ locals { # Service discovery settings enable_service_discovery = var.service_discovery != null } - diff --git a/compute/ecs_service/moved.tf b/compute/ecs_service/moved.tf new file mode 100644 index 0000000..923b328 --- /dev/null +++ b/compute/ecs_service/moved.tf @@ -0,0 +1,8 @@ +################################################################################ +# State Migrations +################################################################################ + +moved { + from = aws_lb_target_group.this[0] + to = aws_lb_target_group.tg_1[0] +} diff --git a/compute/ecs_service/outputs.tf b/compute/ecs_service/outputs.tf index 8f3d97f..3c397ce 100644 --- a/compute/ecs_service/outputs.tf +++ b/compute/ecs_service/outputs.tf @@ -85,62 +85,64 @@ output "security_group_arn" { } ################################################################################ -# Target Groups - Rolling Deployment +# Target Groups +# +# A production (tg-1) + alternate (tg-2) pair always exists when a load +# balancer is attached, so the deployment strategy can change per +# deployment without Terraform changes. Rolling deployments only ever +# use the production target group. ################################################################################ output "target_group_arn" { - description = "The ARN of the target group (null if load balancer disabled or blue/green deployment)." - value = local.enable_load_balancer && var.deployment_type == "rolling" ? aws_lb_target_group.this[0].arn : null + description = "The ARN of the production target group the service serves from (alias of production_target_group_arn; null if load balancer disabled)." + value = local.enable_load_balancer ? 
aws_lb_target_group.tg_1[0].arn : null } output "target_group_arn_suffix" { - description = "The ARN suffix of the target group for CloudWatch metrics." - value = local.enable_load_balancer && var.deployment_type == "rolling" ? aws_lb_target_group.this[0].arn_suffix : null + description = "The ARN suffix of the production target group for CloudWatch metrics." + value = local.enable_load_balancer ? aws_lb_target_group.tg_1[0].arn_suffix : null } output "target_group_name" { - description = "The name of the target group." - value = local.enable_load_balancer && var.deployment_type == "rolling" ? aws_lb_target_group.this[0].name : null + description = "The name of the production target group the service serves from (alias of production_target_group_name; null if load balancer disabled)." + value = local.enable_load_balancer ? aws_lb_target_group.tg_1[0].name : null } -################################################################################ -# Target Groups - Blue/Green Deployment -################################################################################ +output "production_target_group_arn" { + description = "The ARN of the production target group (null if load balancer disabled)." + value = local.enable_load_balancer ? aws_lb_target_group.tg_1[0].arn : null +} -output "blue_target_group_arn" { - description = "The ARN of the blue target group (null if not blue/green deployment)." - value = local.enable_load_balancer && var.deployment_type == "blue_green" ? aws_lb_target_group.tg_1[0].arn : null +output "production_target_group_name" { + description = "The name of the production target group." + value = local.enable_load_balancer ? aws_lb_target_group.tg_1[0].name : null } -output "blue_target_group_name" { - description = "The name of the blue target group." - value = local.enable_load_balancer && var.deployment_type == "blue_green" ? 
aws_lb_target_group.tg_1[0].name : null +output "alternate_target_group_arn" { + description = "The ARN of the alternate target group ECS shifts traffic to during native traffic-shift deployments (null if load balancer disabled)." + value = local.enable_load_balancer ? aws_lb_target_group.tg_2[0].arn : null } -output "green_target_group_arn" { - description = "The ARN of the green target group (null if not blue/green deployment)." - value = local.enable_load_balancer && var.deployment_type == "blue_green" ? aws_lb_target_group.tg_2[0].arn : null +output "alternate_target_group_name" { + description = "The name of the alternate target group." + value = local.enable_load_balancer ? aws_lb_target_group.tg_2[0].name : null } -output "green_target_group_name" { - description = "The name of the green target group." - value = local.enable_load_balancer && var.deployment_type == "blue_green" ? aws_lb_target_group.tg_2[0].name : null +output "target_group_arns" { + description = "Map of all target group ARNs created by this module." + value = local.enable_load_balancer ? { + production = aws_lb_target_group.tg_1[0].arn + alternate = aws_lb_target_group.tg_2[0].arn + } : {} } ################################################################################ -# Combined Target Group Outputs (for convenience) +# ECS Infrastructure Role ################################################################################ -output "target_group_arns" { - description = "Map of all target group ARNs created by this module." - value = local.enable_load_balancer ? ( - var.deployment_type == "rolling" ? { - primary = aws_lb_target_group.this[0].arn - } : { - blue = aws_lb_target_group.tg_1[0].arn - green = aws_lb_target_group.tg_2[0].arn - } - ) : {} +output "ecs_infrastructure_role_arn" { + description = "The ARN of the IAM role ECS assumes to manage load-balancer wiring during native traffic-shift deployments (null if load balancer disabled)." + value = local.enable_load_balancer ? 
aws_iam_role.ecs_infrastructure[0].arn : null } ################################################################################ @@ -157,6 +159,18 @@ output "nlb_listener_arn" { value = local.enable_nlb_listener ? aws_lb_listener.nlb[0].arn : null } +output "production_listener_rule_arn" { + description = "ARN of the production listener rule (ALB) or listener (NLB) the ECS deployment controller rewrites during native traffic-shift deployments. This is the value the deploy manager passes as advanced_configuration.production_listener_rule on UpdateService (null if load balancer disabled)." + value = local.enable_load_balancer ? ( + local.enable_nlb_listener ? aws_lb_listener.nlb[0].arn : aws_lb_listener_rule.alb["0"].arn + ) : null +} + +output "test_listener_rule_arn" { + description = "ARN of the test listener rule the ECS deployment controller rewrites during the TEST_TRAFFIC_SHIFT lifecycle stages, routing test traffic to the green revision before the production cutover. The deploy manager passes it as advanced_configuration.test_listener_rule on UpdateService. Null when no module-created or externally-managed test listener rule is configured." + value = local.test_listener_rule_arn +} + ################################################################################ # Auto Scaling ################################################################################ @@ -252,4 +266,3 @@ output "region" { description = "The AWS region where the resources are deployed." value = local.region } - diff --git a/compute/ecs_service/rvn-ecs-web-definition.yml b/compute/ecs_service/rvn-ecs-web-definition.yml index 8d86710..483dc09 100644 --- a/compute/ecs_service/rvn-ecs-web-definition.yml +++ b/compute/ecs_service/rvn-ecs-web-definition.yml @@ -3,8 +3,8 @@ definition: name: ECS Web Server description: Web server ECS service for running an HTTP application behind an ECS cluster load balancer. 
release: - version: 0.5.1 - description: Share ECS service config and desired task count behavior + version: 0.6.0 + description: Add native ECS blue/green, linear, and canary deployments with per-deploy strategy controls, manual approval gates, standby validation traffic, production/alternate target groups, ALB group stickiness guidance, ECS infrastructure role, and load-balancer advanced configuration outputs. module: inputs: - id: section_cluster @@ -42,6 +42,139 @@ module: description: Recommended. Requires a NAT gateway or equivalent for internet access and a static IP. default: true - $include: ../../partials/inputs/ecs-service-build-inputs.yml + - id: section_deployment + label: Deployment + type: section + - id: deployment_strategy + label: Deployment strategy + type: string + description: Choose how traffic moves from the current deployment to the new deployment. + required: true + default: rolling + values: + - label: Rolling + description: Replace tasks in place while the service keeps serving from the production target group. + value: rolling + - label: Blue/green + description: Start the new deployment on the alternate target group, then shift production traffic after test validation. + value: blue_green + - label: Linear + description: Shift production traffic to the new deployment in equal percentage steps with a wait between steps. + value: linear + - label: Canary + description: Send a small percentage of production traffic to the new deployment first, then shift the rest after the canary bake time. + value: canary + - id: deployment_bake_time_in_minutes + label: Bake time in minutes + type: number + description: Minutes to keep the current and new deployments running after production traffic has fully shifted, before the old deployment is removed. 
+ max: 1440 + min: 0 + show_when: + deployment_strategy: + - blue_green + - linear + - canary + default: 10 + - id: linear_step_percentage + label: Linear step percentage + type: number + description: Percentage of production traffic to move to the new deployment at each linear step. + max: 100 + min: 1 + show_when: + deployment_strategy: linear + default: 20 + - id: linear_step_bake_time_in_minutes + label: Linear step bake time in minutes + type: number + description: Minutes to wait between linear traffic steps before shifting the next percentage. + max: 1440 + min: 0 + show_when: + deployment_strategy: linear + default: 5 + - id: canary_percent + label: Canary percent + type: number + description: Percentage of production traffic to send to the new deployment during the canary phase. + max: 100 + min: 1 + show_when: + deployment_strategy: canary + default: 5 + - id: canary_bake_time_in_minutes + label: Canary bake time in minutes + type: number + description: Minutes to hold canary traffic before shifting the remaining production traffic to the new deployment. + max: 1440 + min: 0 + show_when: + deployment_strategy: canary + default: 10 + - id: deployment_pause_stages + label: Manual approval gates + type: object_array + description: Optional gates that pause the deployment at chosen lifecycle stages until you approve it. Applies only to the blue/green, linear, and canary strategies. + collapsible: true + item_inputs: + - id: stage + label: Stage + type: string + description: Deployment lifecycle stage at which to pause and wait for manual approval. + default: POST_TEST_TRAFFIC_SHIFT + values: + - label: Reconcile service + description: Pause during service reconciliation, before ECS finalizes the deployment. + value: RECONCILE_SERVICE + - label: Pre scale up + description: Pause before the new task set scales up. + value: PRE_SCALE_UP + - label: Post scale up + description: Pause after the new task set has scaled up and become healthy. 
+ value: POST_SCALE_UP + - label: Post test traffic shift + description: Pause after test traffic is shifted to the new (green) tasks, so you can validate them before production traffic moves. + value: POST_TEST_TRAFFIC_SHIFT + - label: Pre production traffic shift + description: Pause before production traffic begins shifting to the new tasks. + value: PRE_PRODUCTION_TRAFFIC_SHIFT + - label: Post production traffic shift + description: Pause after production traffic has fully shifted to the new tasks, before the old tasks are removed. + value: POST_PRODUCTION_TRAFFIC_SHIFT + - id: timeout_in_minutes + label: Timeout (minutes) + type: number + description: Minutes to wait for manual approval before the timeout action runs. Matches the AWS default of 1,440 minutes (24 hours). + max: 20160 + min: 1 + required: true + default: 1440 + - id: timeout_action + label: On timeout + type: string + description: Action to take if approval is not given before the timeout elapses. Defaults to rolling back. + required: true + default: ROLLBACK + values: + - label: Roll back + description: Roll the deployment back to the previous revision. + value: ROLLBACK + - label: Continue + description: Proceed with the deployment as if it were approved. + value: CONTINUE + item_label: Pause stage + item_title_field: stage + item_description: "Timeout: {timeout_in_minutes} minutes / {timeout_action}" + show_when: + deployment_strategy: + - blue_green + - linear + - canary + default: + - stage: POST_TEST_TRAFFIC_SHIFT + timeout_in_minutes: 1440 + timeout_action: ROLLBACK - id: section_health label: Health check type: section @@ -118,7 +251,7 @@ module: - id: target_group_stickiness_enabled label: Sticky sessions type: boolean - description: Enable load balancer cookie stickiness so repeat requests are routed to the same task when possible. + description: Enable load balancer cookie stickiness so repeat requests are routed to the same task when possible. 
When enabled, traffic-shift deployments also keep clients on the first production or alternate target group they reach. collapsible: true default: true - id: target_group_stickiness_type @@ -257,6 +390,46 @@ module: deploy: $merge: - ../../partials/deploy/ecs-service-deploy-common.yml + strategy: | + << + module.input.deployment_strategy == "rolling" ? nil : + module.input.deployment_strategy == "blue_green" ? { + "type": "blue_green", + "bake_time_in_minutes": module.input.deployment_bake_time_in_minutes, + "pause_stages": module.input.deployment_pause_stages + } : + module.input.deployment_strategy == "linear" ? { + "type": "linear", + "bake_time_in_minutes": module.input.deployment_bake_time_in_minutes, + "pause_stages": module.input.deployment_pause_stages, + "linear": { + "step_percentage": module.input.linear_step_percentage, + "step_bake_time_in_minutes": module.input.linear_step_bake_time_in_minutes + } + } : { + "type": "canary", + "bake_time_in_minutes": module.input.deployment_bake_time_in_minutes, + "pause_stages": module.input.deployment_pause_stages, + "canary": { + "canary_percent": module.input.canary_percent, + "canary_bake_time_in_minutes": module.input.canary_bake_time_in_minutes + } + } + >> + infrastructure: + ecs_cluster_arn: <> + ecs_service_arns: + - <> + ecs_target_group_arn: <> + # Load-balancer advanced configuration for native traffic-shift + # strategies (blue_green / linear / canary). The deploy manager + # attaches these to UpdateService loadBalancers[].advancedConfiguration + # so ECS can shift traffic between the production and alternate target + # groups. Null/absent for non-load-balanced services (rolling only). + ecs_alternate_target_group_arn: <> + ecs_production_listener_rule_arn: <> + ecs_infrastructure_role_arn: <> + ecs_test_listener_rule_arn: <> task_definition: $merge: - ../../partials/deploy/ecs-service-task-definition-common.yml @@ -498,7 +671,7 @@ module: ## Deployment - Deployment type defaults to Rolling. 
Blue/green is available when you need separate target groups for CodeDeploy-style traffic shifting. + Deployment strategy defaults to Rolling. Blue/green, Linear, and Canary use the native ECS traffic-shift controller with production and alternate target groups. The deployment strategy can change between deploys without changing Terraform infrastructure. Deployments update the ECS service task definition with the selected image, generated container definition, awslogs configuration, runtime platform, environment variables, secrets, and capacity-provider-compatible task settings. @@ -516,6 +689,53 @@ module: Hook-specific environment variables are appended to the app container override for the one-off task. Optional hook CPU, memory, ephemeral storage, and timeout settings let release tasks use different resources from the web service without changing the steady-state app task. + ## Standby validation traffic + + For Blue/green, Linear, and Canary deployments, Ravion creates a test listener rule on the same Application Load Balancer listener as the production rule. During the test traffic stage, requests that match the service's normal Domain host rules and Path rules and include the standby selector route to the standby, or green, task set on the alternate target group. + + By default, the standby selector is the query parameter `__x-rvn-test__=1`. For example, if production traffic uses `https://app.example.com/health`, validate the standby service with: + + ```text + https://app.example.com/health?__x-rvn-test__=1 + ``` + + The alternate target group only has registered targets while ECS is running a traffic-shift deployment. Outside that window, the standby route may have no healthy targets. + + Use Advanced Terraform variables to override the standby selector. Values in Advanced Terraform variables override the generated Terraform variables for the service. 
+ + To use a different query parameter: + + ```json + { + "test_query_string_key": "preview", + "test_query_string_value": "green" + } + ``` + + Then validate standby traffic with `?preview=green`. + + To use an HTTP header instead of a query parameter: + + ```json + { + "test_traffic_condition_type": "header", + "test_header_name": "X-Ravion-Test", + "test_header_value": "1" + } + ``` + + Then send requests with `X-Ravion-Test: 1`. A service can use either the query-string selector or the header selector, not both at once. + + ## Sticky sessions and traffic shifts + + The module definition sets Sticky sessions to true by default. When that setting is enabled, Ravion configures target-group stickiness for the service's production and alternate target groups, so the load balancer keeps repeat requests on the same task when possible. With the default Load balancer cookie stickiness type, the target-level cookie is `AWSALB`; AWS may also set `AWSALBCORS` for CORS support. With Application cookie stickiness, the target-level cookie is the configured Application cookie name, and AWS may also set `AWSALBAPP-*` cookies. + + During Blue/green, Linear, and Canary deployments, Ravion also enables ALB group stickiness on the production and standby listener rule forward actions when Sticky sessions is enabled. A client that first reaches the production target group or the alternate target group keeps using that same group for the stickiness cookie duration, even while the deployment's weighted traffic shift changes for new clients. + + The ALB group stickiness cookie is `AWSALBTG`. For CORS requests, AWS may also set `AWSALBTGCORS`. Application cookie name applies only to `app_cookie` stickiness inside the selected target group; the ALB group stickiness cookie name is managed by AWS and cannot be changed. 
+ + To clear stickiness for a browser, open the browser developer tools, go to Application or Storage > Cookies for the service domain, and delete the relevant cookies: `AWSALBTG` and `AWSALBTGCORS` for ALB group stickiness; `AWSALB` and `AWSALBCORS` for the default target-level stickiness; and the configured Application cookie name plus any `AWSALBAPP-*` cookies when Application cookie stickiness is selected. For API clients, remove those cookie names from the cookie jar or stop sending them in the `Cookie` header. The next request can then enter the current traffic split like a new client. If Sticky sessions is turned off, Ravion does not enable target-group stickiness or ALB group stickiness for this behavior. + ## Builder settings Builder settings apply to Nixpacks and Dockerfile builds. @@ -545,6 +765,7 @@ module: | Start command | No | [] | Command arguments that override a prebuilt image default CMD | | Container port | Yes | 80 | Port exposed by the app container | | Health check path | Yes | / | HTTP path used by the target group health check | + | Sticky sessions | No | true | Keep clients on the same task, and on the same traffic-shift target group when enabled | | Domain host rules | No | - | Hostnames such as app.example.com or *.example.com | | Path rules | No | - | Path patterns such as /*, /api/*, or /app/* | | Capacity provider | Yes | fargate | Primary service capacity provider | @@ -569,7 +790,7 @@ module: | Post-deploy timeout (secs) | No | 1800 | Maximum post-deploy task wait time | | Autoscaling | No | true | Enable CPU and optional memory target tracking | | Desired tasks | Yes* | 1 | Number of web tasks when autoscaling is disabled | - | Deployment type | No | rolling | Rolling or blue/green deployment infrastructure | + | Deployment strategy | Yes | rolling | Rolling, Blue/green, Linear, or Canary | | Tags | No | Standard Ravion tags | Additional tags applied to resources | | Advanced Terraform variables | No | {} | Raw lower-level 
overrides for exceptional cases | | OpenTofu version override | No | Ravion default | Override the OpenTofu version for the stack | diff --git a/compute/ecs_service/target_groups.tf b/compute/ecs_service/target_groups.tf index 44a3bdf..e0185e5 100644 --- a/compute/ecs_service/target_groups.tf +++ b/compute/ecs_service/target_groups.tf @@ -1,56 +1,17 @@ ################################################################################ -# Target Groups - Rolling Deployment -################################################################################ - -resource "aws_lb_target_group" "this" { - count = local.enable_load_balancer && var.deployment_type == "rolling" ? 1 : 0 - - name = "${substr(var.name, 0, min(length(var.name), 28))}-tg" - port = var.load_balancer_attachment.target_group.port - protocol = var.load_balancer_attachment.target_group.protocol - vpc_id = var.vpc_id - target_type = var.load_balancer_attachment.target_group.target_type - - deregistration_delay = var.load_balancer_attachment.target_group.deregistration_delay - slow_start = contains(["HTTP", "HTTPS"], var.load_balancer_attachment.target_group.protocol) ? var.load_balancer_attachment.target_group.slow_start : null - - health_check { - enabled = var.load_balancer_attachment.target_group.health_check.enabled - path = contains(["HTTP", "HTTPS"], var.load_balancer_attachment.target_group.protocol) ? var.load_balancer_attachment.target_group.health_check.path : null - port = var.load_balancer_attachment.target_group.health_check.port - protocol = coalesce(var.load_balancer_attachment.target_group.health_check.protocol, var.load_balancer_attachment.target_group.protocol) - matcher = contains(["HTTP", "HTTPS"], var.load_balancer_attachment.target_group.protocol) ? 
var.load_balancer_attachment.target_group.health_check.matcher : null - interval = var.load_balancer_attachment.target_group.health_check.interval - timeout = var.load_balancer_attachment.target_group.health_check.timeout - healthy_threshold = var.load_balancer_attachment.target_group.health_check.healthy_threshold - unhealthy_threshold = var.load_balancer_attachment.target_group.health_check.unhealthy_threshold - } - - dynamic "stickiness" { - for_each = var.load_balancer_attachment.target_group.stickiness != null ? [var.load_balancer_attachment.target_group.stickiness] : [] - content { - enabled = stickiness.value.enabled - type = stickiness.value.type - cookie_duration = contains(["HTTP", "HTTPS"], var.load_balancer_attachment.target_group.protocol) ? stickiness.value.cookie_duration : null - cookie_name = contains(["HTTP", "HTTPS"], var.load_balancer_attachment.target_group.protocol) ? stickiness.value.cookie_name : null - } - } - - tags = merge(local.tags, { - Name = "${var.name}-tg" - }) - - lifecycle { - create_before_destroy = true - } -} - -################################################################################ -# Target Groups - Blue/Green Deployment +# Target Groups +# +# A production (tg-1) + alternate (tg-2) pair is always created when a +# load balancer is attached, regardless of deployment strategy. This +# keeps the deployment strategy a pure per-deployment decision: the +# deploy manager can switch between rolling / blue_green / linear / +# canary on any UpdateService call without Terraform changes. Rolling +# deployments simply serve from the production target group and never +# touch the alternate. ################################################################################ resource "aws_lb_target_group" "tg_1" { - count = local.enable_load_balancer && var.deployment_type == "blue_green" ? 1 : 0 + count = local.enable_load_balancer ? 
1 : 0 name = "${substr(var.name, 0, min(length(var.name), 24))}-tg-1" port = var.load_balancer_attachment.target_group.port @@ -90,11 +51,18 @@ resource "aws_lb_target_group" "tg_1" { lifecycle { create_before_destroy = true + # Re-adopting a pre-existing target group via the moved block (old name + # suffix `-tg`) must not force replacement just because the configured + # name is now `-tg-1`: the listener rule ignores `action`, so it would + # never repoint to the replacement and the old TG's destroy would fail + # ("currently in use by a listener rule"). Ignoring `name` keeps the + # existing TG (and its ARN) in place; fresh services still get `-tg-1`. + ignore_changes = [name] } } resource "aws_lb_target_group" "tg_2" { - count = local.enable_load_balancer && var.deployment_type == "blue_green" ? 1 : 0 + count = local.enable_load_balancer ? 1 : 0 name = "${substr(var.name, 0, min(length(var.name), 24))}-tg-2" port = var.load_balancer_attachment.target_group.port @@ -136,4 +104,3 @@ resource "aws_lb_target_group" "tg_2" { create_before_destroy = true } } - diff --git a/compute/ecs_service/tests/basic.tftest.hcl b/compute/ecs_service/tests/basic.tftest.hcl index c33f6ea..808313f 100644 --- a/compute/ecs_service/tests/basic.tftest.hcl +++ b/compute/ecs_service/tests/basic.tftest.hcl @@ -2,8 +2,63 @@ # Basic ECS Service Module Tests ################################################################################ -# Mock provider for testing -mock_provider "aws" {} +# Mock provider for testing. +# aws_iam_policy_document data sources need explicit json overrides — +# the mock provider's generated string is not valid JSON and fails the +# provider-side assume_role_policy validation at plan time. 
+mock_provider "aws" { + mock_data "aws_iam_policy_document" { + defaults = { + json = "{\"Version\":\"2012-10-17\",\"Statement\":[]}" + } + } + mock_data "aws_partition" { + defaults = { + partition = "aws" + } + } + mock_data "aws_region" { + defaults = { + id = "us-east-1" + name = "us-east-1" + } + } + mock_data "aws_caller_identity" { + defaults = { + account_id = "123456789012" + } + } + mock_data "aws_vpc" { + defaults = { + cidr_block = "10.0.0.0/16" + } + } + + # Computed ARNs must look like real ARNs to pass provider-side + # validation on referencing resources (task definition, listener + # rules, advanced_configuration). + mock_resource "aws_iam_role" { + defaults = { + arn = "arn:aws:iam::123456789012:role/mock-role" + } + } + mock_resource "aws_lb_target_group" { + defaults = { + arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:targetgroup/mock-tg/1234567890123456" + arn_suffix = "targetgroup/mock-tg/1234567890123456" + } + } + mock_resource "aws_lb_listener_rule" { + defaults = { + arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener-rule/app/mock-alb/1234567890123456/1234567890123456/1234567890123456" + } + } + mock_resource "aws_lb_listener" { + defaults = { + arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/net/mock-nlb/1234567890123456/1234567890123456" + } + } +} ################################################################################ # Variables for Tests @@ -44,7 +99,7 @@ run "basic_service" { } assert { - condition = module.security_group.aws_security_group.this.vpc_id == "vpc-12345678" + condition = module.security_group.security_group_vpc_id == "vpc-12345678" error_message = "Security group should be in the correct VPC" } } @@ -75,19 +130,35 @@ run "service_with_load_balancer" { } assert { - condition = length(aws_lb_target_group.this) == 1 - error_message = "Should create one target group for rolling deployment" + condition = length(aws_lb_target_group.tg_1) == 1 && 
length(aws_lb_target_group.tg_2) == 1 + error_message = "Should create the production + alternate target group pair whenever a load balancer is attached" } assert { - condition = aws_lb_target_group.this[0].port == 8080 + condition = aws_lb_target_group.tg_1[0].port == 8080 && aws_lb_target_group.tg_2[0].port == 8080 error_message = "Target group port should be 8080" } assert { - condition = aws_lb_target_group.this[0].protocol == "HTTP" + condition = aws_lb_target_group.tg_1[0].protocol == "HTTP" error_message = "Target group protocol should be HTTP" } + + assert { + condition = length(aws_iam_role.ecs_infrastructure) == 1 + error_message = "Should create the ECS infrastructure role whenever a load balancer is attached" + } + + assert { + condition = aws_iam_role_policy_attachment.ecs_infrastructure_elb[0].policy_arn == "arn:aws:iam::aws:policy/AmazonECSInfrastructureRolePolicyForLoadBalancers" + error_message = "ECS infrastructure role should attach the documented AWS-managed load-balancer policy ARN" + } + + # Backward-compatible aliases for pre-traffic-shift callers. 
+ assert { + condition = output.target_group_name == output.production_target_group_name + error_message = "target_group_name should alias the production target group name output" + } } ################################################################################ @@ -115,13 +186,13 @@ run "service_with_load_balancer_auto_priority" { } assert { - condition = length(aws_lb_target_group.this) == 1 - error_message = "Should create one target group for rolling deployment" + condition = length(aws_lb_target_group.tg_1) == 1 && length(aws_lb_target_group.tg_2) == 1 + error_message = "Should create the production + alternate target group pair whenever a load balancer is attached" } assert { - condition = aws_lb_listener_rule.alb["0"].priority == null - error_message = "Priority should be null (auto-assigned by AWS)" + condition = var.load_balancer_attachment.listener_rules[0].priority == null + error_message = "Priority should default to null so AWS auto-assigns it" } } @@ -153,17 +224,250 @@ run "blue_green_deployment" { assert { condition = length(aws_lb_target_group.tg_1) == 1 - error_message = "Should create blue target group for blue/green deployment" + error_message = "Should create production target group for blue/green deployment" } assert { condition = length(aws_lb_target_group.tg_2) == 1 - error_message = "Should create green target group for blue/green deployment" + error_message = "Should create alternate target group for blue/green deployment" + } + + assert { + condition = length(aws_iam_role.ecs_infrastructure) == 1 + error_message = "Should create the ECS infrastructure role for native traffic-shift strategies" + } + + assert { + condition = length(aws_lb_listener_rule.alb["0"].action[0].forward) == 0 + error_message = "Without target-group stickiness the rule should use a plain forward (no group-stickiness block)" + } + + assert { + condition = length([for c in aws_lb_listener_rule.test[0].condition : c if length([for q in c.query_string : q if q.key 
== "__x-rvn-test__" && q.value == "1"]) > 0]) == 1 + error_message = "Test (green) rule should distinguish traffic by the __x-rvn-test__ query string by default" + } + + assert { + condition = length([for c in aws_lb_listener_rule.test[0].condition : c if length(c.http_header) > 0]) == 0 + error_message = "Default (query-string) selector should not emit an http_header condition on the test rule" + } +} + +################################################################################ +# Test: Header selector for the green test rule +# +# test_traffic_condition_type = "header" swaps the distinguishing test +# condition from the default query string to an HTTP header so requests +# carrying `test_header_name: test_header_value` reach the green target group. +################################################################################ + +run "green_rule_header_selector" { + command = plan + + variables { + deployment_type = "blue_green" + container_port = 8080 + test_traffic_condition_type = "header" + test_header_name = "X-Ravion-Test" + test_header_value = "1" + load_balancer_attachment = { + target_group = { + port = 8080 + protocol = "HTTP" + } + listener_rules = [{ + listener_arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/app/my-alb/1234567890123456/1234567890123456" + priority = 100 + conditions = [{ + type = "host-header" + values = ["api.example.com"] + }] + }] + } + } + + assert { + condition = length([for c in aws_lb_listener_rule.test[0].condition : c if length(c.http_header) > 0 && c.http_header[0].http_header_name == "X-Ravion-Test"]) == 1 + error_message = "Test (green) rule should distinguish traffic by the X-Ravion-Test header when selector is header" + } + + assert { + condition = length([for c in aws_lb_listener_rule.test[0].condition : c if length(c.query_string) > 0]) == 0 + error_message = "Header selector should not emit a query-string condition on the test rule" + } +} + 
+################################################################################ +# 
Test: Sticky target groups require group-level stickiness on the rules +# +# ECS rewrites the production/test rules into a weighted forward across +# tg-1 + tg-2 during traffic-shift deployments; ELBv2 rejects that +# rewrite at PRE_SCALE_UP when the target groups have target-level +# stickiness but the rule's forward action lacks group stickiness. +################################################################################ + +run "sticky_target_groups_enable_group_stickiness" { + command = plan + + variables { + deployment_type = "blue_green" + container_port = 8080 + green_alb_listener_rule_enabled = true + load_balancer_attachment = { + target_group = { + port = 8080 + protocol = "HTTP" + stickiness = { + enabled = true + type = "lb_cookie" + cookie_duration = 3600 + } + } + listener_rules = [{ + listener_arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/app/my-alb/1234567890123456/1234567890123456" + priority = 100 + conditions = [{ + type = "host-header" + values = ["api.example.com"] + }] + }] + } + } + + assert { + condition = aws_lb_listener_rule.alb["0"].action[0].target_group_arn == null + error_message = "Sticky target groups should switch the production rule to the expanded forward block" + } + + assert { + condition = aws_lb_listener_rule.alb["0"].action[0].forward[0].stickiness[0].enabled && aws_lb_listener_rule.alb["0"].action[0].forward[0].stickiness[0].duration == 3600 + error_message = "Production rule forward action should enable group stickiness with the target-group cookie duration" + } + + assert { + condition = aws_lb_listener_rule.test[0].action[0].forward[0].stickiness[0].enabled && aws_lb_listener_rule.test[0].action[0].forward[0].stickiness[0].duration == 3600 + error_message = "Test (green) rule forward action should enable group stickiness with the target-group cookie duration" + } +} + +################################################################################ +# Test: Native traffic-shift strategies reject 
multiple listener rules +# +# advanced_configuration accepts a single production listener rule, so +# ECS would only ever shift traffic on the first rule — additional +# rules would silently keep serving the old revision. +################################################################################ + +run "blue_green_rejects_multiple_listener_rules" { + command = plan + + variables { + deployment_type = "blue_green" + container_port = 8080 + load_balancer_attachment = { + target_group = { + port = 8080 + protocol = "HTTP" + } + listener_rules = [ + { + listener_arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/app/my-alb/1234567890123456/1234567890123456" + priority = 100 + conditions = [{ + type = "host-header" + values = ["api.example.com"] + }] + }, + { + listener_arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/app/my-alb/1234567890123456/1234567890123456" + priority = 101 + conditions = [{ + type = "host-header" + values = ["www.example.com"] + }] + }, + ] + } + } + + expect_failures = [aws_ecs_service.this] +} + +################################################################################ +# Test: Canary Deployment +################################################################################ + +run "canary_deployment" { + command = plan + + variables { + deployment_type = "canary" + container_port = 8080 + deployment_strategy_config = { + bake_time_in_minutes = 15 + canary = { + canary_percent = 10.0 + canary_bake_time_in_minutes = 5 + } + } + load_balancer_attachment = { + target_group = { + port = 8080 + protocol = "HTTP" + } + listener_rules = [{ + listener_arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/app/my-alb/1234567890123456/1234567890123456" + priority = 100 + conditions = [{ + type = "host-header" + values = ["api.example.com"] + }] + }] + } + } + + assert { + condition = length(aws_lb_target_group.tg_1) == 1 && length(aws_lb_target_group.tg_2) == 1 + error_message = "Should 
create production + alternate target groups for canary deployment" + } +} + +################################################################################ +# Test: Linear Deployment +################################################################################ + +run "linear_deployment" { + command = plan + + variables { + deployment_type = "linear" + container_port = 8080 + deployment_strategy_config = { + bake_time_in_minutes = 10 + linear = { + step_percent = 20.0 + step_bake_time_in_minutes = 5 + } + } + load_balancer_attachment = { + target_group = { + port = 8080 + protocol = "HTTP" + } + listener_rules = [{ + listener_arn = "arn:aws:elasticloadbalancing:us-east-1:123456789012:listener/app/my-alb/1234567890123456/1234567890123456" + priority = 100 + conditions = [{ + type = "host-header" + values = ["api.example.com"] + }] + }] + } } assert { - condition = length(aws_lb_target_group.this) == 0 - error_message = "Should not create single target group for blue/green deployment" + condition = length(aws_lb_target_group.tg_1) == 1 && length(aws_lb_target_group.tg_2) == 1 + error_message = "Should create production + alternate target groups for linear deployment" } } diff --git a/compute/ecs_service/variables.tf b/compute/ecs_service/variables.tf index ef754d7..015f848 100644 --- a/compute/ecs_service/variables.tf +++ b/compute/ecs_service/variables.tf @@ -286,15 +286,92 @@ variable "desired_count" { variable "deployment_type" { type = string - description = "The deployment type: 'rolling' (ECS) or 'blue_green' (CODE_DEPLOY)." + description = "Initial deployment strategy for direct Terraform use ('rolling', 'blue_green', 'linear', 'canary'). Ravion ECS Web stack provisioning passes 'rolling' and the Flightcontrol deploy manager passes the authoritative blue_green/linear/canary strategy on each UpdateService call, so strategy changes in Ravion do not require Terraform changes." 
default = "rolling" validation { - condition = contains(["rolling", "blue_green"], var.deployment_type) - error_message = "The deployment_type must be either 'rolling' or 'blue_green'." + condition = contains(["rolling", "blue_green", "linear", "canary"], var.deployment_type) + error_message = "The deployment_type must be one of: 'rolling', 'blue_green', 'linear', 'canary'." } } +variable "deployment_strategy_config" { + type = object({ + # Minutes both revisions keep running after production traffic has + # fully shifted, before the old revision is terminated. + bake_time_in_minutes = optional(number, 10) + + # Canary tuning — only used when deployment_type is 'canary'. + canary = optional(object({ + canary_percent = optional(number, 5.0) + canary_bake_time_in_minutes = optional(number, 10) + }), {}) + + # Linear tuning — only used when deployment_type is 'linear'. + linear = optional(object({ + step_percent = optional(number, 25.0) + step_bake_time_in_minutes = optional(number, 5) + }), {}) + }) + description = <<-EOT + Initial tuning for direct Terraform use with native traffic-shift + strategies (blue_green / linear / canary). Ravion ECS Web stack + provisioning uses rolling and the Flightcontrol deploy manager passes + the authoritative deploymentConfiguration (including pause lifecycle + hooks) on every UpdateService call, so post-create changes to these + values are ignored by Terraform (see ignore_changes on + aws_ecs_service.this). + EOT + default = {} +} + +variable "test_listener_rule_arn" { + type = string + description = "Optional ARN of an externally-managed ALB listener rule that routes test traffic for blue/green validation (drives the TEST_TRAFFIC_SHIFT lifecycle stages). Only used for native traffic-shift strategies when the module-created green listener rule is not enabled." 
+ default = null +} + +variable "green_alb_listener_rule_enabled" { + type = bool + description = "Create a dedicated ALB listener rule that routes test traffic to the green (alternate) target group during native traffic-shift deployments (blue_green/linear/canary), so the new revision can be validated before production traffic shifts. The rule reuses the production listener and routing conditions plus a distinguishing test selector (query string by default, or header when test_traffic_condition_type is \"header\") and forwards to the alternate target group; the ECS deployment controller rewrites it through the TEST_TRAFFIC_SHIFT lifecycle stages. Created by default; no effect for NLB services." + default = true +} + +variable "test_header_name" { + type = string + description = "HTTP header name that distinguishes test traffic for the green listener rule. Requests carrying this header (with test_header_value) match the green rule and reach the alternate target group; requests without it fall through to production. Only used when green_alb_listener_rule_enabled is true and test_traffic_condition_type is \"header\"." + default = "X-Ravion-Test" +} + +variable "test_header_value" { + type = string + description = "Value paired with test_header_name for routing test traffic to the green target group. Only used when green_alb_listener_rule_enabled is true and test_traffic_condition_type is \"header\"." + default = "1" +} + +variable "test_traffic_condition_type" { + type = string + description = "Which request attribute distinguishes test traffic for the green listener rule: \"header\" (matches test_header_name/test_header_value) or \"query-string\" (matches test_query_string_key/test_query_string_value). ALB AND-combines conditions within a single rule and ECS native blue/green wires exactly one test rule, so the selector is one type per service, not both at once. Only used when green_alb_listener_rule_enabled is true." 
+ default = "query-string" + + validation { + condition = contains(["header", "query-string"], var.test_traffic_condition_type) + error_message = "test_traffic_condition_type must be either \"header\" or \"query-string\"." + } +} + +variable "test_query_string_key" { + type = string + description = "Query-string key that distinguishes test traffic for the green listener rule (e.g. \"__x-rvn-test__\" matches ?__x-rvn-test__=...). Requests carrying this key/value match the green rule and reach the alternate target group; requests without it fall through to production. Only used when green_alb_listener_rule_enabled is true and test_traffic_condition_type is \"query-string\"." + default = "__x-rvn-test__" +} + +variable "test_query_string_value" { + type = string + description = "Value paired with test_query_string_key for routing test traffic to the green target group. Only used when green_alb_listener_rule_enabled is true and test_traffic_condition_type is \"query-string\"." + default = "1" +} + variable "deployment_minimum_healthy_percent" { type = number description = "The minimum healthy percent during deployment (rolling deployments only)." @@ -452,6 +529,16 @@ variable "load_balancer_attachment" { }) # ALB: Listener rules (attach to existing ALB listener) + # + # IMPORTANT: only the first rule is wired into the service's + # advanced_configuration as the production listener rule. Native + # traffic-shift deployments (blue_green/linear/canary) rewrite only + # that rule — traffic on any additional rules never shifts to the + # new revision. Terraform rejects >1 rule when deployment_type is a + # traffic-shift strategy, but because the strategy is a + # per-deployment decision on the native ECS controller, services + # that may ever deploy with a traffic-shift strategy must also keep + # to a single rule. 
listener_rules = optional(list(object({ listener_arn = string priority = optional(number, null) # null = AWS auto-assigns next available priority diff --git a/compute/ecs_service/versions.tf b/compute/ecs_service/versions.tf index bec739b..283914b 100644 --- a/compute/ecs_service/versions.tf +++ b/compute/ecs_service/versions.tf @@ -9,8 +9,10 @@ terraform { required_providers { aws = { - source = "hashicorp/aws" - version = ">= 6.0" + source = "hashicorp/aws" + # 6.21 adds linear_configuration / canary_configuration on the + # aws_ecs_service deployment_configuration block. + version = ">= 6.21" } } } diff --git a/test/ecs_service_test.go b/test/ecs_service_test.go index 6ec779f..6afee5e 100644 --- a/test/ecs_service_test.go +++ b/test/ecs_service_test.go @@ -4,6 +4,7 @@ package test import ( "testing" + "github.com/aws/aws-sdk-go-v2/aws" "github.com/flightcontrolhq/modules/test/helpers" "github.com/gruntwork-io/terratest/modules/terraform" "github.com/stretchr/testify/assert" @@ -168,6 +169,42 @@ func TestEcsServiceWithAlb(t *testing.T) { hasTargetGroup := helpers.EcsServiceHasTargetGroup(t, clusterArn, serviceName, targetGroupArn, awsRegion) assert.True(t, hasTargetGroup, "ECS service should be registered with the target group") + // The module wires load_balancer.advanced_configuration (alternate + // target group, production listener rule, infrastructure role) + // unconditionally — including for the rolling strategy used here, where + // CreateService carries no deployment_configuration. This asserts the + // real AWS API accepted that combination and persisted it on the + // service; if AWS ever rejected it, every rolling service with a load + // balancer (the module default) would fail to provision. 
+ alternateTargetGroupArn := terraform.Output(t, terraformOptions, "alternate_target_group_arn") + require.NotEmpty(t, alternateTargetGroupArn, "alternate_target_group_arn should not be empty") + + loadBalancers := helpers.GetEcsServiceLoadBalancers(t, clusterArn, serviceName, awsRegion) + require.Len(t, loadBalancers, 1, "ECS service should have exactly one load balancer attachment") + + advancedConfig := loadBalancers[0].AdvancedConfiguration + require.NotNil(t, advancedConfig, "load balancer advanced configuration should be set on a rolling service") + assert.Equal(t, alternateTargetGroupArn, aws.ToString(advancedConfig.AlternateTargetGroupArn), "alternate target group should match the module output") + assert.NotEmpty(t, aws.ToString(advancedConfig.ProductionListenerRule), "production listener rule should be set") + assert.NotEmpty(t, aws.ToString(advancedConfig.RoleArn), "infrastructure role should be set") + + // The production_listener_rule_arn output is what the deploy manager + // plumbs into UpdateService advancedConfiguration for native + // traffic-shift deployments, so it must match the rule AWS actually + // persisted on the service. + productionListenerRuleArn := terraform.Output(t, terraformOptions, "production_listener_rule_arn") + require.NotEmpty(t, productionListenerRuleArn, "production_listener_rule_arn should not be empty") + assert.Equal(t, productionListenerRuleArn, aws.ToString(advancedConfig.ProductionListenerRule), "production_listener_rule_arn output should match the rule on the service") + + // The fixture configures a dedicated test listener rule, so the module + // must create it, export its ARN, and wire it into the service's + // advanced_configuration.test_listener_rule — the value the deploy + // manager forwards to drive the TEST_TRAFFIC_SHIFT lifecycle stages. 
+ testListenerRuleArn := terraform.Output(t, terraformOptions, "test_listener_rule_arn") + require.NotEmpty(t, testListenerRuleArn, "test_listener_rule_arn should not be empty when a test listener rule is configured") + assert.NotEqual(t, productionListenerRuleArn, testListenerRuleArn, "test and production listener rules must be distinct") + assert.Equal(t, testListenerRuleArn, aws.ToString(advancedConfig.TestListenerRule), "test_listener_rule_arn output should match the rule on the service") + // Wait for targets to be registered in the target group // The ECS service needs time to register tasks with the target group t.Log("Waiting for targets to be registered with the target group...") diff --git a/test/fixtures/ecs_service/with_alb/main.tf b/test/fixtures/ecs_service/with_alb/main.tf index c5e1f8a..d955a3c 100644 --- a/test/fixtures/ecs_service/with_alb/main.tf +++ b/test/fixtures/ecs_service/with_alb/main.tf @@ -228,6 +228,26 @@ output "target_group_arn" { value = module.ecs_service.target_group_arn } +output "alternate_target_group_arn" { + description = "The ARN of the alternate target group ECS shifts traffic to during native traffic-shift deployments." + value = module.ecs_service.alternate_target_group_arn +} + +output "ecs_infrastructure_role_arn" { + description = "The ARN of the IAM role ECS assumes to manage load-balancer wiring during native traffic-shift deployments." + value = module.ecs_service.ecs_infrastructure_role_arn +} + +output "production_listener_rule_arn" { + description = "The ARN of the production listener rule wired into the service's advanced_configuration." + value = module.ecs_service.production_listener_rule_arn +} + +output "test_listener_rule_arn" { + description = "The ARN of the test listener rule wired into the service's advanced_configuration." + value = module.ecs_service.test_listener_rule_arn +} + output "alb_security_group_id" { description = "The ID of the ALB security group." 
value = module.ecs_cluster.public_alb_security_group_id