feat(phase-2): workstream 8 — Multi-Region Terraform Deployment

AWS environment:
- VPC (3-AZ, public + private subnets, NAT gateways, VPC endpoints for ECR/SM/CW)
- ECS Fargate service (sentryagent/agentidp) — secrets from Secrets Manager
- RDS PostgreSQL 14 (Multi-AZ, encrypted, VPC-internal, storage autoscaling)
- ElastiCache Redis 7 (primary + replica, at-rest + in-transit encryption)
- ALB with HTTPS/443, HTTP→HTTPS redirect, ACM certificate
- Route 53 alias record

GCP environment:
- VPC + private services access + Serverless VPC connector
- Cloud Run service — secrets from Secret Manager
- Cloud SQL PostgreSQL 14 (private IP, no public endpoint)
- Cloud Memorystore Redis 7 (VPC-internal, AUTH enabled)

Shared:
- 4 reusable modules: agentidp (dual AWS/GCP), rds, redis, lb
- No hardcoded secrets; all sensitive vars marked sensitive=true
- terraform.tfvars.example for both environments
- docs/devops/deployment.md — AWS + GCP step-by-step walkthrough, rollback procedures

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
SentryAgent.ai Developer
2026-03-29 06:25:14 +00:00
parent a504964e5f
commit 6913d62648
22 changed files with 4138 additions and 8 deletions

View File

@@ -0,0 +1,426 @@
################################################################################
# Module: agentidp
# Main — ECS Fargate (AWS) or Cloud Run (GCP)
#
# Deploys the sentryagent/agentidp container.
# All sensitive environment variables are injected from AWS Secrets Manager
# (AWS path) or GCP Secret Manager (GCP path) — no plaintext secrets here.
################################################################################
# Toolchain and provider constraints for this module.
# Both providers are declared because the module contains AWS and GCP
# resources; var.provider_type selects which set is actually created.
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    google = {
      version = ">= 5.20.0"
      source  = "hashicorp/google"
    }

    aws = {
      version = ">= 5.40.0"
      source  = "hashicorp/aws"
    }
  }
}
################################################################################
# Locals
################################################################################
locals {
  # Tags shared by the AWS resources in this module that set `tags`.
  common_tags = {
    managed_by  = "terraform"
    project     = var.project
    environment = var.environment
  }

  # Vault integration is optional on both clouds. Each local below collapses
  # to an empty collection when its controlling variable is the empty string,
  # so the Vault-related env vars are simply absent from the container.

  # AWS: plain VAULT_* env entries in ECS container-definition shape.
  aws_vault_env_plain = var.aws_vault_addr == "" ? [] : [
    { name = "VAULT_ADDR", value = var.aws_vault_addr },
    { name = "VAULT_MOUNT", value = var.aws_vault_mount },
  ]

  # AWS: VAULT_TOKEN as an ECS `secrets` entry, present only when a secret
  # ARN is supplied.
  aws_vault_secret_env = var.aws_secret_vault_token_arn == "" ? [] : [
    { name = "VAULT_TOKEN", valueFrom = var.aws_secret_vault_token_arn },
  ]

  # GCP: plain VAULT_* env vars as a name => value map (merged into the
  # Cloud Run env map).
  gcp_vault_env_plain = var.gcp_vault_addr == "" ? {} : {
    VAULT_ADDR  = var.gcp_vault_addr
    VAULT_MOUNT = var.gcp_vault_mount
  }
}
################################################################################
# ── AWS PATH ──────────────────────────────────────────────────────────────────
################################################################################
# Security group for the AgentIdP ECS tasks.
#
# Ingress is intentionally NOT declared inline here. The ALB security group
# and this group reference each other, so the "app_port from ALB" rule has to
# be attached by the calling environment module as a standalone rule resource
# (avoiding a circular dependency). An inline `ingress` block with an empty
# source list admits no traffic and would put ingress under inline management,
# which conflicts with standalone rule resources targeting this group.
# NOTE(review): confirm the environment module creates that ingress rule
# against this group's ID.
#
# Egress is fully open: tasks need outbound HTTPS for Secrets Manager and ECR
# API calls.
resource "aws_security_group" "app" {
  count = var.provider_type == "aws" ? 1 : 0

  name = "${var.project}-${var.environment}-app-sg"
  # ASCII only: the EC2 API rejects security group descriptions containing
  # characters outside its allowed ASCII set (e.g. an em dash).
  description = "Security group for AgentIdP ECS tasks - inbound from ALB only"
  vpc_id      = var.aws_vpc_id

  egress {
    description = "All outbound"
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(local.common_tags, {
    Name = "${var.project}-${var.environment}-app-sg"
  })
}
# ECS cluster that hosts the AgentIdP service (AWS path only).
# Container Insights is switched on at the cluster level.
resource "aws_ecs_cluster" "main" {
  count = var.provider_type == "aws" ? 1 : 0

  name = "${var.project}-${var.environment}"
  tags = local.common_tags

  setting {
    name  = "containerInsights"
    value = "enabled"
  }
}
# Capacity providers for the cluster: both FARGATE and FARGATE_SPOT are
# registered, while the default strategy places tasks on FARGATE only
# (base 1, weight 1).
resource "aws_ecs_cluster_capacity_providers" "main" {
  count = var.provider_type == "aws" ? 1 : 0

  cluster_name       = aws_ecs_cluster.main[0].name
  capacity_providers = ["FARGATE", "FARGATE_SPOT"]

  default_capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base              = 1
    weight            = 1
  }
}
# CloudWatch log group for the application; the name is supplied by the
# caller and log events are kept for 30 days.
resource "aws_cloudwatch_log_group" "app" {
  count = var.provider_type == "aws" ? 1 : 0

  name              = var.aws_log_group_name
  tags              = local.common_tags
  retention_in_days = 30
}
# ECS task definition for the AgentIdP container (Fargate, awsvpc networking).
#
# Non-sensitive settings are passed in `environment`; sensitive settings are
# listed under `secrets` and referenced by ARN (`valueFrom`), so no secret
# values appear in this file.
resource "aws_ecs_task_definition" "app" {
  count = var.provider_type == "aws" ? 1 : 0

  family                   = "${var.project}-${var.environment}"
  network_mode             = "awsvpc"
  requires_compatibilities = ["FARGATE"]
  cpu                      = tostring(var.aws_cpu)
  memory                   = tostring(var.aws_memory)
  execution_role_arn       = var.aws_execution_role_arn
  task_role_arn            = var.aws_task_role_arn

  container_definitions = jsonencode([
    {
      name      = "agentidp"
      image     = var.app_image
      essential = true

      portMappings = [
        {
          containerPort = var.app_port
          protocol      = "tcp"
        }
      ]

      # Plain (non-sensitive) environment variables. The Vault address/mount
      # entries are appended only when Vault is configured (see locals).
      environment = concat(
        [
          { name = "PORT", value = tostring(var.app_port) },
          { name = "NODE_ENV", value = "production" },
          { name = "CORS_ORIGIN", value = var.aws_cors_origin },
          { name = "POLICY_DIR", value = var.aws_policy_dir }
        ],
        local.aws_vault_env_plain
      )

      # Sensitive values referenced by secret ARN; each entry is injected as
      # the named environment variable. VAULT_TOKEN is appended only when a
      # token secret ARN is supplied (see locals).
      secrets = concat(
        [
          {
            name      = "DATABASE_URL"
            valueFrom = var.aws_secret_database_url_arn
          },
          {
            name      = "REDIS_URL"
            valueFrom = var.aws_secret_redis_url_arn
          },
          {
            name      = "JWT_PRIVATE_KEY"
            valueFrom = var.aws_secret_jwt_private_key_arn
          },
          {
            name      = "JWT_PUBLIC_KEY"
            valueFrom = var.aws_secret_jwt_public_key_arn
          }
        ],
        local.aws_vault_secret_env
      )

      logConfiguration = {
        logDriver = "awslogs"
        options = {
          # Reference the log group resource rather than the raw variable.
          # The value is identical, but the reference gives Terraform a
          # dependency edge so the log group exists before any task using
          # this definition is started.
          "awslogs-group"         = aws_cloudwatch_log_group.app[0].name
          "awslogs-region"        = var.aws_region
          "awslogs-stream-prefix" = "agentidp"
        }
      }

      # Container-level health check against the app's /health endpoint.
      # Requires `wget` to be present in the image.
      healthCheck = {
        command     = ["CMD-SHELL", "wget -qO- http://localhost:${var.app_port}/health || exit 1"]
        interval    = 30
        timeout     = 5
        retries     = 3
        startPeriod = 60
      }

      readonlyRootFilesystem = false
      user                   = "node"
    }
  ])

  tags = local.common_tags
}
# ECS service that runs the AgentIdP task definition on Fargate in the
# caller-supplied subnets and registers the tasks with the caller-supplied
# target group.
resource "aws_ecs_service" "app" {
  count = var.provider_type == "aws" ? 1 : 0

  name    = "${var.project}-${var.environment}"
  cluster = aws_ecs_cluster.main[0].id
  tags    = local.common_tags

  launch_type     = "FARGATE"
  task_definition = aws_ecs_task_definition.app[0].arn
  desired_count   = var.aws_desired_count

  # Rolling deployment bounds: never drop below the desired count of healthy
  # tasks, and allow up to double the desired count while new tasks start.
  deployment_maximum_percent         = 200
  deployment_minimum_healthy_percent = 100

  load_balancer {
    container_name   = "agentidp"
    container_port   = var.app_port
    target_group_arn = var.aws_target_group_arn
  }

  network_configuration {
    assign_public_ip = false
    security_groups  = [aws_security_group.app[0].id]
    subnets          = var.aws_subnet_ids
  }

  # Terraform does not reconcile task_definition or desired_count after
  # creation: image rollouts are driven by the CI/CD pipeline, which registers
  # new task definition revisions outside Terraform.
  # NOTE(review): desired_count is presumably ignored so external scaling is
  # not reverted on apply; confirm.
  lifecycle {
    ignore_changes = [task_definition, desired_count]
  }

  # Create the service only after the cluster's capacity providers are set.
  depends_on = [aws_ecs_cluster_capacity_providers.main]
}
################################################################################
# ── GCP PATH ──────────────────────────────────────────────────────────────────
################################################################################
# Cloud Run service for the AgentIdP container (GCP path only).
#
# Non-sensitive settings are passed as plain env vars; DATABASE_URL, REDIS_URL,
# the JWT key pair and (optionally) VAULT_TOKEN are read from Secret Manager
# at the "latest" version, so no secret values appear in this file.
resource "google_cloud_run_v2_service" "app" {
  count = var.provider_type == "gcp" ? 1 : 0

  name     = "${var.project}-${var.environment}"
  location = var.gcp_region
  project  = var.gcp_project_id

  # Ingress is open to all sources, including the public internet.
  # Cloud Run provides Google-managed TLS on the default *.run.app domain
  # and on any custom domains mapped via Cloud Run domain mappings.
  ingress = "INGRESS_TRAFFIC_ALL"

  template {
    service_account = var.gcp_service_account_email

    scaling {
      min_instance_count = var.gcp_min_instances
      max_instance_count = var.gcp_max_instances
    }

    # VPC access — route outbound traffic to private ranges through the VPC
    # connector so the container can reach Cloud SQL (private IP) and
    # Memorystore.
    vpc_access {
      connector = var.gcp_vpc_connector_name
      egress    = "PRIVATE_RANGES_ONLY"
    }

    containers {
      image = var.app_image

      ports {
        container_port = var.app_port
      }

      resources {
        limits = {
          cpu    = var.gcp_cpu
          memory = var.gcp_memory
        }
        cpu_idle          = false
        startup_cpu_boost = true
      }

      # Plain environment variables.
      # PORT is deliberately NOT set here: it is a reserved variable on
      # Cloud Run, which injects it automatically from `container_port`
      # above and rejects revisions that try to define it explicitly.
      dynamic "env" {
        for_each = merge(
          {
            NODE_ENV    = "production"
            CORS_ORIGIN = var.gcp_cors_origin
            POLICY_DIR  = var.gcp_policy_dir
          },
          local.gcp_vault_env_plain
        )
        content {
          name  = env.key
          value = env.value
        }
      }

      # DATABASE_URL from Secret Manager
      env {
        name = "DATABASE_URL"
        value_source {
          secret_key_ref {
            secret  = var.gcp_secret_database_url_id
            version = "latest"
          }
        }
      }

      # REDIS_URL from Secret Manager
      env {
        name = "REDIS_URL"
        value_source {
          secret_key_ref {
            secret  = var.gcp_secret_redis_url_id
            version = "latest"
          }
        }
      }

      # JWT_PRIVATE_KEY from Secret Manager
      env {
        name = "JWT_PRIVATE_KEY"
        value_source {
          secret_key_ref {
            secret  = var.gcp_secret_jwt_private_key_id
            version = "latest"
          }
        }
      }

      # JWT_PUBLIC_KEY from Secret Manager
      env {
        name = "JWT_PUBLIC_KEY"
        value_source {
          secret_key_ref {
            secret  = var.gcp_secret_jwt_public_key_id
            version = "latest"
          }
        }
      }

      # VAULT_TOKEN from Secret Manager — emitted only when a secret ID is
      # supplied.
      dynamic "env" {
        for_each = var.gcp_secret_vault_token_id != "" ? [1] : []
        content {
          name = "VAULT_TOKEN"
          value_source {
            secret_key_ref {
              secret  = var.gcp_secret_vault_token_id
              version = "latest"
            }
          }
        }
      }

      # Ongoing health: GET /health every 15s after a 30s delay; three
      # consecutive failures mark the container unhealthy.
      liveness_probe {
        http_get {
          path = "/health"
          port = var.app_port
        }
        initial_delay_seconds = 30
        period_seconds        = 15
        failure_threshold     = 3
        timeout_seconds       = 5
      }

      # Startup gate: GET /health every 5s after a 10s delay, tolerating up
      # to 12 failures before the start is considered failed.
      startup_probe {
        http_get {
          path = "/health"
          port = var.app_port
        }
        initial_delay_seconds = 10
        period_seconds        = 5
        failure_threshold     = 12
        timeout_seconds       = 3
      }
    }
  }

  # Hyphens in the project name are converted to underscores for the label
  # value.
  labels = {
    environment = var.environment
    project     = replace(var.project, "-", "_")
    managed_by  = "terraform"
  }
}
# Grants roles/run.invoker to allUsers, i.e. the Cloud Run service accepts
# unauthenticated invocations from the public internet. Client authentication
# for AgentIdP is enforced by the application itself (JWT Bearer tokens)
# rather than by Cloud Run's IAM-based auth.
resource "google_cloud_run_v2_service_iam_member" "public_invoker" {
  count = var.provider_type == "gcp" ? 1 : 0

  name     = google_cloud_run_v2_service.app[0].name
  location = var.gcp_region
  project  = var.gcp_project_id

  member = "allUsers"
  role   = "roles/run.invoker"
}