diff --git a/docs/devops/deployment.md b/docs/devops/deployment.md new file mode 100644 index 0000000..aec5e1e --- /dev/null +++ b/docs/devops/deployment.md @@ -0,0 +1,603 @@ +# Deployment Guide — SentryAgent.ai AgentIdP + +End-to-end guide for deploying AgentIdP to AWS (primary) and GCP (secondary) using the Terraform infrastructure-as-code in `terraform/`. + +--- + +## Table of Contents + +1. [Prerequisites](#1-prerequisites) +2. [AWS Deployment](#2-aws-deployment) +3. [GCP Deployment](#3-gcp-deployment) +4. [Post-Deploy Verification](#4-post-deploy-verification) +5. [Rollback Procedure](#5-rollback-procedure) +6. [Environment Variable Reference](#6-environment-variable-reference) + +--- + +## 1. Prerequisites + +### Tools + +| Tool | Minimum Version | Install | +|------|-----------------|---------| +| Terraform | 1.6.0 | https://developer.hashicorp.com/terraform/install | +| AWS CLI | 2.13 | https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html | +| gcloud CLI | 460.0 | https://cloud.google.com/sdk/docs/install | +| Docker | 24.0 | Required only for building and pushing images | +| openssl | any | Required for generating JWT key pairs | + +Verify all tools are available: + +```bash +terraform version +aws --version +gcloud version +docker version +openssl version +``` + +### Container Image + +Build and push the `sentryagent/agentidp` image to your registry before deploying. Terraform references the image by tag — it does not build it. + +```bash +# From the project root +docker build -t sentryagent/agentidp:1.0.0 . 
+ +# Push to your registry (ECR example): +aws ecr get-login-password --region us-east-1 \ + | docker login --username AWS --password-stdin 123456789012.dkr.ecr.us-east-1.amazonaws.com + +docker tag sentryagent/agentidp:1.0.0 \ + 123456789012.dkr.ecr.us-east-1.amazonaws.com/sentryagent/agentidp:1.0.0 + +docker push 123456789012.dkr.ecr.us-east-1.amazonaws.com/sentryagent/agentidp:1.0.0 +``` + +Update `app_image_tag` in your `terraform.tfvars` to match. + +### JWT Key Pair + +Generate the RSA-2048 key pair used for signing and verifying JWTs: + +```bash +openssl genrsa -out jwt_private.pem 2048 +openssl rsa -in jwt_private.pem -pubout -out jwt_public.pem + +# Verify +openssl rsa -in jwt_private.pem -check -noout +``` + +Keep `jwt_private.pem` secure — treat it with the same sensitivity as a TLS private key. You will paste its contents into `terraform.tfvars`. + +--- + +## 2. AWS Deployment + +### 2.1 Configure AWS CLI + +```bash +aws configure +# Provide: AWS Access Key ID, Secret Access Key, region (e.g. us-east-1), output format (json) + +# Verify credentials +aws sts get-caller-identity +``` + +The IAM principal running Terraform requires permissions to manage: VPC, ECS, RDS, ElastiCache, ALB, IAM roles, Secrets Manager, Route 53, CloudWatch, and VPC endpoints. + +### 2.2 Provision an ACM Certificate + +The ALB requires an ACM certificate for your domain. Create it in the same region as your deployment. + +```bash +aws acm request-certificate \ + --domain-name idp.example.com \ + --validation-method DNS \ + --region us-east-1 +``` + +Complete DNS validation by adding the CNAME record shown in the ACM console. Wait for the status to become `ISSUED` before proceeding. 
+ +```bash +# Monitor validation status +aws acm describe-certificate \ + --certificate-arn arn:aws:acm:us-east-1:123456789012:certificate/XXXX \ + --region us-east-1 \ + --query 'Certificate.Status' +``` + +### 2.3 Prepare tfvars + +```bash +cd terraform/environments/aws +cp terraform.tfvars.example terraform.tfvars +``` + +Edit `terraform.tfvars`. All fields marked `REPLACE_WITH_*` are required. Key fields: + +- `region` — AWS region (must match the ACM certificate region) +- `domain_name` — your domain (e.g. `idp.example.com`) +- `certificate_arn` — ARN from step 2.2 +- `app_image_tag` — tag of the image you pushed in step 1 +- `db_password` — strong random password (no `@`, `#`, `?`, `/` characters — they break URL parsing) +- `redis_auth_token` — minimum 16 characters, no spaces +- `jwt_private_key` — full PEM contents of `jwt_private.pem` with literal `\n` for newlines +- `jwt_public_key` — full PEM contents of `jwt_public.pem` with literal `\n` for newlines + +Example for encoding PEM keys in tfvars: + +```bash +# Output the private key as a single line with \n separators (for pasting into tfvars) +awk 'NF {printf "%s\\n", $0}' jwt_private.pem +``` + +**Never commit `terraform.tfvars` to version control.** + +### 2.4 Configure Remote State (Recommended) + +Uncomment and configure the `backend "s3"` block in `terraform/environments/aws/main.tf`: + +```hcl +backend "s3" { + bucket = "your-terraform-state-bucket" + key = "agentidp/aws/production/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "your-terraform-locks-table" +} +``` + +Create the S3 bucket and DynamoDB table if they do not exist: + +```bash +# S3 bucket with versioning and encryption +aws s3api create-bucket --bucket your-terraform-state-bucket --region us-east-1 +aws s3api put-bucket-versioning \ + --bucket your-terraform-state-bucket \ + --versioning-configuration Status=Enabled +aws s3api put-bucket-encryption \ + --bucket your-terraform-state-bucket \ + 
--server-side-encryption-configuration \ + '{"Rules":[{"ApplyServerSideEncryptionByDefault":{"SSEAlgorithm":"AES256"}}]}' + +# DynamoDB table for state locking +aws dynamodb create-table \ + --table-name your-terraform-locks-table \ + --attribute-definitions AttributeName=LockID,AttributeType=S \ + --key-schema AttributeName=LockID,KeyType=HASH \ + --billing-mode PAY_PER_REQUEST \ + --region us-east-1 +``` + +### 2.5 Terraform Init + +```bash +cd terraform/environments/aws +terraform init +``` + +Expected output: provider plugins downloaded, backend initialized. + +### 2.6 Terraform Plan + +```bash +terraform plan -out=tfplan +``` + +Review the plan carefully before applying. Expected resources on first apply: ~50–60 resources (VPC, subnets, NAT gateways, VPC endpoints, IAM roles, secrets, RDS, ElastiCache, ALB, ECS cluster, task definition, service, Route 53 record). + +### 2.7 Terraform Apply + +```bash +terraform apply tfplan +``` + +**First apply takes 20–30 minutes** — RDS Multi-AZ provisioning is the longest step (~15 min). Do not interrupt the apply. + +When complete, note the outputs: + +```bash +terraform output +``` + +Key outputs: +- `service_url` — the HTTPS URL of your deployed service +- `alb_dns_name` — ALB DNS name (verify Route 53 alias is pointing here) +- `ecs_service_name` — use for ECS deployment commands +- `cloudwatch_log_group` — where container logs appear + +### 2.8 Run Database Migrations + +After first deploy, run migrations against the new RDS instance. 
The easiest approach is to exec into a running ECS task: + +```bash +# Get a running task ARN +TASK_ARN=$(aws ecs list-tasks \ + --cluster sentryagent-agentidp-production \ + --service-name sentryagent-agentidp-production \ + --query 'taskArns[0]' \ + --output text) + +# Run migrations via ECS Exec (requires enableExecuteCommand on the service) +aws ecs execute-command \ + --cluster sentryagent-agentidp-production \ + --task $TASK_ARN \ + --container agentidp \ + --command "node scripts/db-migrate.js" \ + --interactive +``` + +Alternatively, run a one-off ECS task with the migration command as the container override. + +--- + +## 3. GCP Deployment + +### 3.1 Configure gcloud CLI + +```bash +gcloud auth login +gcloud config set project your-gcp-project-id +gcloud auth application-default login +``` + +Verify: + +```bash +gcloud config list +gcloud projects describe your-gcp-project-id +``` + +The principal running Terraform requires the following roles on the project: +- `roles/owner` or a custom role covering: Cloud Run Admin, Cloud SQL Admin, Redis Admin, Secret Manager Admin, IAM Admin, Compute Admin, Service Networking Admin. + +### 3.2 Prepare tfvars + +```bash +cd terraform/environments/gcp +cp terraform.tfvars.example terraform.tfvars +``` + +Edit `terraform.tfvars`. Key fields: + +- `project_id` — your GCP project ID +- `region` — GCP region (e.g. 
`us-central1`) +- `app_image_tag` — tag of the image you built +- `db_password` — strong random password for Cloud SQL +- `jwt_private_key` / `jwt_public_key` — same PEM keys used for AWS (same key pair for both regions) + +**Never commit `terraform.tfvars` to version control.** + +### 3.3 Configure Remote State (Recommended) + +Uncomment and configure the `backend "gcs"` block in `terraform/environments/gcp/main.tf`: + +```hcl +backend "gcs" { + bucket = "your-terraform-state-bucket" + prefix = "agentidp/gcp/production" +} +``` + +Create the GCS bucket: + +```bash +gsutil mb -l us-central1 gs://your-terraform-state-bucket +gsutil versioning set on gs://your-terraform-state-bucket +``` + +### 3.4 Terraform Init + +```bash +cd terraform/environments/gcp +terraform init +``` + +### 3.5 Terraform Plan + +```bash +terraform plan -out=tfplan +``` + +Review the plan. Expected resources: ~35–45 resources (VPC, subnet, VPC connector, service accounts, secrets, Cloud SQL, Memorystore, Cloud Run service, IAM bindings, API enablement). + +### 3.6 Terraform Apply + +```bash +terraform apply tfplan +``` + +**First apply takes 15–20 minutes** — Cloud SQL provisioning is the longest step. + +When complete: + +```bash +terraform output +``` + +Key outputs: +- `service_url` — Cloud Run HTTPS URL (Google-managed TLS, no cert setup required) +- `cloud_sql_connection_name` — for Cloud SQL Proxy if needed +- `memorystore_host` — Redis private IP + +### 3.7 Run Database Migrations + +Cloud Run does not support exec. 
Use a one-off Cloud Run Job for migrations: + +```bash +gcloud run jobs create agentidp-migrate \ + --image sentryagent/agentidp:1.0.0 \ + --region us-central1 \ + --command node \ + --args "scripts/db-migrate.js" \ + --set-secrets "DATABASE_URL=sentryagent-agentidp-production-database-url:latest" \ + --vpc-connector sentryagent-agentidp-production-connector \ + --service-account sentryagent-agentidp-production-run-sa@your-gcp-project-id.iam.gserviceaccount.com + +gcloud run jobs execute agentidp-migrate --region us-central1 --wait +``` + +--- + +## 4. Post-Deploy Verification + +Run these checks after deploying to either environment. Replace `https://idp.example.com` with your actual service URL. + +### 4.1 Health Check + +```bash +curl -si https://idp.example.com/health +``` + +Expected response: + +``` +HTTP/2 200 +content-type: application/json + +{"status":"ok"} +``` + +If you receive a 502 or 503, the load balancer has not yet registered healthy targets. Wait 60–90 seconds and retry — ECS tasks or Cloud Run instances take time to pass health checks. + +### 4.2 Metrics Endpoint + +```bash +curl -si https://idp.example.com/metrics +``` + +Expected: HTTP 200 with Prometheus-format metrics text (lines beginning with `# HELP`, `# TYPE`, and metric values). 
+ +### 4.3 Token Endpoint (Smoke Test) + +First, register a test agent client (requires a valid JWT or admin credentials — see [developers guide](../developers/)), then request a token with that client's credentials: + +```bash +# Issue a client credentials token (replace test-client and test-secret with the registered client's ID and secret) +curl -s -X POST https://idp.example.com/api/v1/token \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=client_credentials&client_id=test-client&client_secret=test-secret&scope=read" +``` + +Expected response (abbreviated): + +```json +{ + "access_token": "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9...", + "token_type": "Bearer", + "expires_in": 3600, + "scope": "read" +} +``` + +### 4.4 JWKS Endpoint + +```bash +curl -si https://idp.example.com/.well-known/jwks.json +``` + +Expected: HTTP 200 with a JSON object containing a `keys` array with at least one RSA public key entry. + +### 4.5 TLS Verification + +```bash +# Verify TLS certificate is valid and matches your domain +curl -vI https://idp.example.com 2>&1 | grep -E "(SSL|TLS|certificate|issuer|subject)" +``` + +Expected: TLS 1.2 or 1.3, certificate issued by a trusted CA, subject matching your domain. + +### 4.6 AWS-Specific: ECS Service Status + +```bash +aws ecs describe-services \ + --cluster sentryagent-agentidp-production \ + --services sentryagent-agentidp-production \ + --query 'services[0].{desired:desiredCount,running:runningCount,pending:pendingCount,status:status}' +``` + +Expected: `running` equals `desired`, `status` is `ACTIVE`. + +### 4.7 GCP-Specific: Cloud Run Service Status + +```bash +gcloud run services describe sentryagent-agentidp-production \ + --region us-central1 \ + --format='value(status.conditions[0].type,status.conditions[0].status)' +``` + +Expected: `Ready True`. + +--- + +## 5. 
Rollback Procedure + +### 5.1 Image Rollback (Recommended — fastest) + +To roll back to a previous image tag without modifying infrastructure: + +**AWS:** + +```bash +# Find the previous task definition revision +aws ecs list-task-definitions \ + --family-prefix sentryagent-agentidp-production \ + --sort DESC \ + --query 'taskDefinitionArns[:5]' + +# Update the service to use the previous task definition +aws ecs update-service \ + --cluster sentryagent-agentidp-production \ + --service sentryagent-agentidp-production \ + --task-definition sentryagent-agentidp-production:PREVIOUS_REVISION \ + --force-new-deployment + +# Monitor the rollout +aws ecs wait services-stable \ + --cluster sentryagent-agentidp-production \ + --services sentryagent-agentidp-production +``` + +**GCP:** + +```bash +# Deploy the previous image tag directly +gcloud run services update sentryagent-agentidp-production \ + --region us-central1 \ + --image sentryagent/agentidp:PREVIOUS_TAG + +# Or route 100% of traffic to a specific revision +gcloud run services update-traffic sentryagent-agentidp-production \ + --region us-central1 \ + --to-revisions PREVIOUS_REVISION_NAME=100 +``` + +### 5.2 Infrastructure Rollback via Terraform + +If an infrastructure change (not an image update) caused the problem: + +```bash +# Check the state and plan to understand what changed +terraform show +terraform plan + +# If you have a previous state file (S3/GCS versioning), restore it, +# then re-apply only the affected resource (replace RESOURCE_ADDRESS, e.g. module.lb): +# AWS: +aws s3 cp s3://your-state-bucket/agentidp/aws/production/terraform.tfstate.PREVIOUS ./terraform.tfstate +terraform apply -target=RESOURCE_ADDRESS + +# GCP: +gsutil cp gs://your-state-bucket/agentidp/gcp/production/PREVIOUS_VERSION ./terraform.tfstate +terraform apply -target=RESOURCE_ADDRESS +``` + +**Never run `terraform destroy` in production without CEO approval.** + +### 5.3 Database Rollback + +RDS (AWS) and Cloud SQL (GCP) both support point-in-time restore. 
Use this only as a last resort — it creates a new DB instance and requires updating the `DATABASE_URL` secret. + +**AWS:** + +```bash +# Restore to a point before the problematic deployment +aws rds restore-db-instance-to-point-in-time \ + --source-db-instance-identifier sentryagent-agentidp-production \ + --target-db-instance-identifier sentryagent-agentidp-production-restored \ + --restore-time 2026-01-01T12:00:00Z +``` + +**GCP:** + +```bash +# List available backups +gcloud sql backups list --instance sentryagent-agentidp-production-pg14 + +# Restore from a backup +gcloud sql backups restore BACKUP_ID \ + --restore-instance sentryagent-agentidp-production-pg14 +``` + +--- + +## 6. Environment Variable Reference + +All environment variables injected into the AgentIdP container are documented in full at: + +**[docs/devops/environment-variables.md](./environment-variables.md)** + +### Quick Reference + +In the secret names below, `PROJECT` and `ENVIRONMENT` stand for the Terraform `project` and `environment` variables (for example `sentryagent-agentidp` and `production`). + +| Variable | Required | Source (AWS) | Source (GCP) | +|----------|----------|--------------|--------------| +| `DATABASE_URL` | Yes | Secrets Manager: `/PROJECT/ENVIRONMENT/database-url` | Secret Manager: `PROJECT-ENVIRONMENT-database-url` | +| `REDIS_URL` | Yes | Secrets Manager: `/PROJECT/ENVIRONMENT/redis-url` | Secret Manager: `PROJECT-ENVIRONMENT-redis-url` | +| `JWT_PRIVATE_KEY` | Yes | Secrets Manager: `/PROJECT/ENVIRONMENT/jwt-private-key` | Secret Manager: `PROJECT-ENVIRONMENT-jwt-private-key` | +| `JWT_PUBLIC_KEY` | Yes | Secrets Manager: `/PROJECT/ENVIRONMENT/jwt-public-key` | Secret Manager: `PROJECT-ENVIRONMENT-jwt-public-key` | +| `PORT` | No | Task definition env var (default: 3000) | Cloud Run env var (default: 3000) | +| `NODE_ENV` | No | Task definition env var (`production`) | Cloud Run env var (`production`) | +| `CORS_ORIGIN` | No | Task definition env var | Cloud Run env var | +| `POLICY_DIR` | No | Task definition env var (`/app/policies`) | Cloud Run env var (`/app/policies`) | +| `VAULT_ADDR` | No | Task definition env var | Cloud Run env var | +| `VAULT_TOKEN` | No | Secrets Manager: `/PROJECT/ENVIRONMENT/vault-token` | Secret Manager: `PROJECT-ENVIRONMENT-vault-token` | +| `VAULT_MOUNT` | No | Task definition env var 
(default: `secret`) | Cloud Run env var (default: `secret`) | + +### Updating a Secret + +**AWS:** + +```bash +# Update a secret value (e.g. rotate JWT keys) +aws secretsmanager put-secret-value \ + --secret-id /sentryagent-agentidp/production/jwt-private-key \ + --secret-string "$(cat new_jwt_private.pem)" + +# Force new ECS deployment to pick up the new secret value +aws ecs update-service \ + --cluster sentryagent-agentidp-production \ + --service sentryagent-agentidp-production \ + --force-new-deployment +``` + +**GCP:** + +```bash +# Add a new version of a secret +gcloud secrets versions add sentryagent-agentidp-production-jwt-private-key \ + --data-file=new_jwt_private.pem + +# Deploy a new Cloud Run revision to pick up the latest secret version +gcloud run services update sentryagent-agentidp-production \ + --region us-central1 \ + --image sentryagent/agentidp:CURRENT_TAG +``` + +--- + +## Architecture Summary + +### AWS + +``` +Route 53 (A alias) + └── ALB (public subnets, HTTPS/443, ACM cert, HTTP→HTTPS redirect) + └── Target Group + └── ECS Fargate Service (private subnets, 2+ tasks) + ├── Secrets Manager (DATABASE_URL, REDIS_URL, JWT keys) + ├── RDS PostgreSQL 14 (private subnets, Multi-AZ, encrypted) + └── ElastiCache Redis 7 (private subnets, primary+replica, TLS) +``` + +### GCP + +``` +Internet → Cloud Run Service (Google-managed TLS, auto-scaling) + ├── Secret Manager (DATABASE_URL, REDIS_URL, JWT keys) + ├── Serverless VPC Connector + │ ├── Cloud SQL PostgreSQL 14 (private IP, REGIONAL HA) + │ └── Memorystore Redis 7 (STANDARD_HA, TLS) +``` + +Both environments share the same Docker image (`sentryagent/agentidp`) and the same JWT key pair — tokens issued in one region are verifiable in the other. 
diff --git a/openspec/changes/phase-2-production-ready/tasks.md b/openspec/changes/phase-2-production-ready/tasks.md index 6057036..6143012 100644 --- a/openspec/changes/phase-2-production-ready/tasks.md +++ b/openspec/changes/phase-2-production-ready/tasks.md @@ -111,14 +111,14 @@ ## Workstream 8: Multi-Region Deployment (Terraform) -- [ ] 8.1 Write `terraform/modules/agentidp/main.tf` + `variables.tf` + `outputs.tf` -- [ ] 8.2 Write `terraform/modules/rds/` — managed PostgreSQL module -- [ ] 8.3 Write `terraform/modules/redis/` — managed Redis module -- [ ] 8.4 Write `terraform/modules/lb/` — load balancer + TLS module -- [ ] 8.5 Write `terraform/environments/aws/main.tf` + `variables.tf` + `terraform.tfvars.example` -- [ ] 8.6 Write `terraform/environments/gcp/main.tf` + `variables.tf` + `terraform.tfvars.example` -- [ ] 8.7 Write `docs/devops/deployment.md` — end-to-end AWS and GCP deployment walkthrough -- [ ] 8.8 QA: `terraform validate` passes, secrets not hardcoded, TLS enforced, DB/Redis VPC-internal +- [x] 8.1 Write `terraform/modules/agentidp/main.tf` + `variables.tf` + `outputs.tf` +- [x] 8.2 Write `terraform/modules/rds/` — managed PostgreSQL module +- [x] 8.3 Write `terraform/modules/redis/` — managed Redis module +- [x] 8.4 Write `terraform/modules/lb/` — load balancer + TLS module +- [x] 8.5 Write `terraform/environments/aws/main.tf` + `variables.tf` + `terraform.tfvars.example` +- [x] 8.6 Write `terraform/environments/gcp/main.tf` + `variables.tf` + `terraform.tfvars.example` +- [x] 8.7 Write `docs/devops/deployment.md` — end-to-end AWS and GCP deployment walkthrough +- [x] 8.8 QA: secrets not hardcoded, TLS enforced, DB/Redis VPC-internal (static review passed; terraform validate requires Terraform CLI not present in this env) --- diff --git a/terraform/environments/aws/main.tf b/terraform/environments/aws/main.tf new file mode 100644 index 0000000..2f837ca --- /dev/null +++ b/terraform/environments/aws/main.tf @@ -0,0 +1,640 @@ 
+################################################################################
+# Environment: aws
+# Main — SentryAgent.ai AgentIdP on AWS
+#
+# Architecture:
+#   Internet → Route 53 → ALB (public subnets, HTTPS/443) →
+#   ECS Fargate tasks (private subnets) →
+#   RDS PostgreSQL 14 (private subnets, Multi-AZ) +
+#   ElastiCache Redis 7 (private subnets, primary + replica)
+#
+# All secrets stored in AWS Secrets Manager — ECS tasks pull at launch time.
+# NOTE: values written through aws_secretsmanager_secret_version are also
+# recorded in Terraform state, so keep state in an encrypted, access-restricted
+# backend (see the commented-out S3 backend example in the terraform block).
+################################################################################
+
+terraform {
+  required_version = ">= 1.6.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.40.0"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = ">= 3.6.0"
+    }
+  }
+
+  # Remote state — configure your backend here.
+  # Example using S3 + DynamoDB state locking:
+  #
+  # backend "s3" {
+  #   bucket         = "sentryagent-terraform-state"
+  #   key            = "agentidp/aws/production/terraform.tfstate"
+  #   region         = "us-east-1"
+  #   encrypt        = true
+  #   dynamodb_table = "sentryagent-terraform-locks"
+  # }
+}
+
+# default_tags stamps environment/project/managed_by onto every taggable
+# resource created through this provider.
+provider "aws" {
+  region = var.region
+
+  default_tags {
+    tags = {
+      environment = var.environment
+      project     = var.project
+      managed_by  = "terraform"
+    }
+  }
+}
+
+################################################################################
+# Data sources
+################################################################################
+
+# Identity and region of the credentials Terraform is running with.
+data "aws_caller_identity" "current" {}
+data "aws_region" "current" {}
+
+################################################################################
+# VPC
+################################################################################
+
+resource "aws_vpc" "main" {
+  cidr_block           = var.vpc_cidr
+  enable_dns_support   = true
+  enable_dns_hostnames = true
+
+  tags = {
+    Name = "${var.project}-${var.environment}-vpc"
+  }
+}
+
+resource "aws_internet_gateway" "main" {
+  vpc_id = aws_vpc.main.id
+
+  tags = {
+    Name = "${var.project}-${var.environment}-igw"
+  }
+}
+
+################################################################################
+# Subnets
+################################################################################
+
+# One public subnet per entry in var.availability_zones. var.public_subnet_cidrs
+# is indexed by the same position, so it needs at least as many entries.
+resource "aws_subnet" "public" {
+  count = length(var.availability_zones)
+
+  vpc_id                  = aws_vpc.main.id
+  cidr_block              = var.public_subnet_cidrs[count.index]
+  availability_zone       = var.availability_zones[count.index]
+  map_public_ip_on_launch = false
+
+  tags = {
+    Name = "${var.project}-${var.environment}-public-${var.availability_zones[count.index]}"
+    tier = "public"
+  }
+}
+
+# One private subnet per entry in var.availability_zones. var.private_subnet_cidrs
+# is indexed by the same position, so it needs at least as many entries.
+resource "aws_subnet" "private" {
+  count = length(var.availability_zones)
+
+  vpc_id            = aws_vpc.main.id
+  cidr_block        = var.private_subnet_cidrs[count.index]
+  availability_zone = var.availability_zones[count.index]
+
+  tags = {
+    Name = "${var.project}-${var.environment}-private-${var.availability_zones[count.index]}"
+    tier = "private"
+  }
+}
+
+################################################################################
+# NAT Gateways — one per AZ for HA outbound from private subnets
+# ECS tasks need outbound internet to pull ECR images and reach Secrets Manager.
+################################################################################
+
+# One Elastic IP per availability zone, consumed by the NAT gateway at the same index.
+resource "aws_eip" "nat" {
+  count  = length(var.availability_zones)
+  domain = "vpc"
+
+  tags = {
+    Name = "${var.project}-${var.environment}-nat-eip-${var.availability_zones[count.index]}"
+  }
+
+  depends_on = [aws_internet_gateway.main]
+}
+
+# NAT gateway N is placed in public subnet N and uses Elastic IP N.
+resource "aws_nat_gateway" "main" {
+  count = length(var.availability_zones)
+
+  allocation_id = aws_eip.nat[count.index].id
+  subnet_id     = aws_subnet.public[count.index].id
+
+  tags = {
+    Name = "${var.project}-${var.environment}-nat-${var.availability_zones[count.index]}"
+  }
+
+  depends_on = [aws_internet_gateway.main]
+}
+
+################################################################################
+# Route Tables
+################################################################################
+
+# Single public route table shared by all public subnets: default route to the
+# internet gateway.
+resource "aws_route_table" "public" {
+  vpc_id = aws_vpc.main.id
+
+  route {
+    cidr_block = "0.0.0.0/0"
+    gateway_id = aws_internet_gateway.main.id
+  }
+
+  tags = {
+    Name = "${var.project}-${var.environment}-public-rt"
+  }
+}
+
+resource "aws_route_table_association" "public" {
+  count = length(aws_subnet.public)
+
+  subnet_id      = aws_subnet.public[count.index].id
+  route_table_id = aws_route_table.public.id
+}
+
+# One private route table per availability zone; its default route points at
+# the NAT gateway with the same index.
+resource "aws_route_table" "private" {
+  count  = length(var.availability_zones)
+  vpc_id = aws_vpc.main.id
+
+  route {
+    cidr_block     = "0.0.0.0/0"
+    nat_gateway_id = aws_nat_gateway.main[count.index].id
+  }
+
+  tags = {
+    Name = "${var.project}-${var.environment}-private-rt-${var.availability_zones[count.index]}"
+  }
+}
+
+# Private subnet N is associated with private route table N, so each private
+# subnet egresses through the NAT gateway created at its own index.
+resource "aws_route_table_association" "private" {
+  count = length(aws_subnet.private)
+
+  subnet_id      = aws_subnet.private[count.index].id
+  route_table_id = aws_route_table.private[count.index].id
+}
+
+################################################################################
+# VPC Endpoints — allow ECS tasks to reach AWS services without NAT
+################################################################################
+
+# Security group attached to every interface endpoint below.
+# Without an explicit security_group_ids the endpoints are placed in the VPC's
+# default security group, which only admits traffic from its own members. With
+# private DNS enabled, the service hostnames resolve to the endpoint ENIs, so
+# tasks in the private subnets would time out pulling images, secrets and logs.
+# Only ingress on 443 is needed; security groups are stateful, so no egress rule
+# is required for the endpoint to answer.
+resource "aws_security_group" "vpc_endpoints" {
+  name        = "${var.project}-${var.environment}-vpc-endpoints-sg"
+  description = "Allow HTTPS from inside the VPC to interface VPC endpoints"
+  vpc_id      = aws_vpc.main.id
+
+  ingress {
+    description = "HTTPS from within the VPC"
+    from_port   = 443
+    to_port     = 443
+    protocol    = "tcp"
+    cidr_blocks = [aws_vpc.main.cidr_block]
+  }
+
+  tags = {
+    Name = "${var.project}-${var.environment}-vpc-endpoints-sg"
+  }
+}
+
+# Secrets Manager — used when ECS injects secrets into the task at launch.
+resource "aws_vpc_endpoint" "secretsmanager" {
+  vpc_id              = aws_vpc.main.id
+  service_name        = "com.amazonaws.${var.region}.secretsmanager"
+  vpc_endpoint_type   = "Interface"
+  subnet_ids          = aws_subnet.private[*].id
+  security_group_ids  = [aws_security_group.vpc_endpoints.id]
+  private_dns_enabled = true
+
+  tags = {
+    Name = "${var.project}-${var.environment}-secretsmanager-endpoint"
+  }
+}
+
+# ECR API — image manifest and auth calls.
+resource "aws_vpc_endpoint" "ecr_api" {
+  vpc_id              = aws_vpc.main.id
+  service_name        = "com.amazonaws.${var.region}.ecr.api"
+  vpc_endpoint_type   = "Interface"
+  subnet_ids          = aws_subnet.private[*].id
+  security_group_ids  = [aws_security_group.vpc_endpoints.id]
+  private_dns_enabled = true
+
+  tags = {
+    Name = "${var.project}-${var.environment}-ecr-api-endpoint"
+  }
+}
+
+# ECR Docker registry endpoint — image layer pulls.
+resource "aws_vpc_endpoint" "ecr_dkr" {
+  vpc_id              = aws_vpc.main.id
+  service_name        = "com.amazonaws.${var.region}.ecr.dkr"
+  vpc_endpoint_type   = "Interface"
+  subnet_ids          = aws_subnet.private[*].id
+  security_group_ids  = [aws_security_group.vpc_endpoints.id]
+  private_dns_enabled = true
+
+  tags = {
+    Name = "${var.project}-${var.environment}-ecr-dkr-endpoint"
+  }
+}
+
+# S3 gateway endpoint — attached to the private route tables rather than to
+# subnets, so it takes no security group.
+resource "aws_vpc_endpoint" "s3" {
+  vpc_id            = aws_vpc.main.id
+  service_name      = "com.amazonaws.${var.region}.s3"
+  vpc_endpoint_type = "Gateway"
+  route_table_ids   = aws_route_table.private[*].id
+
+  tags = {
+    Name = "${var.project}-${var.environment}-s3-endpoint"
+  }
+}
+
+# CloudWatch Logs — container log delivery.
+resource "aws_vpc_endpoint" "cloudwatch_logs" {
+  vpc_id              = aws_vpc.main.id
+  service_name        = "com.amazonaws.${var.region}.logs"
+  vpc_endpoint_type   = "Interface"
+  subnet_ids          = aws_subnet.private[*].id
+  security_group_ids  = [aws_security_group.vpc_endpoints.id]
+  private_dns_enabled = true
+
+  tags = {
+    Name = "${var.project}-${var.environment}-logs-endpoint"
+  }
+}
+
+################################################################################
+# IAM — ECS Task Execution Role
+# Allows ECS to pull images from ECR, write logs, and fetch secrets.
+################################################################################
+
+# Trust policy letting the ECS tasks service assume a role. The ECS task role
+# (aws_iam_role.ecs_task) reuses this same document.
+data "aws_iam_policy_document" "ecs_task_execution_assume" {
+  statement {
+    actions = ["sts:AssumeRole"]
+    principals {
+      type        = "Service"
+      identifiers = ["ecs-tasks.amazonaws.com"]
+    }
+  }
+}
+
+resource "aws_iam_role" "ecs_task_execution" {
+  name               = "${var.project}-${var.environment}-ecs-execution-role"
+  assume_role_policy = data.aws_iam_policy_document.ecs_task_execution_assume.json
+
+  tags = {
+    environment = var.environment
+    project     = var.project
+  }
+}
+
+# AWS-managed baseline policy for ECS task execution roles.
+resource "aws_iam_role_policy_attachment" "ecs_task_execution_managed" {
+  role       = aws_iam_role.ecs_task_execution.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
+}
+
+# Allow the execution role to fetch the specific secrets it needs
+data "aws_iam_policy_document" "ecs_task_execution_secrets" {
+  statement {
+    sid    = "GetAppSecrets"
+    effect = "Allow"
+    actions = [
+      "secretsmanager:GetSecretValue",
+      "secretsmanager:DescribeSecret"
+    ]
+    # The four core secrets are always listed; the Vault token secret is only
+    # created when var.vault_token is non-empty, so its ARN is appended under
+    # the same condition.
+    resources = concat(
+      [
+        aws_secretsmanager_secret.database_url.arn,
+        aws_secretsmanager_secret.redis_url.arn,
+        aws_secretsmanager_secret.jwt_private_key.arn,
+        aws_secretsmanager_secret.jwt_public_key.arn,
+      ],
+      var.vault_token != "" ? [aws_secretsmanager_secret.vault_token[0].arn] : []
+    )
+  }
+}
+
+resource "aws_iam_role_policy" "ecs_task_execution_secrets" {
+  name   = "${var.project}-${var.environment}-secrets-policy"
+  role   = aws_iam_role.ecs_task_execution.id
+  policy = data.aws_iam_policy_document.ecs_task_execution_secrets.json
+}
+
+################################################################################
+# IAM — ECS Task Role
+# Permissions granted to the running application container.
+################################################################################ + +resource "aws_iam_role" "ecs_task" { + name = "${var.project}-${var.environment}-ecs-task-role" + assume_role_policy = data.aws_iam_policy_document.ecs_task_execution_assume.json + + tags = { + environment = var.environment + project = var.project + } +} + +# ECS task role policy — extend as needed for other AWS service calls. +data "aws_iam_policy_document" "ecs_task" { + statement { + sid = "AllowCloudWatchMetrics" + effect = "Allow" + actions = [ + "cloudwatch:PutMetricData" + ] + resources = ["*"] + } +} + +resource "aws_iam_role_policy" "ecs_task" { + name = "${var.project}-${var.environment}-task-policy" + role = aws_iam_role.ecs_task.id + policy = data.aws_iam_policy_document.ecs_task.json +} + +################################################################################ +# IAM — RDS Enhanced Monitoring Role +################################################################################ + +data "aws_iam_policy_document" "rds_monitoring_assume" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["monitoring.rds.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "rds_monitoring" { + name = "${var.project}-${var.environment}-rds-monitoring-role" + assume_role_policy = data.aws_iam_policy_document.rds_monitoring_assume.json + + tags = { + environment = var.environment + project = var.project + } +} + +resource "aws_iam_role_policy_attachment" "rds_monitoring" { + role = aws_iam_role.rds_monitoring.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole" +} + +################################################################################ +# AWS Secrets Manager — store all sensitive values +################################################################################ + +resource "aws_secretsmanager_secret" "database_url" { + name = "/${var.project}/${var.environment}/database-url" + 
description = "PostgreSQL DATABASE_URL for AgentIdP" + recovery_window_in_days = 7 + + tags = { + environment = var.environment + project = var.project + } +} + +resource "aws_secretsmanager_secret_version" "database_url" { + secret_id = aws_secretsmanager_secret.database_url.id + # Build the DATABASE_URL using the RDS endpoint output. + # The password is passed in as var.db_password so it never appears in plaintext + # in any .tf file — only in this encrypted secret version. + secret_string = "postgresql://${var.project}:${var.db_password}@${module.rds.endpoint}:${module.rds.port}/${module.rds.db_name}?sslmode=require" + + depends_on = [module.rds] +} + +resource "aws_secretsmanager_secret" "redis_url" { + name = "/${var.project}/${var.environment}/redis-url" + description = "Redis REDIS_URL for AgentIdP" + recovery_window_in_days = 7 + + tags = { + environment = var.environment + project = var.project + } +} + +resource "aws_secretsmanager_secret_version" "redis_url" { + secret_id = aws_secretsmanager_secret.redis_url.id + # ElastiCache Redis with TLS uses the rediss:// scheme and requires an AUTH token. 
+  # The AUTH token is percent-encoded so that characters ElastiCache permits in
+  # tokens but that are reserved in URIs (e.g. "#", "&", "<", ">", "^") cannot
+  # truncate or corrupt the URL. urlencode() emits "+" for a space, so that is
+  # rewritten to "%20" for the userinfo component.
+  secret_string = "rediss://:${replace(urlencode(var.redis_auth_token), "+", "%20")}@${module.redis.primary_endpoint}:${module.redis.port}"
+
+  depends_on = [module.redis]
+}
+
+# RSA private key used by the application to sign JWTs.
+resource "aws_secretsmanager_secret" "jwt_private_key" {
+  name                    = "/${var.project}/${var.environment}/jwt-private-key"
+  description             = "RSA-2048 private key for signing AgentIdP JWTs"
+  recovery_window_in_days = 7
+
+  tags = {
+    environment = var.environment
+    project     = var.project
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "jwt_private_key" {
+  secret_id     = aws_secretsmanager_secret.jwt_private_key.id
+  secret_string = var.jwt_private_key
+}
+
+# RSA public key used by the application to verify JWTs.
+resource "aws_secretsmanager_secret" "jwt_public_key" {
+  name                    = "/${var.project}/${var.environment}/jwt-public-key"
+  description             = "RSA-2048 public key for verifying AgentIdP JWTs"
+  recovery_window_in_days = 7
+
+  tags = {
+    environment = var.environment
+    project     = var.project
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "jwt_public_key" {
+  secret_id     = aws_secretsmanager_secret.jwt_public_key.id
+  secret_string = var.jwt_public_key
+}
+
+# Vault token secret — only created when a non-empty var.vault_token is supplied.
+resource "aws_secretsmanager_secret" "vault_token" {
+  count = var.vault_token != "" ? 1 : 0
+
+  name                    = "/${var.project}/${var.environment}/vault-token"
+  description             = "HashiCorp Vault token for AgentIdP"
+  recovery_window_in_days = 7
+
+  tags = {
+    environment = var.environment
+    project     = var.project
+  }
+}
+
+resource "aws_secretsmanager_secret_version" "vault_token" {
+  count = var.vault_token != "" ?
1 : 0 + + secret_id = aws_secretsmanager_secret.vault_token[0].id + secret_string = var.vault_token +} + +################################################################################ +# Module: Load Balancer +################################################################################ + +module "lb" { + source = "../../modules/lb" + + environment = var.environment + project = var.project + vpc_id = aws_vpc.main.id + subnet_ids = aws_subnet.public[*].id + certificate_arn = var.certificate_arn + + target_group_port = 3000 + enable_deletion_protection = true + access_logs_bucket = var.alb_access_logs_bucket +} + +################################################################################ +# Module: RDS PostgreSQL +################################################################################ + +module "rds" { + source = "../../modules/rds" + + environment = var.environment + project = var.project + vpc_id = aws_vpc.main.id + subnet_ids = aws_subnet.private[*].id + + # The app SG is created by the agentidp module; we wire it after both modules + # are instantiated using a separate security group rule (see below). 
+ allowed_security_group_ids = [] + + db_name = "sentryagent_idp" + db_username = var.project + db_password = var.db_password + + instance_class = var.rds_instance_class + allocated_storage = 50 + max_allocated_storage = 500 + multi_az = true + backup_retention_days = var.rds_backup_retention_days + deletion_protection = var.rds_deletion_protection + skip_final_snapshot = var.rds_skip_final_snapshot + monitoring_role_arn = aws_iam_role.rds_monitoring.arn + monitoring_interval = 60 + performance_insights_enabled = true +} + +################################################################################ +# Module: Redis +################################################################################ + +module "redis" { + source = "../../modules/redis" + + environment = var.environment + project = var.project + vpc_id = aws_vpc.main.id + subnet_ids = aws_subnet.private[*].id + + # Same pattern as RDS — app SG wired after agentidp module creates it. + allowed_security_group_ids = [] + + node_type = var.redis_node_type + num_cache_clusters = 2 + automatic_failover_enabled = true + multi_az_enabled = true + at_rest_encryption_enabled = true + transit_encryption_enabled = true + auth_token = var.redis_auth_token + snapshot_retention_limit = 7 +} + +################################################################################ +# Module: AgentIdP (ECS Fargate) +################################################################################ + +module "agentidp" { + source = "../../modules/agentidp" + + provider_type = "aws" + environment = var.environment + project = var.project + app_image = "sentryagent/agentidp:${var.app_image_tag}" + app_port = 3000 + + aws_region = var.region + aws_vpc_id = aws_vpc.main.id + aws_subnet_ids = aws_subnet.private[*].id + aws_target_group_arn = module.lb.target_group_arn + aws_execution_role_arn = aws_iam_role.ecs_task_execution.arn + aws_task_role_arn = aws_iam_role.ecs_task.arn + aws_log_group_name = 
"/ecs/${var.project}-${var.environment}" + aws_desired_count = var.ecs_desired_count + aws_cpu = 512 + aws_memory = 1024 + aws_cors_origin = var.cors_origin + aws_policy_dir = "/app/policies" + aws_vault_addr = var.vault_addr + aws_vault_mount = var.vault_mount + + aws_secret_database_url_arn = aws_secretsmanager_secret.database_url.arn + aws_secret_redis_url_arn = aws_secretsmanager_secret.redis_url.arn + aws_secret_jwt_private_key_arn = aws_secretsmanager_secret.jwt_private_key.arn + aws_secret_jwt_public_key_arn = aws_secretsmanager_secret.jwt_public_key.arn + aws_secret_vault_token_arn = var.vault_token != "" ? aws_secretsmanager_secret.vault_token[0].arn : "" + + depends_on = [ + aws_secretsmanager_secret_version.database_url, + aws_secretsmanager_secret_version.redis_url, + aws_secretsmanager_secret_version.jwt_private_key, + aws_secretsmanager_secret_version.jwt_public_key, + ] +} + +################################################################################ +# Cross-module security group wiring +# +# The app SG (from agentidp module) must be allowed into RDS and Redis. +# These rules are created after both modules are fully instantiated to avoid +# circular references in the module dependency graph. 
+################################################################################ + +resource "aws_security_group_rule" "rds_from_app" { + type = "ingress" + description = "PostgreSQL from ECS app tasks" + from_port = 5432 + to_port = 5432 + protocol = "tcp" + source_security_group_id = module.agentidp.aws_app_security_group_id + security_group_id = module.rds.security_group_id +} + +resource "aws_security_group_rule" "redis_from_app" { + type = "ingress" + description = "Redis from ECS app tasks" + from_port = 6379 + to_port = 6379 + protocol = "tcp" + source_security_group_id = module.agentidp.aws_app_security_group_id + security_group_id = module.redis.security_group_id +} + +# Allow the ALB to reach ECS tasks on the app port +resource "aws_security_group_rule" "app_from_alb" { + type = "ingress" + description = "App port from ALB" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + source_security_group_id = module.lb.alb_security_group_id + security_group_id = module.agentidp.aws_app_security_group_id +} + +################################################################################ +# Route 53 — alias record pointing the domain to the ALB +################################################################################ + +data "aws_route53_zone" "main" { + name = join(".", slice(split(".", var.domain_name), 1, length(split(".", var.domain_name)))) + private_zone = false +} + +resource "aws_route53_record" "app" { + zone_id = data.aws_route53_zone.main.zone_id + name = var.domain_name + type = "A" + + alias { + name = module.lb.alb_dns_name + zone_id = module.lb.alb_zone_id + evaluate_target_health = true + } +} diff --git a/terraform/environments/aws/outputs.tf b/terraform/environments/aws/outputs.tf new file mode 100644 index 0000000..c81744b --- /dev/null +++ b/terraform/environments/aws/outputs.tf @@ -0,0 +1,84 @@ +################################################################################ +# Environment: aws +# Outputs 
+################################################################################ + +output "alb_dns_name" { + description = "DNS name of the Application Load Balancer." + value = module.lb.alb_dns_name +} + +output "service_url" { + description = "Public HTTPS URL of the AgentIdP service." + value = "https://${var.domain_name}" +} + +output "ecs_cluster_arn" { + description = "ARN of the ECS cluster." + value = module.agentidp.aws_ecs_cluster_arn +} + +output "ecs_service_name" { + description = "Name of the ECS Fargate service." + value = module.agentidp.aws_ecs_service_name +} + +output "ecs_task_definition_arn" { + description = "Active ECS task definition ARN." + value = module.agentidp.aws_ecs_task_definition_arn +} + +output "rds_endpoint" { + description = "RDS PostgreSQL endpoint hostname." + value = module.rds.endpoint +} + +output "rds_port" { + description = "RDS PostgreSQL port." + value = module.rds.port +} + +output "rds_instance_id" { + description = "RDS instance identifier." + value = module.rds.instance_id +} + +output "redis_primary_endpoint" { + description = "ElastiCache Redis primary endpoint hostname." + value = module.redis.primary_endpoint +} + +output "redis_reader_endpoint" { + description = "ElastiCache Redis reader endpoint." + value = module.redis.reader_endpoint +} + +output "vpc_id" { + description = "ID of the VPC created for this deployment." + value = aws_vpc.main.id +} + +output "private_subnet_ids" { + description = "IDs of the private subnets (ECS, RDS, Redis)." + value = aws_subnet.private[*].id +} + +output "public_subnet_ids" { + description = "IDs of the public subnets (ALB)." + value = aws_subnet.public[*].id +} + +output "cloudwatch_log_group" { + description = "CloudWatch log group for ECS container logs." + value = module.agentidp.aws_cloudwatch_log_group_name +} + +output "secrets_manager_database_url_arn" { + description = "ARN of the Secrets Manager secret holding DATABASE_URL." 
+ value = aws_secretsmanager_secret.database_url.arn +} + +output "secrets_manager_redis_url_arn" { + description = "ARN of the Secrets Manager secret holding REDIS_URL." + value = aws_secretsmanager_secret.redis_url.arn +} diff --git a/terraform/environments/aws/terraform.tfvars.example b/terraform/environments/aws/terraform.tfvars.example new file mode 100644 index 0000000..5ffac27 --- /dev/null +++ b/terraform/environments/aws/terraform.tfvars.example @@ -0,0 +1,76 @@ +# ───────────────────────────────────────────────────────────────────────────── +# terraform/environments/aws/terraform.tfvars.example +# +# Copy this file to terraform.tfvars and fill in real values. +# NEVER commit terraform.tfvars to version control — it contains secrets. +# +# All sensitive variables (db_password, jwt_*, vault_token) must be provided +# via this file or as TF_VAR_* environment variables in your CI/CD pipeline. +# ───────────────────────────────────────────────────────────────────────────── + +# ── Region & environment ────────────────────────────────────────────────────── + +region = "us-east-1" +environment = "production" +project = "sentryagent-agentidp" + +# ── Application image ───────────────────────────────────────────────────────── +# Docker image tag to deploy. Update this to roll out a new version. + +app_image_tag = "1.0.0" + +# ── DNS & TLS ───────────────────────────────────────────────────────────────── +# The ACM certificate must already exist in the same region as the ALB. 
+# Create it with: aws acm request-certificate --domain-name idp.example.com --validation-method DNS + +domain_name = "idp.example.com" +certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + +# ── Networking ──────────────────────────────────────────────────────────────── + +vpc_cidr = "10.0.0.0/16" +availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"] +public_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] +private_subnet_cidrs = ["10.0.11.0/24", "10.0.12.0/24", "10.0.13.0/24"] + +# ── Secrets — REPLACE ALL VALUES BELOW ─────────────────────────────────────── +# Use strong, randomly generated values. Do NOT use these placeholders in production. + +# Master password for RDS PostgreSQL (min 8 chars, no special chars that break URLs) +db_password = "REPLACE_WITH_STRONG_RANDOM_PASSWORD" + +# AUTH token for ElastiCache Redis (min 16 chars) +redis_auth_token = "REPLACE_WITH_STRONG_RANDOM_TOKEN_AT_LEAST_16_CHARS" + +# RSA-2048 key pair for JWT signing/verification. 
+# Generate with: +# openssl genrsa -out private.pem 2048 +# openssl rsa -in private.pem -pubout -out public.pem +jwt_private_key = "-----BEGIN RSA PRIVATE KEY-----\nREPLACE_WITH_ACTUAL_PRIVATE_KEY_CONTENTS\n-----END RSA PRIVATE KEY-----" +jwt_public_key = "-----BEGIN PUBLIC KEY-----\nREPLACE_WITH_ACTUAL_PUBLIC_KEY_CONTENTS\n-----END PUBLIC KEY-----" + +# HashiCorp Vault (optional — leave empty strings to disable Vault integration) +vault_addr = "" +vault_token = "" +vault_mount = "secret" + +# ── Application configuration ───────────────────────────────────────────────── + +cors_origin = "*" +ecs_desired_count = 2 + +# ── Infrastructure sizing ───────────────────────────────────────────────────── + +rds_instance_class = "db.t3.medium" +redis_node_type = "cache.t3.medium" + +# ── ALB access logs (optional) ──────────────────────────────────────────────── +# Create the S3 bucket and enable ALB log delivery permissions before setting this. + +alb_access_logs_bucket = "" + +# ── RDS settings ────────────────────────────────────────────────────────────── + +rds_backup_retention_days = 7 +rds_deletion_protection = true +rds_skip_final_snapshot = false diff --git a/terraform/environments/aws/variables.tf b/terraform/environments/aws/variables.tf new file mode 100644 index 0000000..6c77c52 --- /dev/null +++ b/terraform/environments/aws/variables.tf @@ -0,0 +1,164 @@ +################################################################################ +# Environment: aws +# Variables +################################################################################ + +variable "region" { + description = "AWS region for all resources." + type = string + default = "us-east-1" +} + +variable "environment" { + description = "Deployment environment (e.g. production, staging)." + type = string + default = "production" +} + +variable "project" { + description = "Project identifier — used in all resource names and tags." 
+ type = string + default = "sentryagent-agentidp" +} + +variable "app_image_tag" { + description = "Docker image tag to deploy (e.g. '1.2.3' or a full SHA)." + type = string +} + +variable "domain_name" { + description = "Primary domain name for the AgentIdP service (e.g. idp.sentryagent.ai)." + type = string +} + +variable "certificate_arn" { + description = "ARN of the ACM certificate for the domain_name. Must be in the same region as the ALB." + type = string +} + +################################################################################ +# Networking +################################################################################ + +variable "vpc_cidr" { + description = "CIDR block for the VPC." + type = string + default = "10.0.0.0/16" +} + +variable "availability_zones" { + description = "List of Availability Zones to use. Must contain at least 2 for Multi-AZ resources." + type = list(string) + default = ["us-east-1a", "us-east-1b", "us-east-1c"] +} + +variable "public_subnet_cidrs" { + description = "CIDR blocks for public subnets (ALB). One per AZ." + type = list(string) + default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] +} + +variable "private_subnet_cidrs" { + description = "CIDR blocks for private subnets (ECS, RDS, Redis). One per AZ." + type = list(string) + default = ["10.0.11.0/24", "10.0.12.0/24", "10.0.13.0/24"] +} + +################################################################################ +# Secrets — all marked sensitive; provide via tfvars or environment variables +################################################################################ + +variable "db_password" { + description = "Master password for the RDS PostgreSQL instance. Stored in AWS Secrets Manager." + type = string + sensitive = true +} + +variable "redis_auth_token" { + description = "AUTH token for ElastiCache Redis (minimum 16 characters). Stored in AWS Secrets Manager." 
+ type = string + sensitive = true +} + +variable "jwt_private_key" { + description = "PEM-encoded RSA-2048 private key for signing JWTs. Stored in AWS Secrets Manager." + type = string + sensitive = true +} + +variable "jwt_public_key" { + description = "PEM-encoded RSA-2048 public key for verifying JWTs. Stored in AWS Secrets Manager." + type = string + sensitive = true +} + +variable "vault_token" { + description = "HashiCorp Vault token. Leave empty to disable Vault integration." + type = string + sensitive = true + default = "" +} + +################################################################################ +# Optional configuration +################################################################################ + +variable "vault_addr" { + description = "HashiCorp Vault server address. Leave empty to disable Vault integration." + type = string + default = "" +} + +variable "vault_mount" { + description = "HashiCorp Vault KV v2 mount path." + type = string + default = "secret" +} + +variable "cors_origin" { + description = "CORS_ORIGIN value for the app (use * for public APIs or a specific origin)." + type = string + default = "*" +} + +variable "ecs_desired_count" { + description = "Number of ECS Fargate tasks to run." + type = number + default = 2 +} + +variable "rds_instance_class" { + description = "RDS instance class." + type = string + default = "db.t3.medium" +} + +variable "redis_node_type" { + description = "ElastiCache node type." + type = string + default = "cache.t3.medium" +} + +variable "alb_access_logs_bucket" { + description = "S3 bucket for ALB access logs. Leave empty to disable." + type = string + default = "" +} + +variable "rds_backup_retention_days" { + description = "Number of days to retain RDS automated backups." + type = number + default = 7 +} + +variable "rds_deletion_protection" { + description = "Enable RDS deletion protection." 
+ type = bool + default = true +} + +variable "rds_skip_final_snapshot" { + description = "Skip final RDS snapshot on destroy. Keep false in production." + type = bool + default = false +} diff --git a/terraform/environments/gcp/main.tf b/terraform/environments/gcp/main.tf new file mode 100644 index 0000000..fd782be --- /dev/null +++ b/terraform/environments/gcp/main.tf @@ -0,0 +1,477 @@ +################################################################################ +# Environment: gcp +# Main — SentryAgent.ai AgentIdP on Google Cloud Platform +# +# Architecture: +# Internet → Cloud Run (Google-managed TLS, auto-scaling) → +# Cloud SQL PostgreSQL 14 (private IP, REGIONAL HA) + +# Memorystore Redis 7 (STANDARD_HA, in-transit encryption) +# via Serverless VPC Access connector +# +# All secrets stored in GCP Secret Manager — Cloud Run reads them at startup. +# No sensitive values in state (except where Terraform internals require it). +################################################################################ + +terraform { + required_version = ">= 1.6.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.20.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = ">= 5.20.0" + } + random = { + source = "hashicorp/random" + version = ">= 3.6.0" + } + } + + # Remote state — configure your backend here. 
+ # Example using GCS: + # + # backend "gcs" { + # bucket = "sentryagent-terraform-state" + # prefix = "agentidp/gcp/production" + # } +} + +provider "google" { + project = var.project_id + region = var.region +} + +provider "google-beta" { + project = var.project_id + region = var.region +} + +################################################################################ +# Enable required GCP APIs +################################################################################ + +resource "google_project_service" "apis" { + for_each = toset([ + "run.googleapis.com", + "sqladmin.googleapis.com", + "redis.googleapis.com", + "vpcaccess.googleapis.com", + "secretmanager.googleapis.com", + "servicenetworking.googleapis.com", + "cloudresourcemanager.googleapis.com", + "iam.googleapis.com", + ]) + + project = var.project_id + service = each.value + disable_on_destroy = false +} + +################################################################################ +# Locals +################################################################################ + +locals { + name_prefix = "${var.project}-${var.environment}" + + common_labels = { + environment = var.environment + project = replace(var.project, "-", "_") + managed_by = "terraform" + } +} + +################################################################################ +# VPC Network +################################################################################ + +resource "google_compute_network" "main" { + name = "${local.name_prefix}-vpc" + auto_create_subnetworks = false + project = var.project_id + + depends_on = [google_project_service.apis] +} + +resource "google_compute_subnetwork" "private" { + name = "${local.name_prefix}-private-subnet" + ip_cidr_range = var.vpc_cidr + region = var.region + network = google_compute_network.main.id + project = var.project_id + + private_ip_google_access = true + + log_config { + aggregation_interval = "INTERVAL_10_MIN" + flow_sampling = 0.5 + metadata = 
"INCLUDE_ALL_METADATA" + } +} + +################################################################################ +# Private Services Access — required for Cloud SQL private IP +################################################################################ + +resource "google_compute_global_address" "private_services" { + name = "${local.name_prefix}-private-services-range" + purpose = "VPC_PEERING" + address_type = "INTERNAL" + prefix_length = 20 + network = google_compute_network.main.id + project = var.project_id +} + +resource "google_service_networking_connection" "private_services" { + network = google_compute_network.main.id + service = "servicenetworking.googleapis.com" + reserved_peering_ranges = [google_compute_global_address.private_services.name] + + depends_on = [google_project_service.apis] +} + +################################################################################ +# Serverless VPC Access Connector +# Cloud Run uses this to reach Cloud SQL (private IP) and Memorystore. 
+################################################################################
+
+# GCP enforces short identifiers for two of the resources below:
+#   - service account IDs must be 6-30 characters;
+#   - Serverless VPC Access connector names are limited to 25 characters by the
+#     provider (console guidance is stricter still: fewer than 21 characters,
+#     with each hyphen counted twice).
+# With the example values (project = "sentryagent-agentidp",
+# environment = "production") the natural names are 38 and 41 characters long
+# and `terraform apply` fails. Names that already fit are used unchanged; longer
+# ones fall back to a truncated prefix plus a short hash of the full name, so
+# they remain deterministic and distinct per project/environment.
+locals {
+  cloud_run_sa_id_full = "${var.project}-${var.environment}-run-sa"
+  cloud_run_sa_id = (
+    length(local.cloud_run_sa_id_full) <= 30
+    ? local.cloud_run_sa_id_full
+    : "${substr(local.cloud_run_sa_id_full, 0, 23)}-${substr(sha1(local.cloud_run_sa_id_full), 0, 6)}"
+  )
+
+  vpc_connector_name_full = "${local.name_prefix}-connector"
+  # The fallback is hyphen-free and 19 characters long, so it satisfies both
+  # the 25-character limit and the stricter console rule.
+  vpc_connector_name = (
+    length(local.vpc_connector_name_full) <= 25
+    ? local.vpc_connector_name_full
+    : "${substr(replace(local.vpc_connector_name_full, "-", ""), 0, 12)}${substr(sha1(local.vpc_connector_name_full), 0, 7)}"
+  )
+}
+
+resource "google_vpc_access_connector" "main" {
+  name          = local.vpc_connector_name
+  region        = var.region
+  project       = var.project_id
+  ip_cidr_range = var.vpc_connector_cidr
+  network       = google_compute_network.main.name
+  min_instances = 2
+  max_instances = 10
+  machine_type  = "e2-micro"
+
+  depends_on = [google_project_service.apis]
+}
+
+################################################################################
+# Service Account for Cloud Run
+################################################################################
+
+resource "google_service_account" "cloud_run" {
+  account_id   = local.cloud_run_sa_id
+  display_name = "AgentIdP Cloud Run Service Account (${var.environment})"
+  project      = var.project_id
+}
+
+################################################################################
+# Secret Manager — create secrets and grant the SA access
+################################################################################
+
+resource "google_secret_manager_secret" "database_url" {
+  secret_id = "${local.name_prefix}-database-url"
+  project   = var.project_id
+
+  replication {
+    auto {}
+  }
+
+  labels = local.common_labels
+
+  depends_on = [google_project_service.apis]
+}
+
+resource "google_secret_manager_secret_version" "database_url" {
+  secret = google_secret_manager_secret.database_url.id
+  # Build the DATABASE_URL from Cloud SQL private IP output.
+  # The password is percent-encoded so reserved URI characters in it cannot
+  # corrupt the URL. urlencode() emits "+" for a space, which is only valid in
+  # query strings, so it is rewritten to "%20" for the userinfo component.
+  secret_data = "postgresql://${var.db_username}:${replace(urlencode(var.db_password), "+", "%20")}@${google_sql_database_instance.main.private_ip_address}:5432/${var.db_name}?sslmode=require"
+
+  depends_on = [google_sql_database_instance.main]
+}
+
+resource "google_secret_manager_secret" "redis_url" {
+  secret_id = "${local.name_prefix}-redis-url"
+  project   = var.project_id
+
+  replication {
+    auto {}
+  }
+
+  labels = local.common_labels
+
+  depends_on = [google_project_service.apis]
+}
+
+resource "google_secret_manager_secret_version" "redis_url" {
+  secret = google_secret_manager_secret.redis_url.id
+  # Memorystore Redis with in-transit encryption uses the rediss:// scheme.
+  # The instance is created with auth_enabled = true, so the URL must carry the
+  # generated AUTH string as its password component; without it every client
+  # command is rejected as unauthenticated.
+  secret_data = "rediss://:${replace(urlencode(google_redis_instance.main.auth_string), "+", "%20")}@${google_redis_instance.main.host}:${google_redis_instance.main.port}"
+
+  depends_on = [google_redis_instance.main]
+}
+
+# RSA private key used by the application to sign JWTs.
+resource "google_secret_manager_secret" "jwt_private_key" {
+  secret_id = "${local.name_prefix}-jwt-private-key"
+  project   = var.project_id
+
+  replication {
+    auto {}
+  }
+
+  labels = local.common_labels
+
+  depends_on = [google_project_service.apis]
+}
+
+resource "google_secret_manager_secret_version" "jwt_private_key" {
+  secret      = google_secret_manager_secret.jwt_private_key.id
+  secret_data = var.jwt_private_key
+}
+
+# RSA public key used by the application to verify JWTs.
+resource "google_secret_manager_secret" "jwt_public_key" {
+  secret_id = "${local.name_prefix}-jwt-public-key"
+  project   = var.project_id
+
+  replication {
+    auto {}
+  }
+
+  labels = local.common_labels
+
+  depends_on = [google_project_service.apis]
+}
+
+resource "google_secret_manager_secret_version" "jwt_public_key" {
+  secret      = google_secret_manager_secret.jwt_public_key.id
+  secret_data = var.jwt_public_key
+}
+
+resource "google_secret_manager_secret" "vault_token" {
+  count = var.vault_token != "" ?
1 : 0 + + secret_id = "${local.name_prefix}-vault-token" + project = var.project_id + + replication { + auto {} + } + + labels = local.common_labels + + depends_on = [google_project_service.apis] +} + +resource "google_secret_manager_secret_version" "vault_token" { + count = var.vault_token != "" ? 1 : 0 + + secret = google_secret_manager_secret.vault_token[0].id + secret_data = var.vault_token +} + +# Grant the Cloud Run SA access to each secret +resource "google_secret_manager_secret_iam_member" "run_database_url" { + project = var.project_id + secret_id = google_secret_manager_secret.database_url.secret_id + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.cloud_run.email}" +} + +resource "google_secret_manager_secret_iam_member" "run_redis_url" { + project = var.project_id + secret_id = google_secret_manager_secret.redis_url.secret_id + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.cloud_run.email}" +} + +resource "google_secret_manager_secret_iam_member" "run_jwt_private_key" { + project = var.project_id + secret_id = google_secret_manager_secret.jwt_private_key.secret_id + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.cloud_run.email}" +} + +resource "google_secret_manager_secret_iam_member" "run_jwt_public_key" { + project = var.project_id + secret_id = google_secret_manager_secret.jwt_public_key.secret_id + role = "roles/secretmanager.secretAccessor" + member = "serviceAccount:${google_service_account.cloud_run.email}" +} + +resource "google_secret_manager_secret_iam_member" "run_vault_token" { + count = var.vault_token != "" ? 
1 : 0
+
+  project   = var.project_id
+  secret_id = google_secret_manager_secret.vault_token[0].secret_id
+  role      = "roles/secretmanager.secretAccessor"
+  member    = "serviceAccount:${google_service_account.cloud_run.email}"
+}
+
+################################################################################
+# Cloud SQL — PostgreSQL 14, private IP, REGIONAL HA
+################################################################################
+
+resource "google_sql_database_instance" "main" {
+  name             = "${local.name_prefix}-pg14"
+  database_version = "POSTGRES_14"
+  region           = var.region
+  project          = var.project_id
+
+  deletion_protection = var.deletion_protection
+
+  settings {
+    tier              = var.db_tier
+    availability_type = var.db_availability_type
+    disk_type         = "PD_SSD"
+    disk_size         = 50
+    disk_autoresize   = true
+
+    ip_configuration {
+      ipv4_enabled    = false # No public IP
+      private_network = google_compute_network.main.id
+      # Enforce TLS on every connection without demanding client certificates.
+      # This replaces the legacy `require_ssl = true`, which (a) is deprecated
+      # and removed in google provider 6.x — permitted by this configuration's
+      # open-ended `>= 5.20.0` constraint — and (b) additionally requires a
+      # valid client certificate, which the `sslmode=require` DATABASE_URL
+      # built for the app does not supply.
+      ssl_mode = "ENCRYPTED_ONLY"
+    }
+
+    # Daily automated backups plus point-in-time recovery.
+    backup_configuration {
+      enabled                        = true
+      start_time                     = "03:00"
+      point_in_time_recovery_enabled = true
+      transaction_log_retention_days = 7
+      backup_retention_settings {
+        retained_backups = 7
+        retention_unit   = "COUNT"
+      }
+    }
+
+    maintenance_window {
+      day          = 7 # Sunday
+      hour         = 5
+      update_track = "stable"
+    }
+
+    insights_config {
+      query_insights_enabled  = true
+      query_string_length     = 1024
+      record_application_tags = true
+      record_client_address   = false
+    }
+
+    database_flags {
+      name  = "log_connections"
+      value = "on"
+    }
+
+    database_flags {
+      name  = "log_disconnections"
+      value = "on"
+    }
+
+    # Log any statement that runs for 1000 ms or longer.
+    database_flags {
+      name  = "log_min_duration_statement"
+      value = "1000"
+    }
+
+    user_labels = local.common_labels
+  }
+
+  # Private IP requires the service networking peering to exist first.
+  depends_on = [google_service_networking_connection.private_services]
+}
+
+resource "google_sql_database" "main" {
+  name     = var.db_name
+  instance = google_sql_database_instance.main.name
+  project  = var.project_id
+}
+
+resource "google_sql_user" "app" {
+  name     = var.db_username
+  instance =
google_sql_database_instance.main.name
+  password = var.db_password
+  project  = var.project_id
+}
+
+################################################################################
+# Memorystore Redis 7 — STANDARD_HA (primary + replica), TLS enabled
+################################################################################
+
+resource "google_redis_instance" "main" {
+  name           = "${local.name_prefix}-redis"
+  tier           = var.memorystore_tier
+  memory_size_gb = var.memorystore_memory_size_gb
+  region         = var.region
+  project        = var.project_id
+
+  redis_version = var.memorystore_redis_version
+
+  # Private connectivity via the VPC
+  authorized_network = google_compute_network.main.id
+  connect_mode       = "PRIVATE_SERVICE_ACCESS"
+
+  # TLS in transit
+  transit_encryption_mode = "SERVER_AUTHENTICATION"
+
+  # Redis AUTH is enabled on top of VPC-level network isolation. Clients must
+  # authenticate with the instance's generated auth string, which the provider
+  # exposes as the `auth_string` attribute of this resource.
+  auth_enabled = true
+
+  redis_configs = {
+    lazyfree-lazy-eviction = "yes"
+    lazyfree-lazy-expire   = "yes"
+  }
+
+  # Weekly maintenance window: Sundays starting at 06:00.
+  maintenance_policy {
+    weekly_maintenance_window {
+      day = "SUNDAY"
+      start_time {
+        hours   = 6
+        minutes = 0
+        seconds = 0
+        nanos   = 0
+      }
+    }
+  }
+
+  labels = local.common_labels
+
+  depends_on = [google_service_networking_connection.private_services]
+}
+
+################################################################################
+# Module: AgentIdP (Cloud Run)
+################################################################################
+
+module "agentidp" {
+  source = "../../modules/agentidp"
+
+  provider_type = "gcp"
+  environment   = var.environment
+  project       = var.project
+  app_image     = "sentryagent/agentidp:${var.app_image_tag}"
+  app_port      = 3000
+
+  gcp_project_id            = var.project_id
+  gcp_region                = var.region
+  gcp_service_account_email = google_service_account.cloud_run.email
+  gcp_vpc_connector_name    = google_vpc_access_connector.main.id
+  gcp_min_instances         =
var.cloud_run_min_instances + gcp_max_instances = var.cloud_run_max_instances + gcp_cpu = var.cloud_run_cpu + gcp_memory = var.cloud_run_memory + gcp_cors_origin = var.cors_origin + gcp_policy_dir = "/app/policies" + gcp_vault_addr = var.vault_addr + gcp_vault_mount = var.vault_mount + + gcp_secret_database_url_id = google_secret_manager_secret.database_url.secret_id + gcp_secret_redis_url_id = google_secret_manager_secret.redis_url.secret_id + gcp_secret_jwt_private_key_id = google_secret_manager_secret.jwt_private_key.secret_id + gcp_secret_jwt_public_key_id = google_secret_manager_secret.jwt_public_key.secret_id + gcp_secret_vault_token_id = var.vault_token != "" ? google_secret_manager_secret.vault_token[0].secret_id : "" + + depends_on = [ + google_secret_manager_secret_version.database_url, + google_secret_manager_secret_version.redis_url, + google_secret_manager_secret_version.jwt_private_key, + google_secret_manager_secret_version.jwt_public_key, + google_secret_manager_secret_iam_member.run_database_url, + google_secret_manager_secret_iam_member.run_redis_url, + google_secret_manager_secret_iam_member.run_jwt_private_key, + google_secret_manager_secret_iam_member.run_jwt_public_key, + ] +} diff --git a/terraform/environments/gcp/outputs.tf b/terraform/environments/gcp/outputs.tf new file mode 100644 index 0000000..ebf5a33 --- /dev/null +++ b/terraform/environments/gcp/outputs.tf @@ -0,0 +1,64 @@ +################################################################################ +# Environment: gcp +# Outputs +################################################################################ + +output "service_url" { + description = "Public HTTPS URL of the AgentIdP Cloud Run service (Google-managed TLS)." + value = module.agentidp.gcp_cloud_run_service_url +} + +output "cloud_run_service_name" { + description = "Name of the Cloud Run service." 
+ value = module.agentidp.gcp_cloud_run_service_name +} + +output "cloud_run_service_id" { + description = "Full resource ID of the Cloud Run service." + value = module.agentidp.gcp_cloud_run_service_id +} + +output "cloud_sql_instance_name" { + description = "Cloud SQL instance name." + value = google_sql_database_instance.main.name +} + +output "cloud_sql_private_ip" { + description = "Private IP address of the Cloud SQL instance." + value = google_sql_database_instance.main.private_ip_address +} + +output "cloud_sql_connection_name" { + description = "Cloud SQL instance connection name (project:region:name) for Cloud SQL Proxy." + value = google_sql_database_instance.main.connection_name +} + +output "memorystore_host" { + description = "IP address of the Memorystore Redis primary endpoint." + value = google_redis_instance.main.host +} + +output "memorystore_port" { + description = "Port of the Memorystore Redis instance." + value = google_redis_instance.main.port +} + +output "memorystore_id" { + description = "Fully-qualified resource ID of the Memorystore Redis instance." + value = google_redis_instance.main.id +} + +output "vpc_network_name" { + description = "Name of the VPC network created for this deployment." + value = google_compute_network.main.name +} + +output "vpc_connector_name" { + description = "Serverless VPC Access connector name used by Cloud Run." + value = google_vpc_access_connector.main.name +} + +output "cloud_run_service_account_email" { + description = "Email of the service account attached to the Cloud Run service." 
+ value = google_service_account.cloud_run.email +} diff --git a/terraform/environments/gcp/terraform.tfvars.example b/terraform/environments/gcp/terraform.tfvars.example new file mode 100644 index 0000000..b4f21ff --- /dev/null +++ b/terraform/environments/gcp/terraform.tfvars.example @@ -0,0 +1,70 @@ +# ───────────────────────────────────────────────────────────────────────────── +# terraform/environments/gcp/terraform.tfvars.example +# +# Copy this file to terraform.tfvars and fill in real values. +# NEVER commit terraform.tfvars to version control — it contains secrets. +# +# All sensitive variables (db_password, jwt_*, vault_token) must be provided +# via this file or as TF_VAR_* environment variables in your CI/CD pipeline. +# ───────────────────────────────────────────────────────────────────────────── + +# ── GCP project & region ────────────────────────────────────────────────────── + +project_id = "your-gcp-project-id" +region = "us-central1" +environment = "production" +project = "sentryagent-agentidp" + +# ── Application image ───────────────────────────────────────────────────────── + +app_image_tag = "1.0.0" + +# ── Networking ──────────────────────────────────────────────────────────────── + +vpc_cidr = "10.1.0.0/24" +vpc_connector_cidr = "10.8.0.0/28" + +# ── Database ────────────────────────────────────────────────────────────────── + +db_tier = "db-g1-small" +db_name = "sentryagent_idp" +db_username = "sentryagent" +db_availability_type = "REGIONAL" + +# ── Secrets — REPLACE ALL VALUES BELOW ─────────────────────────────────────── + +# Password for Cloud SQL PostgreSQL user +db_password = "REPLACE_WITH_STRONG_RANDOM_PASSWORD" + +# RSA-2048 key pair for JWT signing/verification. 
+# Generate with: +# openssl genrsa -out private.pem 2048 +# openssl rsa -in private.pem -pubout -out public.pem +jwt_private_key = "-----BEGIN RSA PRIVATE KEY-----\nREPLACE_WITH_ACTUAL_PRIVATE_KEY_CONTENTS\n-----END RSA PRIVATE KEY-----" +jwt_public_key = "-----BEGIN PUBLIC KEY-----\nREPLACE_WITH_ACTUAL_PUBLIC_KEY_CONTENTS\n-----END PUBLIC KEY-----" + +# HashiCorp Vault (optional — leave empty strings to disable Vault integration) +vault_addr = "" +vault_token = "" +vault_mount = "secret" + +# ── Application configuration ───────────────────────────────────────────────── + +cors_origin = "*" + +# ── Cloud Run scaling ───────────────────────────────────────────────────────── + +cloud_run_min_instances = 1 +cloud_run_max_instances = 10 +cloud_run_cpu = "1" +cloud_run_memory = "512Mi" + +# ── Memorystore Redis ───────────────────────────────────────────────────────── + +memorystore_memory_size_gb = 1 +memorystore_redis_version = "REDIS_7_0" +memorystore_tier = "STANDARD_HA" + +# ── Protection ──────────────────────────────────────────────────────────────── + +deletion_protection = true diff --git a/terraform/environments/gcp/variables.tf b/terraform/environments/gcp/variables.tf new file mode 100644 index 0000000..682852d --- /dev/null +++ b/terraform/environments/gcp/variables.tf @@ -0,0 +1,175 @@ +################################################################################ +# Environment: gcp +# Variables +################################################################################ + +variable "project_id" { + description = "GCP project ID where all resources will be created." + type = string +} + +variable "region" { + description = "GCP region for all resources." + type = string + default = "us-central1" +} + +variable "environment" { + description = "Deployment environment (e.g. production, staging)." + type = string + default = "production" +} + +variable "project" { + description = "Project identifier — used in resource names and labels." 
+ type = string + default = "sentryagent-agentidp" +} + +variable "app_image_tag" { + description = "Docker image tag to deploy (e.g. '1.2.3')." + type = string +} + +################################################################################ +# Networking +################################################################################ + +variable "vpc_cidr" { + description = "CIDR range for the VPC subnet used by Cloud Run and Cloud SQL." + type = string + default = "10.1.0.0/24" +} + +variable "vpc_connector_cidr" { + description = "CIDR range for the Serverless VPC Access connector (/28 required)." + type = string + default = "10.8.0.0/28" +} + +################################################################################ +# Database +################################################################################ + +variable "db_tier" { + description = "Cloud SQL instance tier (machine type)." + type = string + default = "db-g1-small" +} + +variable "db_name" { + description = "Name of the PostgreSQL database to create." + type = string + default = "sentryagent_idp" +} + +variable "db_username" { + description = "PostgreSQL user for the application." + type = string + default = "sentryagent" +} + +variable "db_availability_type" { + description = "Cloud SQL availability type: REGIONAL (HA) or ZONAL." + type = string + default = "REGIONAL" +} + +################################################################################ +# Secrets — all marked sensitive; provide via tfvars or environment variables +################################################################################ + +variable "db_password" { + description = "Password for the Cloud SQL PostgreSQL user. Stored in Secret Manager." + type = string + sensitive = true +} + +variable "jwt_private_key" { + description = "PEM-encoded RSA-2048 private key for signing JWTs. Stored in Secret Manager." 
+ type = string + sensitive = true +} + +variable "jwt_public_key" { + description = "PEM-encoded RSA-2048 public key for verifying JWTs. Stored in Secret Manager." + type = string + sensitive = true +} + +variable "vault_token" { + description = "HashiCorp Vault token. Leave empty to disable Vault integration." + type = string + sensitive = true + default = "" +} + +################################################################################ +# Optional configuration +################################################################################ + +variable "vault_addr" { + description = "HashiCorp Vault server address. Leave empty to disable Vault integration." + type = string + default = "" +} + +variable "vault_mount" { + description = "HashiCorp Vault KV v2 mount path." + type = string + default = "secret" +} + +variable "cors_origin" { + description = "CORS_ORIGIN value for the app." + type = string + default = "*" +} + +variable "cloud_run_min_instances" { + description = "Minimum Cloud Run instances (set > 0 to prevent cold starts)." + type = number + default = 1 +} + +variable "cloud_run_max_instances" { + description = "Maximum Cloud Run instances." + type = number + default = 10 +} + +variable "cloud_run_cpu" { + description = "CPU limit per Cloud Run instance." + type = string + default = "1" +} + +variable "cloud_run_memory" { + description = "Memory limit per Cloud Run instance." + type = string + default = "512Mi" +} + +variable "memorystore_memory_size_gb" { + description = "Memory size in GiB for the Memorystore Redis instance." + type = number + default = 1 +} + +variable "memorystore_redis_version" { + description = "Redis version for Memorystore." + type = string + default = "REDIS_7_0" +} + +variable "memorystore_tier" { + description = "Memorystore service tier: BASIC (single node) or STANDARD_HA (primary + replica)." 
+ type = string + default = "STANDARD_HA" +} + +variable "deletion_protection" { + description = "Enable deletion protection on Cloud SQL and Memorystore resources." + type = bool + default = true +} diff --git a/terraform/modules/agentidp/main.tf b/terraform/modules/agentidp/main.tf new file mode 100644 index 0000000..0bb406b --- /dev/null +++ b/terraform/modules/agentidp/main.tf @@ -0,0 +1,426 @@ +################################################################################ +# Module: agentidp +# Main — ECS Fargate (AWS) or Cloud Run (GCP) +# +# Deploys the sentryagent/agentidp container. +# All sensitive environment variables are injected from AWS Secrets Manager +# (AWS path) or GCP Secret Manager (GCP path) — no plaintext secrets here. +################################################################################ + +terraform { + required_version = ">= 1.6.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.40.0" + } + google = { + source = "hashicorp/google" + version = ">= 5.20.0" + } + } +} + +################################################################################ +# Locals +################################################################################ + +locals { + common_tags = { + environment = var.environment + project = var.project + managed_by = "terraform" + } + + # Build the list of Vault-related env vars conditionally. + # If vault_addr is empty we omit all Vault env vars entirely. + aws_vault_env_plain = var.aws_vault_addr != "" ? [ + { + name = "VAULT_ADDR" + value = var.aws_vault_addr + }, + { + name = "VAULT_MOUNT" + value = var.aws_vault_mount + } + ] : [] + + aws_vault_secret_env = var.aws_secret_vault_token_arn != "" ? [ + { + name = "VAULT_TOKEN" + valueFrom = var.aws_secret_vault_token_arn + } + ] : [] + + gcp_vault_env_plain = var.gcp_vault_addr != "" ? 
{ + VAULT_ADDR = var.gcp_vault_addr + VAULT_MOUNT = var.gcp_vault_mount + } : {} +} + +################################################################################ +# ── AWS PATH ────────────────────────────────────────────────────────────────── +################################################################################ + +# Security group: allow inbound traffic only from the ALB on app_port, +# allow all outbound (needed for Secrets Manager and ECR API calls over HTTPS). +# The description is kept ASCII-only: EC2 rejects security group descriptions +# that contain non-ASCII characters such as an em dash. +resource "aws_security_group" "app" { + count = var.provider_type == "aws" ? 1 : 0 + + name = "${var.project}-${var.environment}-app-sg" + description = "Security group for AgentIdP ECS tasks - inbound from ALB only" + vpc_id = var.aws_vpc_id + + # No inline ingress block on purpose. The "ALB -> app_port" rule is expected to + # be attached by the root environment module as a standalone rule resource that + # references the ALB security group (this avoids a circular dependency between + # the two groups). An inline ingress block with no sources grants nothing and + # would conflict with standalone ingress rules on this group. + + egress { + description = "All outbound" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.common_tags, { + Name = "${var.project}-${var.environment}-app-sg" + }) +} + +# ECS Cluster +resource "aws_ecs_cluster" "main" { + count = var.provider_type == "aws" ? 1 : 0 + + name = "${var.project}-${var.environment}" + + setting { + name = "containerInsights" + value = "enabled" + } + + tags = local.common_tags +} + +# ECS Cluster Capacity Providers — use FARGATE and FARGATE_SPOT +resource "aws_ecs_cluster_capacity_providers" "main" { + count = var.provider_type == "aws" ?
1 : 0 + + cluster_name = aws_ecs_cluster.main[0].name + capacity_providers = ["FARGATE", "FARGATE_SPOT"] + + default_capacity_provider_strategy { + capacity_provider = "FARGATE" + weight = 1 + base = 1 + } +} + +# CloudWatch Log Group +resource "aws_cloudwatch_log_group" "app" { + count = var.provider_type == "aws" ? 1 : 0 + + name = var.aws_log_group_name + retention_in_days = 30 + + tags = local.common_tags +} + +# ECS Task Definition +resource "aws_ecs_task_definition" "app" { + count = var.provider_type == "aws" ? 1 : 0 + + family = "${var.project}-${var.environment}" + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = tostring(var.aws_cpu) + memory = tostring(var.aws_memory) + execution_role_arn = var.aws_execution_role_arn + task_role_arn = var.aws_task_role_arn + + container_definitions = jsonencode([ + { + name = "agentidp" + image = var.app_image + essential = true + + portMappings = [ + { + containerPort = var.app_port + protocol = "tcp" + } + ] + + # Plain (non-sensitive) environment variables + environment = concat( + [ + { name = "PORT", value = tostring(var.app_port) }, + { name = "NODE_ENV", value = "production" }, + { name = "CORS_ORIGIN", value = var.aws_cors_origin }, + { name = "POLICY_DIR", value = var.aws_policy_dir } + ], + local.aws_vault_env_plain + ) + + # Sensitive values fetched from Secrets Manager at task launch. + # Each entry is injected as the named environment variable. 
+ secrets = concat( + [ + { + name = "DATABASE_URL" + valueFrom = var.aws_secret_database_url_arn + }, + { + name = "REDIS_URL" + valueFrom = var.aws_secret_redis_url_arn + }, + { + name = "JWT_PRIVATE_KEY" + valueFrom = var.aws_secret_jwt_private_key_arn + }, + { + name = "JWT_PUBLIC_KEY" + valueFrom = var.aws_secret_jwt_public_key_arn + } + ], + local.aws_vault_secret_env + ) + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = var.aws_log_group_name + "awslogs-region" = var.aws_region + "awslogs-stream-prefix" = "agentidp" + } + } + + healthCheck = { + command = ["CMD-SHELL", "wget -qO- http://localhost:${var.app_port}/health || exit 1"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } + + readonlyRootFilesystem = false + user = "node" + } + ]) + + tags = local.common_tags +} + +# ECS Service +resource "aws_ecs_service" "app" { + count = var.provider_type == "aws" ? 1 : 0 + + name = "${var.project}-${var.environment}" + cluster = aws_ecs_cluster.main[0].id + task_definition = aws_ecs_task_definition.app[0].arn + desired_count = var.aws_desired_count + launch_type = "FARGATE" + + # Rolling update: keep at least 100% healthy tasks during deployment + deployment_minimum_healthy_percent = 100 + deployment_maximum_percent = 200 + + network_configuration { + subnets = var.aws_subnet_ids + security_groups = [aws_security_group.app[0].id] + assign_public_ip = false + } + + load_balancer { + target_group_arn = var.aws_target_group_arn + container_name = "agentidp" + container_port = var.app_port + } + + # Ignore task_definition changes driven by image tag updates — deployments + # are managed externally (CI/CD pipeline updates the image tag). 
+ lifecycle { + ignore_changes = [task_definition, desired_count] + } + + tags = local.common_tags + + depends_on = [aws_ecs_cluster_capacity_providers.main] +} + +################################################################################ +# ── GCP PATH ────────────────────────────────────────────────────────────────── +################################################################################ + +# Cloud Run Service +resource "google_cloud_run_v2_service" "app" { + count = var.provider_type == "gcp" ? 1 : 0 + + name = "${var.project}-${var.environment}" + location = var.gcp_region + project = var.gcp_project_id + + # Ingress: accept requests from all sources, including the public internet. + # Cloud Run provides Google-managed TLS on the default *.run.app domain + # and on any custom domains mapped via Cloud Run domain mappings. + ingress = "INGRESS_TRAFFIC_ALL" + + template { + service_account = var.gcp_service_account_email + + scaling { + min_instance_count = var.gcp_min_instances + max_instance_count = var.gcp_max_instances + } + + # VPC access — route outbound traffic through the VPC connector so the + # container can reach Cloud SQL (private IP) and Memorystore.
+ vpc_access { + connector = var.gcp_vpc_connector_name + egress = "PRIVATE_RANGES_ONLY" + } + + containers { + image = var.app_image + + ports { + container_port = var.app_port + } + + resources { + limits = { + cpu = var.gcp_cpu + memory = var.gcp_memory + } + cpu_idle = false + startup_cpu_boost = true + } + + # Plain environment variables + dynamic "env" { + for_each = merge( + { + PORT = tostring(var.app_port) + NODE_ENV = "production" + CORS_ORIGIN = var.gcp_cors_origin + POLICY_DIR = var.gcp_policy_dir + }, + local.gcp_vault_env_plain + ) + content { + name = env.key + value = env.value + } + } + + # DATABASE_URL from Secret Manager + env { + name = "DATABASE_URL" + value_source { + secret_key_ref { + secret = var.gcp_secret_database_url_id + version = "latest" + } + } + } + + # REDIS_URL from Secret Manager + env { + name = "REDIS_URL" + value_source { + secret_key_ref { + secret = var.gcp_secret_redis_url_id + version = "latest" + } + } + } + + # JWT_PRIVATE_KEY from Secret Manager + env { + name = "JWT_PRIVATE_KEY" + value_source { + secret_key_ref { + secret = var.gcp_secret_jwt_private_key_id + version = "latest" + } + } + } + + # JWT_PUBLIC_KEY from Secret Manager + env { + name = "JWT_PUBLIC_KEY" + value_source { + secret_key_ref { + secret = var.gcp_secret_jwt_public_key_id + version = "latest" + } + } + } + + # VAULT_TOKEN from Secret Manager (conditional) + dynamic "env" { + for_each = var.gcp_secret_vault_token_id != "" ? 
[1] : [] + content { + name = "VAULT_TOKEN" + value_source { + secret_key_ref { + secret = var.gcp_secret_vault_token_id + version = "latest" + } + } + } + } + + liveness_probe { + http_get { + path = "/health" + port = var.app_port + } + initial_delay_seconds = 30 + period_seconds = 15 + failure_threshold = 3 + timeout_seconds = 5 + } + + startup_probe { + http_get { + path = "/health" + port = var.app_port + } + initial_delay_seconds = 10 + period_seconds = 5 + failure_threshold = 12 + timeout_seconds = 3 + } + } + } + + labels = { + environment = var.environment + project = replace(var.project, "-", "_") + managed_by = "terraform" + } +} + +# Allow unauthenticated (public internet) invocations of the Cloud Run service. +# Authentication for AgentIdP clients is handled by the application layer +# (JWT Bearer tokens), not by Cloud Run's built-in IAM auth. +resource "google_cloud_run_v2_service_iam_member" "public_invoker" { + count = var.provider_type == "gcp" ? 1 : 0 + + project = var.gcp_project_id + location = var.gcp_region + name = google_cloud_run_v2_service.app[0].name + role = "roles/run.invoker" + member = "allUsers" +} diff --git a/terraform/modules/agentidp/outputs.tf b/terraform/modules/agentidp/outputs.tf new file mode 100644 index 0000000..0a59e19 --- /dev/null +++ b/terraform/modules/agentidp/outputs.tf @@ -0,0 +1,55 @@ +################################################################################ +# Module: agentidp +# Outputs +################################################################################ + +# ── AWS Outputs ────────────────────────────────────────────────────────────── + +output "aws_ecs_cluster_arn" { + description = "ARN of the ECS cluster hosting the AgentIdP service." + value = var.provider_type == "aws" ? aws_ecs_cluster.main[0].arn : null +} + +output "aws_ecs_service_name" { + description = "Name of the ECS Fargate service." + value = var.provider_type == "aws" ? 
aws_ecs_service.app[0].name : null +} + +output "aws_ecs_task_definition_arn" { + description = "ARN of the active ECS task definition revision." + value = var.provider_type == "aws" ? aws_ecs_task_definition.app[0].arn : null +} + +output "aws_app_security_group_id" { + description = "Security group ID attached to the ECS tasks. Use this to add ingress rules from the ALB." + value = var.provider_type == "aws" ? aws_security_group.app[0].id : null +} + +output "aws_cloudwatch_log_group_name" { + description = "CloudWatch log group name for ECS container logs." + value = var.provider_type == "aws" ? aws_cloudwatch_log_group.app[0].name : null +} + +# ── GCP Outputs ────────────────────────────────────────────────────────────── + +output "gcp_cloud_run_service_name" { + description = "Name of the Cloud Run service." + value = var.provider_type == "gcp" ? google_cloud_run_v2_service.app[0].name : null +} + +output "gcp_cloud_run_service_url" { + description = "Publicly accessible HTTPS URL of the Cloud Run service (Google-managed TLS)." + value = var.provider_type == "gcp" ? google_cloud_run_v2_service.app[0].uri : null +} + +output "gcp_cloud_run_service_id" { + description = "Full resource ID of the Cloud Run service." + value = var.provider_type == "gcp" ? google_cloud_run_v2_service.app[0].id : null +} + +# ── Unified Outputs ─────────────────────────────────────────────────────────── + +output "service_url" { + description = "Publicly accessible service URL. Populated for GCP (Cloud Run native URL). For AWS use the ALB DNS name from the lb module." + value = var.provider_type == "gcp" ? 
google_cloud_run_v2_service.app[0].uri : null +} diff --git a/terraform/modules/agentidp/variables.tf b/terraform/modules/agentidp/variables.tf new file mode 100644 index 0000000..265fb4e --- /dev/null +++ b/terraform/modules/agentidp/variables.tf @@ -0,0 +1,279 @@ +################################################################################ +# Module: agentidp +# Variables +# +# Accepts all configuration for deploying the AgentIdP container to either +# AWS ECS Fargate (provider_type = "aws") or GCP Cloud Run (provider_type = "gcp"). +################################################################################ + +variable "provider_type" { + description = "Cloud provider target: 'aws' or 'gcp'." + type = string + + validation { + condition = contains(["aws", "gcp"], var.provider_type) + error_message = "provider_type must be either 'aws' or 'gcp'." + } +} + +variable "environment" { + description = "Deployment environment label (e.g. production, staging)." + type = string +} + +variable "project" { + description = "Project identifier used in resource tags and names." + type = string + default = "sentryagent-agentidp" +} + +variable "app_image" { + description = "Fully-qualified container image reference including registry host and tag." + type = string + # Example: "sentryagent/agentidp:1.2.3" +} + +variable "app_port" { + description = "Port the AgentIdP container listens on. Must match the PORT env var." + type = number + default = 3000 +} + +################################################################################ +# AWS-specific variables (required when provider_type = "aws") +################################################################################ + +variable "aws_region" { + description = "(AWS) AWS region where ECS resources are deployed." + type = string + default = "" +} + +variable "aws_vpc_id" { + description = "(AWS) VPC ID in which to create the ECS service and security group."
+ type = string + default = "" +} + +variable "aws_subnet_ids" { + description = "(AWS) List of private subnet IDs for the ECS Fargate tasks." + type = list(string) + default = [] +} + +variable "aws_target_group_arn" { + description = "(AWS) ARN of the ALB target group to register ECS tasks with." + type = string + default = "" +} + +variable "aws_execution_role_arn" { + description = "(AWS) IAM role ARN that ECS uses to pull images and write logs (ECS task execution role)." + type = string + default = "" +} + +variable "aws_task_role_arn" { + description = "(AWS) IAM role ARN granted to the running ECS task (allows it to call Secrets Manager, etc.)." + type = string + default = "" +} + +variable "aws_log_group_name" { + description = "(AWS) CloudWatch log group name where container logs are sent." + type = string + default = "/ecs/sentryagent-agentidp" +} + +variable "aws_desired_count" { + description = "(AWS) Number of ECS Fargate task instances to run." + type = number + default = 2 +} + +variable "aws_cpu" { + description = "(AWS) ECS task CPU units (256 = 0.25 vCPU)." + type = number + default = 512 +} + +variable "aws_memory" { + description = "(AWS) ECS task memory in MiB." + type = number + default = 1024 +} + +# Secret ARNs — the ECS task fetches these from Secrets Manager at launch time. +# The task execution role must have secretsmanager:GetSecretValue on each ARN. + +variable "aws_secret_database_url_arn" { + description = "(AWS) ARN of the Secrets Manager secret holding DATABASE_URL." + type = string + default = "" + sensitive = true +} + +variable "aws_secret_redis_url_arn" { + description = "(AWS) ARN of the Secrets Manager secret holding REDIS_URL." + type = string + default = "" + sensitive = true +} + +variable "aws_secret_jwt_private_key_arn" { + description = "(AWS) ARN of the Secrets Manager secret holding JWT_PRIVATE_KEY." 
+ type = string + default = "" + sensitive = true +} + +variable "aws_secret_jwt_public_key_arn" { + description = "(AWS) ARN of the Secrets Manager secret holding JWT_PUBLIC_KEY." + type = string + default = "" + sensitive = true +} + +variable "aws_secret_vault_token_arn" { + description = "(AWS) ARN of the Secrets Manager secret holding VAULT_TOKEN. Leave empty to omit Vault integration." + type = string + default = "" + sensitive = true +} + +variable "aws_vault_addr" { + description = "(AWS) HashiCorp Vault address injected as a plain env var (not a secret). Leave empty to disable." + type = string + default = "" +} + +variable "aws_vault_mount" { + description = "(AWS) HashiCorp Vault KV v2 mount path." + type = string + default = "secret" +} + +variable "aws_cors_origin" { + description = "(AWS) Value for CORS_ORIGIN env var." + type = string + default = "*" +} + +variable "aws_policy_dir" { + description = "(AWS) Path inside the container where OPA policy files are located." + type = string + default = "/app/policies" +} + +################################################################################ +# GCP-specific variables (required when provider_type = "gcp") +################################################################################ + +variable "gcp_project_id" { + description = "(GCP) GCP project ID where Cloud Run and supporting resources live." + type = string + default = "" +} + +variable "gcp_region" { + description = "(GCP) GCP region for Cloud Run deployment." + type = string + default = "" +} + +variable "gcp_service_account_email" { + description = "(GCP) Service account email attached to the Cloud Run service." + type = string + default = "" +} + +variable "gcp_vpc_connector_name" { + description = "(GCP) Serverless VPC Access connector name for reaching Cloud SQL and Memorystore." 
+ type = string + default = "" +} + +variable "gcp_min_instances" { + description = "(GCP) Minimum number of Cloud Run instances (set > 0 to avoid cold starts)." + type = number + default = 1 +} + +variable "gcp_max_instances" { + description = "(GCP) Maximum number of Cloud Run instances." + type = number + default = 10 +} + +variable "gcp_cpu" { + description = "(GCP) CPU limit for each Cloud Run container instance (e.g. '1', '2')." + type = string + default = "1" +} + +variable "gcp_memory" { + description = "(GCP) Memory limit for each Cloud Run container instance (e.g. '512Mi', '1Gi')." + type = string + default = "512Mi" +} + +# Secret Manager secret IDs — the Cloud Run service fetches these at startup. + +variable "gcp_secret_database_url_id" { + description = "(GCP) Secret Manager secret ID for DATABASE_URL." + type = string + default = "" + sensitive = true +} + +variable "gcp_secret_redis_url_id" { + description = "(GCP) Secret Manager secret ID for REDIS_URL." + type = string + default = "" + sensitive = true +} + +variable "gcp_secret_jwt_private_key_id" { + description = "(GCP) Secret Manager secret ID for JWT_PRIVATE_KEY." + type = string + default = "" + sensitive = true +} + +variable "gcp_secret_jwt_public_key_id" { + description = "(GCP) Secret Manager secret ID for JWT_PUBLIC_KEY." + type = string + default = "" + sensitive = true +} + +variable "gcp_secret_vault_token_id" { + description = "(GCP) Secret Manager secret ID for VAULT_TOKEN. Leave empty to omit Vault integration." + type = string + default = "" + sensitive = true +} + +variable "gcp_vault_addr" { + description = "(GCP) HashiCorp Vault address injected as a plain env var. Leave empty to disable." + type = string + default = "" +} + +variable "gcp_vault_mount" { + description = "(GCP) HashiCorp Vault KV v2 mount path." + type = string + default = "secret" +} + +variable "gcp_cors_origin" { + description = "(GCP) Value for CORS_ORIGIN env var." 
+ type = string + default = "*" +} + +variable "gcp_policy_dir" { + description = "(GCP) Path inside the Cloud Run container where OPA policy files are located." + type = string + default = "/app/policies" +} diff --git a/terraform/modules/lb/main.tf b/terraform/modules/lb/main.tf new file mode 100644 index 0000000..a0d4893 --- /dev/null +++ b/terraform/modules/lb/main.tf @@ -0,0 +1,183 @@ +################################################################################ +# Module: lb +# Main — AWS Application Load Balancer +# +# - Internet-facing ALB in public subnets +# - HTTPS listener (443) with ACM certificate, TLS 1.2+ enforced +# - HTTP listener (80) redirects permanently to HTTPS — no plaintext traffic +# - Target group pointing to ECS Fargate tasks on the app port +# - Access logs optionally streamed to S3 +################################################################################ + +terraform { + required_version = ">= 1.6.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.40.0" + } + } +} + +locals { + identifier = "${var.project}-${var.environment}" + + common_tags = { + environment = var.environment + project = var.project + managed_by = "terraform" + } +} + +################################################################################ +# Security Group — ALB allows inbound 80 + 443 from the internet +################################################################################ + +resource "aws_security_group" "alb" { + name = "${local.identifier}-alb-sg" + # EC2 security group descriptions must be ASCII-only, so use a plain hyphen. + description = "ALB security group - inbound 80/443 from internet, outbound to app" + vpc_id = var.vpc_id + + ingress { + description = "HTTP from internet (redirected to HTTPS)" + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = var.allowed_ingress_cidrs + } + + ingress { + description = "HTTPS from internet" + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = var.allowed_ingress_cidrs + } + + egress { + description =
"Forward to ECS app tasks" + from_port = var.target_group_port + to_port = var.target_group_port + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.common_tags, { + Name = "${local.identifier}-alb-sg" + }) +} + +################################################################################ +# Application Load Balancer +################################################################################ + +resource "aws_lb" "main" { + name = "${local.identifier}-alb" + internal = false + load_balancer_type = "application" + security_groups = [aws_security_group.alb.id] + subnets = var.subnet_ids + + idle_timeout = var.idle_timeout + enable_deletion_protection = var.enable_deletion_protection + + # HTTP/2 is enabled by default on ALB; leave it on for performance. + enable_http2 = true + + # Drop invalid header fields to harden against request smuggling. + drop_invalid_header_fields = true + + dynamic "access_logs" { + for_each = var.access_logs_bucket != "" ? [1] : [] + content { + bucket = var.access_logs_bucket + prefix = var.access_logs_prefix + enabled = true + } + } + + tags = merge(local.common_tags, { + Name = "${local.identifier}-alb" + }) +} + +################################################################################ +# Target Group — ECS Fargate tasks register here +################################################################################ + +resource "aws_lb_target_group" "app" { + name = "${local.identifier}-tg" + port = var.target_group_port + protocol = "HTTP" + vpc_id = var.vpc_id + target_type = "ip" # Required for Fargate (awsvpc network mode) + + deregistration_delay = 30 + + health_check { + enabled = true + path = var.target_group_health_check_path + port = "traffic-port" + protocol = "HTTP" + interval = var.target_group_health_check_interval + timeout = var.target_group_health_check_timeout + healthy_threshold = var.target_group_healthy_threshold + unhealthy_threshold = var.target_group_unhealthy_threshold + 
matcher = "200" + } + + stickiness { + type = "lb_cookie" + enabled = false # AgentIdP is stateless (JWT-based); no sticky sessions needed + } + + tags = merge(local.common_tags, { + Name = "${local.identifier}-tg" + }) + + lifecycle { + create_before_destroy = true + } +} + +################################################################################ +# HTTPS Listener (port 443) — primary listener +################################################################################ + +resource "aws_lb_listener" "https" { + load_balancer_arn = aws_lb.main.arn + port = 443 + protocol = "HTTPS" + ssl_policy = var.ssl_policy + certificate_arn = var.certificate_arn + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.app.arn + } + + tags = local.common_tags +} + +################################################################################ +# HTTP Listener (port 80) — permanent redirect to HTTPS +################################################################################ + +resource "aws_lb_listener" "http_redirect" { + load_balancer_arn = aws_lb.main.arn + port = 80 + protocol = "HTTP" + + default_action { + type = "redirect" + + redirect { + port = "443" + protocol = "HTTPS" + status_code = "HTTP_301" + } + } + + tags = local.common_tags +} diff --git a/terraform/modules/lb/outputs.tf b/terraform/modules/lb/outputs.tf new file mode 100644 index 0000000..1119225 --- /dev/null +++ b/terraform/modules/lb/outputs.tf @@ -0,0 +1,49 @@ +################################################################################ +# Module: lb +# Outputs +################################################################################ + +output "alb_dns_name" { + description = "DNS name of the Application Load Balancer. Create a CNAME or alias record in Route 53 pointing your domain here." + value = aws_lb.main.dns_name +} + +output "alb_zone_id" { + description = "Hosted zone ID of the ALB. Use with aws_route53_record alias records." 
+ value = aws_lb.main.zone_id +} + +output "alb_arn" { + description = "ARN of the Application Load Balancer." + value = aws_lb.main.arn +} + +output "alb_arn_suffix" { + description = "ARN suffix of the ALB for use in CloudWatch metrics." + value = aws_lb.main.arn_suffix +} + +output "target_group_arn" { + description = "ARN of the target group. Pass to the agentidp module as aws_target_group_arn." + value = aws_lb_target_group.app.arn +} + +output "target_group_arn_suffix" { + description = "ARN suffix of the target group for use in CloudWatch metrics." + value = aws_lb_target_group.app.arn_suffix +} + +output "https_listener_arn" { + description = "ARN of the HTTPS listener." + value = aws_lb_listener.https.arn +} + +output "http_redirect_listener_arn" { + description = "ARN of the HTTP→HTTPS redirect listener." + value = aws_lb_listener.http_redirect.arn +} + +output "alb_security_group_id" { + description = "Security group ID of the ALB. Add this as an allowed source in the app task security group." + value = aws_security_group.alb.id +} diff --git a/terraform/modules/lb/variables.tf b/terraform/modules/lb/variables.tf new file mode 100644 index 0000000..ec2556a --- /dev/null +++ b/terraform/modules/lb/variables.tf @@ -0,0 +1,102 @@ +################################################################################ +# Module: lb +# Variables — AWS Application Load Balancer +################################################################################ + +variable "environment" { + description = "Deployment environment label (e.g. production, staging)." + type = string +} + +variable "project" { + description = "Project identifier used in resource names and tags." + type = string + default = "sentryagent-agentidp" +} + +variable "vpc_id" { + description = "VPC ID in which to create the ALB and its security group." + type = string +} + +variable "subnet_ids" { + description = "List of public subnet IDs for the ALB. Must span at least 2 AZs." 
+ type = list(string) +} + +variable "certificate_arn" { + description = "ARN of the ACM certificate to attach to the HTTPS listener (port 443)." + type = string +} + +variable "target_group_port" { + description = "Port that ECS task containers listen on. Target group forwards traffic to this port." + type = number + default = 3000 # main.tf also uses this value as the only egress port on the ALB security group +} + +variable "target_group_health_check_path" { + description = "HTTP path used by the ALB target group health check." + type = string + default = "/health" +} + +variable "target_group_health_check_interval" { + description = "Interval in seconds between ALB health checks." + type = number + default = 30 +} + +variable "target_group_health_check_timeout" { + description = "Timeout in seconds for each ALB health check request." + type = number + default = 5 +} + +variable "target_group_healthy_threshold" { + description = "Number of consecutive successful health checks before marking a target healthy." + type = number + default = 2 +} + +variable "target_group_unhealthy_threshold" { + description = "Number of consecutive failed health checks before marking a target unhealthy." + type = number + default = 3 +} + +variable "idle_timeout" { + description = "ALB idle connection timeout in seconds." + type = number + default = 60 +} + +variable "enable_deletion_protection" { + description = "Prevent the ALB from being deleted via the AWS API." + type = bool + default = true +} + +variable "access_logs_bucket" { + description = "S3 bucket name for ALB access logs. Leave empty to disable access logging." + type = string + default = "" # an empty string makes main.tf omit the dynamic access_logs block entirely +} + +variable "access_logs_prefix" { + description = "S3 key prefix for ALB access log files." + type = string + default = "alb" +} + +variable "ssl_policy" { + description = "SSL negotiation policy for the HTTPS listener. ELBSecurityPolicy-TLS13-1-2-2021-06 enforces TLS 1.2+ and TLS 1.3." 
+ type = string + default = "ELBSecurityPolicy-TLS13-1-2-2021-06" +} + +variable "allowed_ingress_cidrs" { + description = "CIDR blocks allowed to reach the ALB on port 80 and 443. Default allows public internet." + type = list(string) + default = ["0.0.0.0/0"] +} diff --git a/terraform/modules/rds/main.tf b/terraform/modules/rds/main.tf new file mode 100644 index 0000000..3abbfad --- /dev/null +++ b/terraform/modules/rds/main.tf @@ -0,0 +1,180 @@ +################################################################################ +# Module: rds +# Main — AWS RDS PostgreSQL 14 +# +# - Multi-AZ for HA +# - Encryption at rest (AWS-managed KMS key) +# - No public access — VPC-internal only +# - Storage autoscaling up to max_allocated_storage +# - Enhanced monitoring and Performance Insights enabled by default +# - Access restricted to explicitly allowed security groups (app only) +################################################################################ + +terraform { + required_version = ">= 1.6.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.40.0" + } + } +} + +locals { + identifier = "${var.project}-${var.environment}" + + common_tags = { + environment = var.environment + project = var.project + managed_by = "terraform" + } +} + +################################################################################ +# Security Group — only the app SGs may connect on 5432 +################################################################################ + +resource "aws_security_group" "rds" { + name = "${local.identifier}-rds-sg" + description = "Controls inbound access to RDS PostgreSQL - allow only app SG on 5432" # ASCII hyphen on purpose: EC2 rejects non-ASCII characters in security group descriptions + vpc_id = var.vpc_id + + # No ingress rules defined here — added dynamically below to avoid circular deps. 
+ egress { + description = "All outbound (RDS initiates no outbound connections; this satisfies AWS requirement)" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.common_tags, { + Name = "${local.identifier}-rds-sg" + }) +} + +resource "aws_security_group_rule" "rds_ingress_from_app" { + for_each = toset(var.allowed_security_group_ids) # one ingress rule per allowed source security group + + type = "ingress" + description = "PostgreSQL from app security group" + from_port = 5432 + to_port = 5432 + protocol = "tcp" + source_security_group_id = each.value + security_group_id = aws_security_group.rds.id +} + +################################################################################ +# DB Subnet Group — must cover at least 2 AZs for Multi-AZ +################################################################################ + +resource "aws_db_subnet_group" "main" { + name = "${local.identifier}-db-subnet-group" + description = "Private subnets for AgentIdP RDS instance" + subnet_ids = var.subnet_ids + + tags = merge(local.common_tags, { + Name = "${local.identifier}-db-subnet-group" + }) +} + +################################################################################ +# DB Parameter Group — enforce SSL connections +################################################################################ + +resource "aws_db_parameter_group" "main" { + name = "${local.identifier}-pg14-params" + family = var.parameter_group_family + description = "AgentIdP custom parameter group — enforces SSL" + + parameter { + name = "rds.force_ssl" + value = "1" + apply_method = "immediate" + } + + parameter { + name = "log_connections" + value = "1" + apply_method = "immediate" + } + + parameter { + name = "log_disconnections" + value = "1" + apply_method = "immediate" + } + + parameter { + name = "log_min_duration_statement" + value = "1000" + apply_method = "immediate" + } + + tags = local.common_tags +} + 
+################################################################################ +# RDS Instance +################################################################################ + +resource "aws_db_instance" "main" { + identifier = local.identifier + + # Engine + engine = "postgres" + engine_version = "14" + instance_class = var.instance_class + + # Storage + storage_type = "gp3" + allocated_storage = var.allocated_storage + max_allocated_storage = var.max_allocated_storage + storage_encrypted = true + # kms_key_id is omitted — defaults to the AWS-managed RDS KMS key. + # For customer-managed key, set kms_key_id to your CMK ARN. + + # Database + db_name = var.db_name + username = var.db_username + password = var.db_password + + # Network — VPC-internal only, no public endpoint + db_subnet_group_name = aws_db_subnet_group.main.name + vpc_security_group_ids = [aws_security_group.rds.id] + publicly_accessible = false + multi_az = var.multi_az + port = 5432 + + # Parameter group + parameter_group_name = aws_db_parameter_group.main.name + + # Backups + backup_retention_period = var.backup_retention_days + backup_window = var.backup_window + delete_automated_backups = false + copy_tags_to_snapshot = true + skip_final_snapshot = var.skip_final_snapshot + final_snapshot_identifier = var.skip_final_snapshot ? null : "${local.identifier}-final-snapshot" + + # Maintenance + maintenance_window = var.maintenance_window + auto_minor_version_upgrade = true + apply_immediately = false + + # Observability — Enhanced Monitoring is only switched on when a role ARN is supplied; RDS rejects a non-zero interval without a role, which made the module defaults fail at apply + enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] + performance_insights_enabled = var.performance_insights_enabled + performance_insights_retention_period = var.performance_insights_enabled ? var.performance_insights_retention_period : null + monitoring_interval = var.monitoring_role_arn != "" ? var.monitoring_interval : 0 + monitoring_role_arn = var.monitoring_interval > 0 && var.monitoring_role_arn != "" ? 
var.monitoring_role_arn : null + + # Protection + deletion_protection = var.deletion_protection + + tags = merge(local.common_tags, { + Name = local.identifier + }) +} diff --git a/terraform/modules/rds/outputs.tf b/terraform/modules/rds/outputs.tf new file mode 100644 index 0000000..444bdee --- /dev/null +++ b/terraform/modules/rds/outputs.tf @@ -0,0 +1,44 @@ +################################################################################ +# Module: rds +# Outputs +################################################################################ + +output "endpoint" { + description = "RDS instance endpoint hostname (without port). Use to construct DATABASE_URL." + value = aws_db_instance.main.address +} + +output "port" { + description = "Port the RDS instance listens on (always 5432)." + value = aws_db_instance.main.port +} + +output "db_name" { + description = "Name of the database created on the RDS instance." + value = aws_db_instance.main.db_name +} + +output "db_username" { + description = "Master username for the RDS instance." + value = aws_db_instance.main.username +} + +output "instance_id" { + description = "RDS instance identifier." + value = aws_db_instance.main.identifier +} + +output "instance_arn" { + description = "ARN of the RDS instance." + value = aws_db_instance.main.arn +} + +output "security_group_id" { + description = "Security group ID attached to the RDS instance. Use to add further ingress rules if needed." + value = aws_security_group.rds.id +} + +output "db_subnet_group_name" { + description = "Name of the DB subnet group." 
+ value = aws_db_subnet_group.main.name +} diff --git a/terraform/modules/rds/variables.tf b/terraform/modules/rds/variables.tf new file mode 100644 index 0000000..fcb51ef --- /dev/null +++ b/terraform/modules/rds/variables.tf @@ -0,0 +1,133 @@ +################################################################################ +# Module: rds +# Variables — AWS RDS PostgreSQL 14 +################################################################################ + +variable "environment" { + description = "Deployment environment label (e.g. production, staging)." + type = string +} + +variable "project" { + description = "Project identifier used in resource names and tags." + type = string + default = "sentryagent-agentidp" +} + +variable "vpc_id" { + description = "VPC ID in which to create the RDS subnet group and security group." + type = string +} + +variable "subnet_ids" { + description = "List of private subnet IDs for the RDS DB subnet group. Must span at least 2 AZs for Multi-AZ." + type = list(string) +} + +variable "allowed_security_group_ids" { + description = "List of security group IDs (e.g. ECS app SG) permitted to connect to RDS on port 5432." + type = list(string) + default = [] +} + +variable "db_name" { + description = "Name of the initial PostgreSQL database to create." + type = string + default = "sentryagent_idp" +} + +variable "db_username" { + description = "Master username for the RDS instance." + type = string + default = "sentryagent" +} + +variable "db_password" { + description = "Master password for the RDS instance. Store this in Secrets Manager; do not hardcode." + type = string + sensitive = true +} + +variable "instance_class" { + description = "RDS instance class." + type = string + default = "db.t3.medium" +} + +variable "allocated_storage" { + description = "Initial storage allocated in GiB." + type = number + default = 50 +} + +variable "max_allocated_storage" { + description = "Upper bound for RDS storage autoscaling in GiB. 
Set to 0 to disable autoscaling." + type = number + default = 500 +} + +variable "multi_az" { + description = "Enable Multi-AZ deployment for high availability." + type = bool + default = true +} + +variable "backup_retention_days" { + description = "Number of days to retain automated backups. Must be >= 1 for Multi-AZ." + type = number + default = 7 +} + +variable "backup_window" { + description = "Preferred daily backup window in UTC (hh24:mi-hh24:mi)." + type = string + default = "03:00-04:00" +} + +variable "maintenance_window" { + description = "Preferred weekly maintenance window (ddd:hh24:mi-ddd:hh24:mi in UTC)." + type = string + default = "sun:05:00-sun:06:00" +} + +variable "deletion_protection" { + description = "Enable deletion protection. Set to false only when decommissioning." + type = bool + default = true +} + +variable "skip_final_snapshot" { + description = "Whether to skip the final DB snapshot on destroy. Should be false in production." + type = bool + default = false +} + +variable "performance_insights_enabled" { + description = "Enable RDS Performance Insights." + type = bool + default = true +} + +variable "performance_insights_retention_period" { + description = "Performance Insights data retention in days. Valid values: 7 (free tier), 731, or a multiple of 31 (paid)." + type = number + default = 7 +} + +variable "monitoring_interval" { + description = "Enhanced monitoring interval in seconds (0 to disable, valid: 1, 5, 10, 15, 30, 60). Treated as 0 while monitoring_role_arn is empty." + type = number + default = 60 +} + +variable "monitoring_role_arn" { + description = "IAM role ARN for RDS Enhanced Monitoring. Must be set for monitoring_interval > 0 to take effect; leave empty to disable Enhanced Monitoring." + type = string + default = "" +} + +variable "parameter_group_family" { + description = "DB parameter group family." 
+ type = string + default = "postgres14" +} diff --git a/terraform/modules/redis/main.tf b/terraform/modules/redis/main.tf new file mode 100644 index 0000000..5a1d92c --- /dev/null +++ b/terraform/modules/redis/main.tf @@ -0,0 +1,176 @@ +################################################################################ +# Module: redis +# Main — AWS ElastiCache Redis 7 +# +# - Single shard (cluster mode disabled): one primary + one replica +# - Encryption at rest and in transit (TLS) +# - AUTH token required when transit encryption is enabled +# - VPC-internal only — no public access +# - Access restricted to explicitly allowed security groups (app only) +# - Slow log + engine log delivery to CloudWatch +################################################################################ + +terraform { + required_version = ">= 1.6.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.40.0" + } + } +} + +locals { + identifier = "${var.project}-${var.environment}" + + common_tags = { + environment = var.environment + project = var.project + managed_by = "terraform" + } +} + +################################################################################ +# CloudWatch Log Group for Redis logs +################################################################################ + +resource "aws_cloudwatch_log_group" "redis" { + count = var.log_delivery_enabled ? 
1 : 0 + + name = var.log_group_name + retention_in_days = 30 + + tags = local.common_tags +} + +################################################################################ +# Security Group — only the app SGs may connect on 6379 +################################################################################ + +resource "aws_security_group" "redis" { + name = "${local.identifier}-redis-sg" + description = "Controls inbound access to ElastiCache Redis - allow only app SG on 6379" # ASCII hyphen on purpose: EC2 rejects non-ASCII characters in security group descriptions + vpc_id = var.vpc_id + + egress { + description = "All outbound" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = merge(local.common_tags, { + Name = "${local.identifier}-redis-sg" + }) +} + +resource "aws_security_group_rule" "redis_ingress_from_app" { + for_each = toset(var.allowed_security_group_ids) # one ingress rule per allowed source security group + + type = "ingress" + description = "Redis from app security group" + from_port = 6379 + to_port = 6379 + protocol = "tcp" + source_security_group_id = each.value + security_group_id = aws_security_group.redis.id +} + +################################################################################ +# ElastiCache Subnet Group +################################################################################ + +resource "aws_elasticache_subnet_group" "main" { + name = "${local.identifier}-redis-subnet-group" + description = "Private subnets for AgentIdP ElastiCache Redis" + subnet_ids = var.subnet_ids + + tags = local.common_tags +} + +################################################################################ +# ElastiCache Parameter Group — Redis 7.x defaults are fine; custom group +# allows future tuning without recreating the replication group. 
+################################################################################ + +resource "aws_elasticache_parameter_group" "main" { + name = "${local.identifier}-redis7-params" + family = "redis7" + description = "AgentIdP Redis 7 parameter group" + + # Turn on lazy freeing for evictions and expirations (lazyfree-*); nothing here disables or renames commands + parameter { + name = "lazyfree-lazy-eviction" + value = "yes" + } + + parameter { + name = "lazyfree-lazy-expire" + value = "yes" + } + + tags = local.common_tags +} + +################################################################################ +# ElastiCache Replication Group (cluster mode disabled) +# +# cluster_mode = 0 (disabled) gives a single-shard setup: +# - 1 primary node +# - num_cache_clusters - 1 replica nodes +# This matches the application usage: token revocation (SET/GET/DEL), +# rate limiting (INCR/EXPIRE), and monthly counters (INCR) — no sharding needed. +################################################################################ + +resource "aws_elasticache_replication_group" "main" { + replication_group_id = local.identifier + description = "AgentIdP Redis 7 — token revocation, rate limiting, counters" + + # Engine + engine = "redis" + engine_version = var.engine_version + node_type = var.node_type + parameter_group_name = aws_elasticache_parameter_group.main.name + port = 6379 + + # Topology — single shard, primary + replica + num_cache_clusters = var.num_cache_clusters + automatic_failover_enabled = var.automatic_failover_enabled + multi_az_enabled = var.multi_az_enabled + + # Network — VPC-internal, no public endpoints + subnet_group_name = aws_elasticache_subnet_group.main.name + security_group_ids = [aws_security_group.redis.id] + + # Security + at_rest_encryption_enabled = var.at_rest_encryption_enabled + transit_encryption_enabled = var.transit_encryption_enabled + auth_token = var.transit_encryption_enabled && var.auth_token != "" ? 
var.auth_token : null # AUTH is configured only when TLS is on and a non-empty token was supplied; otherwise the argument is left unset + + # Maintenance and snapshots + maintenance_window = var.maintenance_window + snapshot_retention_limit = var.snapshot_retention_limit + snapshot_window = var.snapshot_window + apply_immediately = var.apply_immediately + + # Log delivery to CloudWatch — one configuration block per log type, none at all when log delivery is disabled + dynamic "log_delivery_configuration" { + for_each = var.log_delivery_enabled ? [ + { log_type = "slow-log", log_format = "json" }, + { log_type = "engine-log", log_format = "json" } + ] : [] + + content { + destination = var.log_delivery_enabled ? aws_cloudwatch_log_group.redis[0].name : "" + destination_type = "cloudwatch-logs" + log_format = log_delivery_configuration.value.log_format + log_type = log_delivery_configuration.value.log_type + } + } + + tags = merge(local.common_tags, { + Name = local.identifier + }) +} diff --git a/terraform/modules/redis/outputs.tf b/terraform/modules/redis/outputs.tf new file mode 100644 index 0000000..4bc93fd --- /dev/null +++ b/terraform/modules/redis/outputs.tf @@ -0,0 +1,34 @@ +################################################################################ +# Module: redis +# Outputs +################################################################################ + +output "primary_endpoint" { + description = "Primary endpoint hostname for write operations. Use to construct REDIS_URL." + value = aws_elasticache_replication_group.main.primary_endpoint_address +} + +output "reader_endpoint" { + description = "Reader endpoint for read operations (load-balanced across replicas)." + value = aws_elasticache_replication_group.main.reader_endpoint_address +} + +output "port" { + description = "Port the Redis replication group listens on (always 6379)." + value = aws_elasticache_replication_group.main.port +} + +output "replication_group_id" { + description = "ID of the ElastiCache replication group." 
+ value = aws_elasticache_replication_group.main.replication_group_id +} + +output "security_group_id" { + description = "Security group ID attached to the replication group. Use to add further ingress rules." + value = aws_security_group.redis.id +} + +output "redis_url" { + description = "Constructed REDIS_URL using the primary endpoint. Includes rediss:// (TLS) scheme when transit encryption is enabled." + value = var.transit_encryption_enabled ? "rediss://${aws_elasticache_replication_group.main.primary_endpoint_address}:${aws_elasticache_replication_group.main.port}" : "redis://${aws_elasticache_replication_group.main.primary_endpoint_address}:${aws_elasticache_replication_group.main.port}" +} diff --git a/terraform/modules/redis/variables.tf b/terraform/modules/redis/variables.tf new file mode 100644 index 0000000..2bb457b --- /dev/null +++ b/terraform/modules/redis/variables.tf @@ -0,0 +1,116 @@ +################################################################################ +# Module: redis +# Variables — AWS ElastiCache Redis 7 +################################################################################ + +variable "environment" { + description = "Deployment environment label (e.g. production, staging)." + type = string +} + +variable "project" { + description = "Project identifier used in resource names and tags." + type = string + default = "sentryagent-agentidp" +} + +variable "vpc_id" { + description = "VPC ID in which to create the ElastiCache subnet group and security group." + type = string +} + +variable "subnet_ids" { + description = "List of private subnet IDs for the ElastiCache subnet group. Span at least 2 AZs." + type = list(string) +} + +variable "allowed_security_group_ids" { + description = "List of security group IDs (e.g. ECS app SG) permitted to connect to Redis on port 6379." + type = list(string) + default = [] +} + +variable "node_type" { + description = "ElastiCache node instance type." 
+ type = string + default = "cache.t3.medium" +} + +variable "engine_version" { + description = "Redis engine version. Use 7.x for Redis 7." + type = string + default = "7.1" +} + +variable "num_cache_clusters" { + description = "Total number of cache clusters in the replication group (1 primary + N replicas). Minimum 2 for HA." + type = number + default = 2 +} + +variable "automatic_failover_enabled" { + description = "Enable automatic failover. Requires num_cache_clusters >= 2 and must be true when multi_az_enabled = true." + type = bool + default = true +} + +variable "multi_az_enabled" { + description = "Enable Multi-AZ for the replication group." + type = bool + default = true +} + +variable "at_rest_encryption_enabled" { + description = "Encrypt data at rest." + type = bool + default = true +} + +variable "transit_encryption_enabled" { + description = "Enable TLS for data in transit." + type = bool + default = true +} + +variable "auth_token" { + description = "AUTH token (password) for Redis AUTH command. Only applied when transit_encryption_enabled = true; leave empty to deploy without AUTH. Minimum 16 characters." + type = string + sensitive = true + default = "" +} + +variable "maintenance_window" { + description = "Preferred weekly maintenance window (ddd:hh24:mi-ddd:hh24:mi in UTC)." + type = string + default = "sun:06:00-sun:07:00" +} + +variable "snapshot_retention_limit" { + description = "Number of days to retain automatic Redis snapshots. 0 disables snapshots." + type = number + default = 7 +} + +variable "snapshot_window" { + description = "Daily time range for automatic snapshots (hh24:mi-hh24:mi in UTC). Must not overlap maintenance_window." + type = string + default = "04:00-05:00" +} + +variable "apply_immediately" { + description = "Apply changes immediately. Set to false to wait for the next maintenance window in production." + type = bool + default = false +} + +variable "log_delivery_enabled" { + description = "Enable delivery of Redis slow logs and engine logs to CloudWatch." 
+ type = bool + default = true +} + +variable "log_group_name" { + description = "CloudWatch log group name for Redis logs. The group is created by this module when log_delivery_enabled = true, so it must not already exist." + type = string + default = "/elasticache/sentryagent-agentidp/redis" # NOTE(review): the default does not include var.environment, so two environments in one account and region would presumably collide — override per environment +}