feat(phase-4): WS1 — Production Hardening (Redis rate limiting, DB pool, health endpoint, k6)

Rate limiting:
- Replace in-memory express-rate-limit with ioredis + rate-limiter-flexible (sliding window)
- Graceful fallback to RateLimiterMemory when Redis unreachable
- RATE_LIMIT_WINDOW_MS / RATE_LIMIT_MAX_REQUESTS env var config
- Retry-After header on 429 responses
- agentidp_rate_limit_hits_total Prometheus counter

Database pool:
- Explicit pg.Pool config via DB_POOL_MAX/MIN/IDLE_TIMEOUT_MS/CONNECTION_TIMEOUT_MS
- Defaults: max=20, min=2, idle=30s, conn timeout=5s
- agentidp_db_pool_active_connections + agentidp_db_pool_waiting_requests gauges

Health endpoint:
- GET /health/detailed — per-service status (database, Redis, Vault, OPA)
- healthy / degraded (>1000ms) / unreachable classification
- HTTP 200 (all healthy) / 207 (any degraded) / 503 (any unreachable)

Load tests:
- tests/load/ with k6 scenarios for agent registration (100 VUs), token issuance (1000 VUs), credential rotation (50 VUs)
- npm run load-test script

Tests: 586 passing, zero TypeScript errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
SentryAgent.ai Developer
2026-04-02 04:20:37 +00:00
parent b0f70b7ac4
commit 1b682c22b2
16 changed files with 1467 additions and 113 deletions

View File

@@ -1,16 +1,30 @@
/**
* PostgreSQL connection pool singleton.
* All database access flows through this pool.
*
* Pool configuration env vars (task 2.1 / 2.2):
* DB_POOL_MAX — maximum connections (default 20)
* DB_POOL_MIN — minimum connections (default 2)
* DB_POOL_IDLE_TIMEOUT_MS — idle connection timeout in ms (default 30000)
* DB_POOL_CONNECTION_TIMEOUT_MS — connection acquisition timeout in ms (default 5000)
*/
import { Pool } from 'pg';
import { dbQueryDurationSeconds } from '../metrics/registry.js';
import {
dbQueryDurationSeconds,
dbPoolActiveConnections,
dbPoolWaitingRequests,
} from '../metrics/registry.js';
let pool: Pool | null = null;
/**
* Returns the singleton pg Pool instance.
* Initialises the pool on first call using DATABASE_URL from the environment.
* Initialises the pool on first call using DATABASE_URL and optional pool
* tuning env vars.
*
* Prometheus gauges `agentidp_db_pool_active_connections` and
* `agentidp_db_pool_waiting_requests` are updated via pool events (task 2.3).
*
* @returns The PostgreSQL connection pool.
* @throws Error if DATABASE_URL is not set.
@@ -21,13 +35,50 @@ export function getPool(): Pool {
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is required');
}
pool = new Pool({ connectionString });
const max = parseInt(process.env['DB_POOL_MAX'] ?? '20', 10);
const min = parseInt(process.env['DB_POOL_MIN'] ?? '2', 10);
const idleTimeoutMillis = parseInt(process.env['DB_POOL_IDLE_TIMEOUT_MS'] ?? '30000', 10);
const connectionTimeoutMillis = parseInt(
process.env['DB_POOL_CONNECTION_TIMEOUT_MS'] ?? '5000',
10,
);
pool = new Pool({
connectionString,
max,
min,
idleTimeoutMillis,
connectionTimeoutMillis,
});
pool.on('error', (err: Error) => {
// eslint-disable-next-line no-console
console.error('Unexpected pg pool error', err);
});
// Track active connections and waiting requests via pool events (task 2.3).
pool.on('acquire', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
pool.on('remove', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
pool.on('connect', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
// Wrap pool.query to record duration in Prometheus.
// The pg Pool.query method is heavily overloaded — the only safe approach
// without TypeScript errors is a typed-any wrapper on the shim itself.