feat(phase-4): WS1 — Production Hardening (Redis rate limiting, DB pool, health endpoint, k6)

Rate limiting:
- Replace in-memory express-rate-limit with ioredis + rate-limiter-flexible (sliding window)
- Graceful fallback to RateLimiterMemory when Redis unreachable
- RATE_LIMIT_WINDOW_MS / RATE_LIMIT_MAX_REQUESTS env var config
- Retry-After header on 429 responses
- agentidp_rate_limit_hits_total Prometheus counter

Database pool:
- Explicit pg.Pool config via DB_POOL_MAX/MIN/IDLE_TIMEOUT_MS/CONNECTION_TIMEOUT_MS
- Defaults: max=20, min=2, idle=30s, conn timeout=5s
- agentidp_db_pool_active_connections + agentidp_db_pool_waiting_requests gauges

Health endpoint:
- GET /health/detailed — per-service status (database, Redis, Vault, OPA)
- healthy / degraded (>1000ms) / unreachable classification
- HTTP 200 (all healthy) / 207 (any degraded) / 503 (any unreachable)

Load tests:
- tests/load/ with k6 scenarios for agent registration (100 VUs), token issuance (1000 VUs), credential rotation (50 VUs)
- npm run load-test script

Tests: 586 passing, zero TypeScript errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
SentryAgent.ai Developer
2026-04-02 04:20:37 +00:00
parent b0f70b7ac4
commit 1b682c22b2
16 changed files with 1467 additions and 113 deletions

View File

@@ -0,0 +1,197 @@
/**
* Detailed health check controller for SentryAgent.ai AgentIdP.
*
* Implements `GET /health/detailed` — checks each dependency with latency
* measurement and classifies the result as:
* healthy — responded within 1000 ms
* degraded — responded but latency exceeded 1000 ms
* unreachable — timed out or threw an error
*
* HTTP response codes (task 2.6):
* 200 — all services healthy
* 207 — at least one service degraded (but none unreachable)
* 503 — at least one service unreachable
*/
import { Request, Response } from 'express';
import { Pool } from 'pg';
/** Timeout applied to each individual health-check probe (ms). */
const PROBE_TIMEOUT_MS = 3000;
/** Latency threshold above which a service is considered degraded (ms). */
const DEGRADED_THRESHOLD_MS = 1000;
/** Classification of a single dependency's health. */
export type ServiceStatus = 'healthy' | 'degraded' | 'unreachable';
/**
 * Per-service result returned in the response body.
 * When status is 'unreachable', latencyMs is clamped to PROBE_TIMEOUT_MS
 * rather than reporting the actual time-to-failure.
 */
export interface ServiceHealthResult {
  // 'healthy' | 'degraded' | 'unreachable' — see runProbe for classification rules.
  status: ServiceStatus;
  // Measured round-trip time of the probe in milliseconds.
  latencyMs: number;
}
/** Full response body shape for GET /health/detailed. */
export interface DetailedHealthResponse {
  // Aggregate of all per-service statuses (worst status wins).
  status: 'healthy' | 'degraded' | 'unreachable';
  // Package version, from npm_package_version when available.
  version: string;
  // Process uptime in whole seconds.
  uptime: number;
  // Keyed by service name: 'postgres', 'redis', 'vault', 'opa'.
  services: Record<string, ServiceHealthResult>;
}
/**
 * Dependencies injected into the controller.
 * `pool` is required; the remaining services are only probed when their
 * client or URL is provided.
 */
export interface HealthDetailedDeps {
  pool: Pool;
  /** Optional Vault URL — when provided, the controller probes Vault's /v1/sys/health. */
  vaultAddr?: string;
  /** Optional OPA URL — when provided, the controller probes OPA's /health. */
  opaUrl?: string;
  /** Optional ioredis-compatible client for Redis probe. */
  redisClient?: { ping(): Promise<string> } | null;
}
/**
 * Wraps a probe promise with a hard timeout, classifying the outcome as
 * healthy / degraded (see DEGRADED_THRESHOLD_MS) / unreachable.
 *
 * Never rejects: probe errors and timeouts both resolve to `'unreachable'`
 * with latencyMs clamped to PROBE_TIMEOUT_MS.
 *
 * @param probe - Async function that performs the health check and returns latencyMs.
 * @returns ServiceHealthResult with status and latency.
 */
async function runProbe(
  probe: () => Promise<number>,
): Promise<ServiceHealthResult> {
  // Bug fix: the original never cleared this timer, so every probe kept a
  // pending 3 s timeout alive even after resolving quickly — pinning the
  // event loop, delaying process shutdown, and piling up timers under load.
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<ServiceHealthResult>((resolve) => {
    timer = setTimeout(() => {
      resolve({ status: 'unreachable', latencyMs: PROBE_TIMEOUT_MS });
    }, PROBE_TIMEOUT_MS);
  });
  const probePromise = (async (): Promise<ServiceHealthResult> => {
    try {
      const latencyMs = await probe();
      const status: ServiceStatus =
        latencyMs > DEGRADED_THRESHOLD_MS ? 'degraded' : 'healthy';
      return { status, latencyMs };
    } catch {
      // Probe threw before the timeout — still classified unreachable.
      return { status: 'unreachable', latencyMs: PROBE_TIMEOUT_MS };
    }
  })();
  try {
    return await Promise.race([probePromise, timeoutPromise]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
}
/**
* Controller implementing GET /health/detailed.
*
* Constructed with the required infrastructure dependencies and optional
* optional-service clients. The `handle` method is an Express route handler.
*/
export class HealthDetailedController {
private readonly pool: Pool;
private readonly vaultAddr: string | undefined;
private readonly opaUrl: string | undefined;
private readonly redisClient: { ping(): Promise<string> } | null;
constructor(deps: HealthDetailedDeps) {
this.pool = deps.pool;
this.vaultAddr = deps.vaultAddr;
this.opaUrl = deps.opaUrl;
this.redisClient = deps.redisClient ?? null;
}
/**
* Express route handler for GET /health/detailed.
*
* @param _req - Express request (unused).
* @param res - Express response.
*/
handle = (_req: Request, res: Response): void => {
void this.runChecks(res);
};
private async runChecks(res: Response): Promise<void> {
const services: Record<string, ServiceHealthResult> = {};
// ── PostgreSQL probe ────────────────────────────────────────────────────
services['postgres'] = await runProbe(async () => {
const start = Date.now();
const client = await this.pool.connect();
try {
await client.query('SELECT 1');
} finally {
client.release();
}
return Date.now() - start;
});
// ── Redis probe (optional — only when client supplied) ──────────────────
if (this.redisClient !== null) {
services['redis'] = await runProbe(async () => {
const start = Date.now();
await this.redisClient!.ping();
return Date.now() - start;
});
}
// ── Vault probe (optional — only when VAULT_ADDR is configured) ─────────
if (this.vaultAddr) {
services['vault'] = await runProbe(async () => {
const url = `${this.vaultAddr}/v1/sys/health`;
const start = Date.now();
const fetchResponse = await fetch(url, { signal: AbortSignal.timeout(PROBE_TIMEOUT_MS) });
const latencyMs = Date.now() - start;
// Vault returns 200 (initialised, unsealed, active), 429 (standby), 472/473 (DR).
// All mean Vault is reachable — only network errors mean unreachable.
if (fetchResponse.status >= 500) {
throw new Error(`Vault health endpoint returned ${fetchResponse.status}`);
}
return latencyMs;
});
}
// ── OPA probe (optional — only when OPA_URL is configured) ─────────────
if (this.opaUrl) {
services['opa'] = await runProbe(async () => {
const url = `${this.opaUrl}/health`;
const start = Date.now();
const fetchResponse = await fetch(url, { signal: AbortSignal.timeout(PROBE_TIMEOUT_MS) });
const latencyMs = Date.now() - start;
if (!fetchResponse.ok) {
throw new Error(`OPA health endpoint returned ${fetchResponse.status}`);
}
return latencyMs;
});
}
// ── Compute overall status (task 2.6) ───────────────────────────────────
const statuses = Object.values(services).map((s) => s.status);
const hasUnreachable = statuses.includes('unreachable');
const hasDegraded = statuses.includes('degraded');
let overallStatus: 'healthy' | 'degraded' | 'unreachable';
let httpStatus: 200 | 207 | 503;
if (hasUnreachable) {
overallStatus = 'unreachable';
httpStatus = 503;
} else if (hasDegraded) {
overallStatus = 'degraded';
httpStatus = 207;
} else {
overallStatus = 'healthy';
httpStatus = 200;
}
const body: DetailedHealthResponse = {
status: overallStatus,
version: process.env['npm_package_version'] ?? '1.0.0',
uptime: Math.floor(process.uptime()),
services,
};
res.status(httpStatus).json(body);
}
}

View File

@@ -1,16 +1,30 @@
/**
* PostgreSQL connection pool singleton.
* All database access flows through this pool.
*
* Pool configuration env vars (task 2.1 / 2.2):
* DB_POOL_MAX — maximum connections (default 20)
* DB_POOL_MIN — minimum connections (default 2)
* DB_POOL_IDLE_TIMEOUT_MS — idle connection timeout in ms (default 30000)
* DB_POOL_CONNECTION_TIMEOUT_MS — connection acquisition timeout in ms (default 5000)
*/
import { Pool } from 'pg';
import { dbQueryDurationSeconds } from '../metrics/registry.js';
import {
dbQueryDurationSeconds,
dbPoolActiveConnections,
dbPoolWaitingRequests,
} from '../metrics/registry.js';
let pool: Pool | null = null;
/**
* Returns the singleton pg Pool instance.
* Initialises the pool on first call using DATABASE_URL from the environment.
* Initialises the pool on first call using DATABASE_URL and optional pool
* tuning env vars.
*
* Prometheus gauges `agentidp_db_pool_active_connections` and
* `agentidp_db_pool_waiting_requests` are updated via pool events (task 2.3).
*
* @returns The PostgreSQL connection pool.
* @throws Error if DATABASE_URL is not set.
@@ -21,13 +35,50 @@ export function getPool(): Pool {
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is required');
}
pool = new Pool({ connectionString });
const max = parseInt(process.env['DB_POOL_MAX'] ?? '20', 10);
const min = parseInt(process.env['DB_POOL_MIN'] ?? '2', 10);
const idleTimeoutMillis = parseInt(process.env['DB_POOL_IDLE_TIMEOUT_MS'] ?? '30000', 10);
const connectionTimeoutMillis = parseInt(
process.env['DB_POOL_CONNECTION_TIMEOUT_MS'] ?? '5000',
10,
);
pool = new Pool({
connectionString,
max,
min,
idleTimeoutMillis,
connectionTimeoutMillis,
});
pool.on('error', (err: Error) => {
// eslint-disable-next-line no-console
console.error('Unexpected pg pool error', err);
});
// Track active connections and waiting requests via pool events (task 2.3).
pool.on('acquire', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
pool.on('remove', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
pool.on('connect', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
// Wrap pool.query to record duration in Prometheus.
// The pg Pool.query method is heavily overloaded — the only safe approach
// without TypeScript errors is a typed-any wrapper on the shim itself.

View File

@@ -0,0 +1,78 @@
/**
* ioredis singleton client for rate-limiter-flexible.
*
* This client is separate from the `src/cache/redis.ts` client (which uses the
* `redis` npm package and handles token revocation / OIDC caching). The
* rate-limiter-flexible library requires an ioredis-compatible client.
*
* Guard: when `REDIS_RATE_LIMIT_ENABLED` is not `"true"` the factory returns
* `null` and the rate limiter falls back to in-process memory (RateLimiterMemory).
*/
import Redis from 'ioredis';
let ioredisClient: Redis | null = null;
/**
 * Returns a singleton ioredis client for rate limiting, or `null` when Redis
 * rate limiting is disabled via the `REDIS_RATE_LIMIT_ENABLED` env var.
 *
 * The client is lazily initialised on first call. Connection errors are logged
 * but do NOT throw — callers must handle a `null` return and fall back to
 * in-memory rate limiting.
 *
 * @returns The ioredis client instance, or `null` when disabled / unreachable.
 */
export function getRateLimitRedisClient(): Redis | null {
  const enabled = process.env['REDIS_RATE_LIMIT_ENABLED'];
  if (enabled !== 'true') {
    return null;
  }
  if (ioredisClient) {
    return ioredisClient;
  }
  const redisUrl = process.env['REDIS_URL'] ?? 'redis://localhost:6379';
  const client = new Redis(redisUrl, {
    // Do not throw on connection failure — caller handles null / fallback.
    lazyConnect: false,
    enableReadyCheck: true,
    maxRetriesPerRequest: 1,
    // Reconnect strategy: give up quickly so the health check / fallback fires.
    retryStrategy: (times: number): number | null => {
      if (times >= 3) {
        return null; // stop retrying — triggers 'error' event
      }
      return Math.min(times * 200, 1000);
    },
  });
  client.on('error', (err: Error) => {
    // eslint-disable-next-line no-console
    console.error('[RateLimitRedis] Connection error — rate limiter will use memory fallback:', err.message);
    // Bug fix: the original only reset the singleton reference here, orphaning
    // the failed client with its socket / pending reconnects still live. The
    // next call then created a SECOND client, leaking one connection per error
    // burst. Tear the broken client down before allowing a reconnect attempt.
    if (ioredisClient === client) {
      ioredisClient = null;
    }
    client.disconnect();
  });
  client.on('connect', () => {
    // eslint-disable-next-line no-console
    console.log('[RateLimitRedis] Connected — Redis-backed rate limiting active.');
  });
  ioredisClient = client;
  return client;
}
/**
 * Gracefully shuts down the rate-limit ioredis client, if one exists, and
 * clears the singleton so a subsequent getRateLimitRedisClient() call can
 * reconnect. Intended for graceful shutdown and test teardown.
 *
 * @returns Promise that resolves when the client is disconnected.
 */
export async function closeRateLimitRedisClient(): Promise<void> {
  const client = ioredisClient;
  if (client === null) {
    return;
  }
  await client.quit();
  ioredisClient = null;
}

View File

@@ -116,3 +116,34 @@ export const auditChainIntegrity = new Gauge({
help: 'Binary gauge: 1 = most recent audit chain verification passed, 0 = failed.',
registers: [metricsRegistry],
});
/**
 * Total number of HTTP 429 responses returned by the rate limiter.
 * Incremented by rateLimitMiddleware each time a request is rejected.
 * Labels: endpoint (req.path at time of rejection)
 */
export const rateLimitHitsTotal = new Counter({
  name: 'agentidp_rate_limit_hits_total',
  help: 'Total number of HTTP 429 responses returned by the rate limiter.',
  labelNames: ['endpoint'] as const,
  registers: [metricsRegistry],
});
/**
 * Current number of active (checked-out) PostgreSQL pool connections,
 * computed as totalCount - idleCount.
 * Updated by the pool's `acquire`, `connect`, and `remove` event handlers.
 */
export const dbPoolActiveConnections = new Gauge({
  name: 'agentidp_db_pool_active_connections',
  help: 'Current number of active (checked-out) PostgreSQL pool connections.',
  registers: [metricsRegistry],
});
/**
 * Current number of client requests queued waiting for a pool connection
 * (pg's waitingCount).
 * Updated by the pool's `acquire`, `connect`, and `remove` event handlers.
 */
export const dbPoolWaitingRequests = new Gauge({
  name: 'agentidp_db_pool_waiting_requests',
  help: 'Current number of requests waiting for a PostgreSQL connection.',
  registers: [metricsRegistry],
});

View File

@@ -1,34 +1,104 @@
/**
* Redis-backed rate limiting middleware for SentryAgent.ai AgentIdP.
* Enforces 100 requests per minute per client_id using a sliding window counter.
*
* Uses `rate-limiter-flexible` with a sliding-window `RateLimiterRedis` when
* `REDIS_RATE_LIMIT_ENABLED=true` and Redis is reachable. Falls back to
* `RateLimiterMemory` transparently when Redis is unavailable (task 1.4).
*
* Configuration env vars:
* RATE_LIMIT_WINDOW_MS — window length in milliseconds (default 60000)
* RATE_LIMIT_MAX_REQUESTS — maximum requests per window (default 100)
* REDIS_RATE_LIMIT_ENABLED — set to "true" to enable Redis backend
*/
import { Request, Response, NextFunction } from 'express';
import { getRedisClient } from '../cache/redis.js';
import {
RateLimiterRedis,
RateLimiterMemory,
RateLimiterAbstract,
RateLimiterRes,
} from 'rate-limiter-flexible';
import { getRateLimitRedisClient } from '../infrastructure/redisClient.js';
import { rateLimitHitsTotal } from '../metrics/registry.js';
import { RateLimitError } from '../utils/errors.js';
const RATE_LIMIT_MAX = 100;
const WINDOW_MS = 60000; // 60 seconds
/** Singleton rate limiter — created once and reused across requests. */
let rateLimiter: RateLimiterAbstract | null = null;
/**
* Computes the current rate-limit window key and next reset timestamp.
* Returns the configured rate limiter instance (RateLimiterRedis or fallback
* RateLimiterMemory). The instance is memoised after the first successful
* construction so configuration is parsed only once per process lifetime.
*
* @returns Object with `windowKey` (minute index) and `resetAt` (Unix seconds).
* When the ioredis client is unavailable (Redis unreachable or disabled) the
* function falls back to in-process memory without throwing.
*
* @returns Configured RateLimiterAbstract instance.
*/
function getWindowInfo(): { windowKey: number; resetAt: number } {
const windowKey = Math.floor(Date.now() / WINDOW_MS);
const resetAt = (windowKey + 1) * (WINDOW_MS / 1000);
return { windowKey, resetAt };
function getRateLimiter(): RateLimiterAbstract {
if (rateLimiter) {
return rateLimiter;
}
const windowMs = parseInt(process.env['RATE_LIMIT_WINDOW_MS'] ?? '60000', 10);
const maxRequests = parseInt(process.env['RATE_LIMIT_MAX_REQUESTS'] ?? '100', 10);
const windowSeconds = Math.ceil(windowMs / 1000);
const redisClient = getRateLimitRedisClient();
if (redisClient !== null) {
// RateLimiterRedis: sliding window backed by ioredis.
// insuranceLimiter provides in-memory fallback when Redis is temporarily down.
rateLimiter = new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: 'rl',
points: maxRequests,
duration: windowSeconds,
blockDuration: 0,
insuranceLimiter: new RateLimiterMemory({
points: maxRequests,
duration: windowSeconds,
}),
});
} else {
// Redis disabled or unreachable — use in-process memory limiter.
rateLimiter = new RateLimiterMemory({
points: maxRequests,
duration: windowSeconds,
});
}
return rateLimiter;
}
/**
 * Resets the memoised rate limiter singleton.
 * Exposed for testing purposes only — do NOT call in production code.
 *
 * After a reset, the next middleware invocation rebuilds the limiter via
 * getRateLimiter(), re-reading the RATE_LIMIT_* env vars — useful when a
 * test changes that configuration.
 *
 * @internal
 */
export function _resetRateLimiterForTests(): void {
rateLimiter = null;
}
/**
 * Derives the rate-limit bucket key for a request: the authenticated
 * `client_id` when present, otherwise the caller's IP address, with a
 * final `'unknown'` fallback when neither is available.
 *
 * @param req - Express request object.
 * @returns String key unique to the client.
 */
function resolveClientKey(req: Request): string {
  const clientId = req.user?.client_id;
  if (clientId != null) {
    return clientId;
  }
  return req.ip ?? 'unknown';
}
/**
* Express middleware that applies sliding-window rate limiting per client.
*
* Sets `X-RateLimit-Limit`, `X-RateLimit-Remaining`, `X-RateLimit-Reset`, and
* `Retry-After` (on rejection) headers. Increments the
* `agentidp_rate_limit_hits_total` Prometheus counter and calls
* `next(RateLimitError)` when the limit is exceeded.
*
* @param req - Express request.
* @param res - Express response.
@@ -39,31 +109,43 @@ export async function rateLimitMiddleware(
res: Response,
next: NextFunction,
): Promise<void> {
const limiter = getRateLimiter();
const key = resolveClientKey(req);
try {
const clientId = req.user?.client_id ?? req.ip ?? 'unknown';
const { windowKey, resetAt } = getWindowInfo();
const redisKey = `rate:${clientId}:${windowKey}`;
const result: RateLimiterRes = await limiter.consume(key);
const redis = await getRedisClient();
// Atomically increment and set TTL
const count = await redis.incr(redisKey);
if (count === 1) {
await redis.expire(redisKey, 60);
}
const remaining = Math.max(0, RATE_LIMIT_MAX - count);
res.setHeader('X-RateLimit-Limit', RATE_LIMIT_MAX);
res.setHeader('X-RateLimit-Remaining', remaining);
res.setHeader('X-RateLimit-Reset', resetAt);
if (count > RATE_LIMIT_MAX) {
throw new RateLimitError();
}
// Headers present on every successful response.
res.setHeader('X-RateLimit-Limit', limiter.points);
res.setHeader('X-RateLimit-Remaining', result.remainingPoints);
res.setHeader(
'X-RateLimit-Reset',
Math.ceil(Date.now() / 1000 + result.msBeforeNext / 1000),
);
next();
} catch (err) {
next(err);
if (err instanceof RateLimiterRes) {
// Rate limit exceeded — err is the RateLimiterRes rejection object.
const retryAfterSeconds = Math.ceil(err.msBeforeNext / 1000);
const endpoint = req.path;
// Prometheus counter — increment on every HTTP 429 (task 1.5).
rateLimitHitsTotal.inc({ endpoint });
// Standard headers on rate-limit rejection (task 1.6).
res.setHeader('X-RateLimit-Limit', limiter.points);
res.setHeader('X-RateLimit-Remaining', 0);
res.setHeader(
'X-RateLimit-Reset',
Math.ceil(Date.now() / 1000 + err.msBeforeNext / 1000),
);
res.setHeader('Retry-After', retryAfterSeconds);
next(new RateLimitError());
} else {
// Unexpected error (e.g. Redis failure not caught by insuranceLimiter).
next(err);
}
}
}

View File

@@ -1,12 +1,16 @@
/**
* Health check route for SentryAgent.ai AgentIdP.
* Returns connectivity status for PostgreSQL and Redis.
* Unauthenticated — safe to call from monitoring systems and the dashboard.
* Health check routes for SentryAgent.ai AgentIdP.
*
* GET /health — quick liveness check (existing)
* GET /health/detailed — full dependency health with latency (task 2.4)
*
* Both endpoints are unauthenticated — safe to call from monitoring systems.
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { RedisClientType } from 'redis';
import { HealthDetailedController } from '../controllers/HealthDetailedController.js';
/** Response shape for GET /health */
interface HealthResponse {
@@ -20,7 +24,7 @@ interface HealthResponse {
}
/**
* Creates and returns the Express router for the health endpoint.
* Creates and returns the Express router for health endpoints.
*
* @param pool - PostgreSQL connection pool.
* @param redis - Redis client instance.
@@ -29,6 +33,14 @@ interface HealthResponse {
export function createHealthRouter(pool: Pool, redis: RedisClientType): Router {
const router = Router();
// Instantiate the detailed health controller with optional service clients.
const detailedController = new HealthDetailedController({
pool,
redisClient: redis,
vaultAddr: process.env['VAULT_ADDR'] ?? undefined,
opaUrl: process.env['OPA_URL'] ?? undefined,
});
/**
* GET /health
* Returns 200 when all services are healthy, 503 when any are degraded.
@@ -75,5 +87,12 @@ export function createHealthRouter(pool: Pool, redis: RedisClientType): Router {
void check();
});
/**
* GET /health/detailed
* Returns per-service health with latency.
* 200 = all healthy, 207 = any degraded, 503 = any unreachable.
*/
router.get('/detailed', detailedController.handle);
return router;
}