feat(phase-4): WS1 — Production Hardening (Redis rate limiting, DB pool, health endpoint, k6)

Rate limiting:
- Replace in-memory express-rate-limit with ioredis + rate-limiter-flexible (sliding window)
- Graceful fallback to RateLimiterMemory when Redis unreachable
- RATE_LIMIT_WINDOW_MS / RATE_LIMIT_MAX_REQUESTS env var config
- Retry-After header on 429 responses
- agentidp_rate_limit_hits_total Prometheus counter

Database pool:
- Explicit pg.Pool config via DB_POOL_MAX/MIN/IDLE_TIMEOUT_MS/CONNECTION_TIMEOUT_MS
- Defaults: max=20, min=2, idle=30s, conn timeout=5s
- agentidp_db_pool_active_connections + agentidp_db_pool_waiting_requests gauges

Health endpoint:
- GET /health/detailed — per-service status (database, Redis, Vault, OPA)
- healthy / degraded (>1000ms) / unreachable classification
- HTTP 200 (all healthy) / 207 (any degraded) / 503 (any unreachable)

Load tests:
- tests/load/ with k6 scenarios for agent registration (100 VUs), token issuance (1000 VUs), credential rotation (50 VUs)
- npm run load-test script

Tests: 586 passing, zero TypeScript errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
SentryAgent.ai Developer
2026-04-02 04:20:37 +00:00
parent b0f70b7ac4
commit 1b682c22b2
16 changed files with 1467 additions and 113 deletions

View File

@@ -0,0 +1,197 @@
/**
* Detailed health check controller for SentryAgent.ai AgentIdP.
*
* Implements `GET /health/detailed` — checks each dependency with latency
* measurement and classifies the result as:
* healthy — responded within 1000 ms
* degraded — responded but latency exceeded 1000 ms
* unreachable — timed out or threw an error
*
* HTTP response codes (task 2.6):
* 200 — all services healthy
* 207 — at least one service degraded (but none unreachable)
* 503 — at least one service unreachable
*/
import { Request, Response } from 'express';
import { Pool } from 'pg';
/** Timeout applied to each individual health-check probe (ms). */
const PROBE_TIMEOUT_MS = 3000;
/** Latency threshold above which a service is considered degraded (ms). */
const DEGRADED_THRESHOLD_MS = 1000;
/** Classification of a single dependency's health. */
export type ServiceStatus = 'healthy' | 'degraded' | 'unreachable';
/**
 * Per-service result returned in the response body.
 * When status is 'unreachable', latencyMs is clamped to PROBE_TIMEOUT_MS
 * rather than reporting the actual time-to-failure.
 */
export interface ServiceHealthResult {
  // 'healthy' | 'degraded' | 'unreachable' — see runProbe for classification rules.
  status: ServiceStatus;
  // Measured round-trip time of the probe in milliseconds.
  latencyMs: number;
}
/** Full response body shape for GET /health/detailed. */
export interface DetailedHealthResponse {
  // Aggregate of all per-service statuses (worst status wins).
  status: 'healthy' | 'degraded' | 'unreachable';
  // Package version, from npm_package_version when available.
  version: string;
  // Process uptime in whole seconds.
  uptime: number;
  // Keyed by service name: 'postgres', 'redis', 'vault', 'opa'.
  services: Record<string, ServiceHealthResult>;
}
/**
 * Dependencies injected into the controller.
 * `pool` is required; the remaining services are only probed when their
 * client or URL is provided.
 */
export interface HealthDetailedDeps {
  pool: Pool;
  /** Optional Vault URL — when provided, the controller probes Vault's /v1/sys/health. */
  vaultAddr?: string;
  /** Optional OPA URL — when provided, the controller probes OPA's /health. */
  opaUrl?: string;
  /** Optional ioredis-compatible client for Redis probe. */
  redisClient?: { ping(): Promise<string> } | null;
}
/**
 * Wraps a probe promise with a hard timeout, classifying the outcome as
 * healthy / degraded (see DEGRADED_THRESHOLD_MS) / unreachable.
 *
 * Never rejects: probe errors and timeouts both resolve to `'unreachable'`
 * with latencyMs clamped to PROBE_TIMEOUT_MS.
 *
 * @param probe - Async function that performs the health check and returns latencyMs.
 * @returns ServiceHealthResult with status and latency.
 */
async function runProbe(
  probe: () => Promise<number>,
): Promise<ServiceHealthResult> {
  // Bug fix: the original never cleared this timer, so every probe kept a
  // pending 3 s timeout alive even after resolving quickly — pinning the
  // event loop, delaying process shutdown, and piling up timers under load.
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<ServiceHealthResult>((resolve) => {
    timer = setTimeout(() => {
      resolve({ status: 'unreachable', latencyMs: PROBE_TIMEOUT_MS });
    }, PROBE_TIMEOUT_MS);
  });
  const probePromise = (async (): Promise<ServiceHealthResult> => {
    try {
      const latencyMs = await probe();
      const status: ServiceStatus =
        latencyMs > DEGRADED_THRESHOLD_MS ? 'degraded' : 'healthy';
      return { status, latencyMs };
    } catch {
      // Probe threw before the timeout — still classified unreachable.
      return { status: 'unreachable', latencyMs: PROBE_TIMEOUT_MS };
    }
  })();
  try {
    return await Promise.race([probePromise, timeoutPromise]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
}
/**
* Controller implementing GET /health/detailed.
*
* Constructed with the required infrastructure dependencies and optional
* optional-service clients. The `handle` method is an Express route handler.
*/
export class HealthDetailedController {
private readonly pool: Pool;
private readonly vaultAddr: string | undefined;
private readonly opaUrl: string | undefined;
private readonly redisClient: { ping(): Promise<string> } | null;
constructor(deps: HealthDetailedDeps) {
this.pool = deps.pool;
this.vaultAddr = deps.vaultAddr;
this.opaUrl = deps.opaUrl;
this.redisClient = deps.redisClient ?? null;
}
/**
* Express route handler for GET /health/detailed.
*
* @param _req - Express request (unused).
* @param res - Express response.
*/
handle = (_req: Request, res: Response): void => {
void this.runChecks(res);
};
private async runChecks(res: Response): Promise<void> {
const services: Record<string, ServiceHealthResult> = {};
// ── PostgreSQL probe ────────────────────────────────────────────────────
services['postgres'] = await runProbe(async () => {
const start = Date.now();
const client = await this.pool.connect();
try {
await client.query('SELECT 1');
} finally {
client.release();
}
return Date.now() - start;
});
// ── Redis probe (optional — only when client supplied) ──────────────────
if (this.redisClient !== null) {
services['redis'] = await runProbe(async () => {
const start = Date.now();
await this.redisClient!.ping();
return Date.now() - start;
});
}
// ── Vault probe (optional — only when VAULT_ADDR is configured) ─────────
if (this.vaultAddr) {
services['vault'] = await runProbe(async () => {
const url = `${this.vaultAddr}/v1/sys/health`;
const start = Date.now();
const fetchResponse = await fetch(url, { signal: AbortSignal.timeout(PROBE_TIMEOUT_MS) });
const latencyMs = Date.now() - start;
// Vault returns 200 (initialised, unsealed, active), 429 (standby), 472/473 (DR).
// All mean Vault is reachable — only network errors mean unreachable.
if (fetchResponse.status >= 500) {
throw new Error(`Vault health endpoint returned ${fetchResponse.status}`);
}
return latencyMs;
});
}
// ── OPA probe (optional — only when OPA_URL is configured) ─────────────
if (this.opaUrl) {
services['opa'] = await runProbe(async () => {
const url = `${this.opaUrl}/health`;
const start = Date.now();
const fetchResponse = await fetch(url, { signal: AbortSignal.timeout(PROBE_TIMEOUT_MS) });
const latencyMs = Date.now() - start;
if (!fetchResponse.ok) {
throw new Error(`OPA health endpoint returned ${fetchResponse.status}`);
}
return latencyMs;
});
}
// ── Compute overall status (task 2.6) ───────────────────────────────────
const statuses = Object.values(services).map((s) => s.status);
const hasUnreachable = statuses.includes('unreachable');
const hasDegraded = statuses.includes('degraded');
let overallStatus: 'healthy' | 'degraded' | 'unreachable';
let httpStatus: 200 | 207 | 503;
if (hasUnreachable) {
overallStatus = 'unreachable';
httpStatus = 503;
} else if (hasDegraded) {
overallStatus = 'degraded';
httpStatus = 207;
} else {
overallStatus = 'healthy';
httpStatus = 200;
}
const body: DetailedHealthResponse = {
status: overallStatus,
version: process.env['npm_package_version'] ?? '1.0.0',
uptime: Math.floor(process.uptime()),
services,
};
res.status(httpStatus).json(body);
}
}

View File

@@ -1,16 +1,30 @@
/**
* PostgreSQL connection pool singleton.
* All database access flows through this pool.
*
* Pool configuration env vars (task 2.1 / 2.2):
* DB_POOL_MAX — maximum connections (default 20)
* DB_POOL_MIN — minimum connections (default 2)
* DB_POOL_IDLE_TIMEOUT_MS — idle connection timeout in ms (default 30000)
* DB_POOL_CONNECTION_TIMEOUT_MS — connection acquisition timeout in ms (default 5000)
*/
import { Pool } from 'pg';
import { dbQueryDurationSeconds } from '../metrics/registry.js';
import {
dbQueryDurationSeconds,
dbPoolActiveConnections,
dbPoolWaitingRequests,
} from '../metrics/registry.js';
let pool: Pool | null = null;
/**
* Returns the singleton pg Pool instance.
* Initialises the pool on first call using DATABASE_URL from the environment.
* Initialises the pool on first call using DATABASE_URL and optional pool
* tuning env vars.
*
* Prometheus gauges `agentidp_db_pool_active_connections` and
* `agentidp_db_pool_waiting_requests` are updated via pool events (task 2.3).
*
* @returns The PostgreSQL connection pool.
* @throws Error if DATABASE_URL is not set.
@@ -21,13 +35,50 @@ export function getPool(): Pool {
if (!connectionString) {
throw new Error('DATABASE_URL environment variable is required');
}
pool = new Pool({ connectionString });
const max = parseInt(process.env['DB_POOL_MAX'] ?? '20', 10);
const min = parseInt(process.env['DB_POOL_MIN'] ?? '2', 10);
const idleTimeoutMillis = parseInt(process.env['DB_POOL_IDLE_TIMEOUT_MS'] ?? '30000', 10);
const connectionTimeoutMillis = parseInt(
process.env['DB_POOL_CONNECTION_TIMEOUT_MS'] ?? '5000',
10,
);
pool = new Pool({
connectionString,
max,
min,
idleTimeoutMillis,
connectionTimeoutMillis,
});
pool.on('error', (err: Error) => {
// eslint-disable-next-line no-console
console.error('Unexpected pg pool error', err);
});
// Track active connections and waiting requests via pool events (task 2.3).
pool.on('acquire', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
pool.on('remove', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
pool.on('connect', () => {
if (pool) {
dbPoolActiveConnections.set(pool.totalCount - pool.idleCount);
dbPoolWaitingRequests.set(pool.waitingCount);
}
});
// Wrap pool.query to record duration in Prometheus.
// The pg Pool.query method is heavily overloaded — the only safe approach
// without TypeScript errors is a typed-any wrapper on the shim itself.

View File

@@ -0,0 +1,78 @@
/**
* ioredis singleton client for rate-limiter-flexible.
*
* This client is separate from the `src/cache/redis.ts` client (which uses the
* `redis` npm package and handles token revocation / OIDC caching). The
* rate-limiter-flexible library requires an ioredis-compatible client.
*
* Guard: when `REDIS_RATE_LIMIT_ENABLED` is not `"true"` the factory returns
* `null` and the rate limiter falls back to in-process memory (RateLimiterMemory).
*/
import Redis from 'ioredis';
let ioredisClient: Redis | null = null;
/**
 * Returns a singleton ioredis client for rate limiting, or `null` when Redis
 * rate limiting is disabled via the `REDIS_RATE_LIMIT_ENABLED` env var.
 *
 * The client is lazily initialised on first call. Connection errors are logged
 * but do NOT throw — callers must handle a `null` return and fall back to
 * in-memory rate limiting.
 *
 * @returns The ioredis client instance, or `null` when disabled / unreachable.
 */
export function getRateLimitRedisClient(): Redis | null {
  const enabled = process.env['REDIS_RATE_LIMIT_ENABLED'];
  if (enabled !== 'true') {
    return null;
  }
  if (ioredisClient) {
    return ioredisClient;
  }
  const redisUrl = process.env['REDIS_URL'] ?? 'redis://localhost:6379';
  const client = new Redis(redisUrl, {
    // Do not throw on connection failure — caller handles null / fallback.
    lazyConnect: false,
    enableReadyCheck: true,
    maxRetriesPerRequest: 1,
    // Reconnect strategy: give up quickly so the health check / fallback fires.
    retryStrategy: (times: number): number | null => {
      if (times >= 3) {
        return null; // stop retrying — triggers 'error' event
      }
      return Math.min(times * 200, 1000);
    },
  });
  client.on('error', (err: Error) => {
    // eslint-disable-next-line no-console
    console.error('[RateLimitRedis] Connection error — rate limiter will use memory fallback:', err.message);
    // Bug fix: the original only reset the singleton reference here, orphaning
    // the failed client with its socket / pending reconnects still live. The
    // next call then created a SECOND client, leaking one connection per error
    // burst. Tear the broken client down before allowing a reconnect attempt.
    if (ioredisClient === client) {
      ioredisClient = null;
    }
    client.disconnect();
  });
  client.on('connect', () => {
    // eslint-disable-next-line no-console
    console.log('[RateLimitRedis] Connected — Redis-backed rate limiting active.');
  });
  ioredisClient = client;
  return client;
}
/**
 * Gracefully shuts down the rate-limit ioredis client, if one exists, and
 * clears the singleton so a subsequent getRateLimitRedisClient() call can
 * reconnect. Intended for graceful shutdown and test teardown.
 *
 * @returns Promise that resolves when the client is disconnected.
 */
export async function closeRateLimitRedisClient(): Promise<void> {
  const client = ioredisClient;
  if (client === null) {
    return;
  }
  await client.quit();
  ioredisClient = null;
}

View File

@@ -116,3 +116,34 @@ export const auditChainIntegrity = new Gauge({
help: 'Binary gauge: 1 = most recent audit chain verification passed, 0 = failed.',
registers: [metricsRegistry],
});
/**
 * Total number of HTTP 429 responses returned by the rate limiter.
 * Incremented by rateLimitMiddleware each time a request is rejected.
 * Labels: endpoint (req.path at time of rejection)
 */
export const rateLimitHitsTotal = new Counter({
  name: 'agentidp_rate_limit_hits_total',
  help: 'Total number of HTTP 429 responses returned by the rate limiter.',
  labelNames: ['endpoint'] as const,
  registers: [metricsRegistry],
});
/**
 * Current number of active (checked-out) PostgreSQL pool connections,
 * computed as totalCount - idleCount.
 * Updated by the pool's `acquire`, `connect`, and `remove` event handlers.
 */
export const dbPoolActiveConnections = new Gauge({
  name: 'agentidp_db_pool_active_connections',
  help: 'Current number of active (checked-out) PostgreSQL pool connections.',
  registers: [metricsRegistry],
});
/**
 * Current number of client requests queued waiting for a pool connection
 * (pg's waitingCount).
 * Updated by the pool's `acquire`, `connect`, and `remove` event handlers.
 */
export const dbPoolWaitingRequests = new Gauge({
  name: 'agentidp_db_pool_waiting_requests',
  help: 'Current number of requests waiting for a PostgreSQL connection.',
  registers: [metricsRegistry],
});

View File

@@ -1,34 +1,104 @@
/**
* Redis-backed rate limiting middleware for SentryAgent.ai AgentIdP.
* Enforces 100 requests per minute per client_id using a sliding window counter.
*
* Uses `rate-limiter-flexible` with a sliding-window `RateLimiterRedis` when
* `REDIS_RATE_LIMIT_ENABLED=true` and Redis is reachable. Falls back to
* `RateLimiterMemory` transparently when Redis is unavailable (task 1.4).
*
* Configuration env vars:
* RATE_LIMIT_WINDOW_MS — window length in milliseconds (default 60000)
* RATE_LIMIT_MAX_REQUESTS — maximum requests per window (default 100)
* REDIS_RATE_LIMIT_ENABLED — set to "true" to enable Redis backend
*/
import { Request, Response, NextFunction } from 'express';
import { getRedisClient } from '../cache/redis.js';
import {
RateLimiterRedis,
RateLimiterMemory,
RateLimiterAbstract,
RateLimiterRes,
} from 'rate-limiter-flexible';
import { getRateLimitRedisClient } from '../infrastructure/redisClient.js';
import { rateLimitHitsTotal } from '../metrics/registry.js';
import { RateLimitError } from '../utils/errors.js';
const RATE_LIMIT_MAX = 100;
const WINDOW_MS = 60000; // 60 seconds
/** Singleton rate limiter — created once and reused across requests. */
let rateLimiter: RateLimiterAbstract | null = null;
/**
* Computes the current rate-limit window key and next reset timestamp.
* Returns the configured rate limiter instance (RateLimiterRedis or fallback
* RateLimiterMemory). The instance is memoised after the first successful
* construction so configuration is parsed only once per process lifetime.
*
* @returns Object with `windowKey` (minute index) and `resetAt` (Unix seconds).
* When the ioredis client is unavailable (Redis unreachable or disabled) the
* function falls back to in-process memory without throwing.
*
* @returns Configured RateLimiterAbstract instance.
*/
function getWindowInfo(): { windowKey: number; resetAt: number } {
const windowKey = Math.floor(Date.now() / WINDOW_MS);
const resetAt = (windowKey + 1) * (WINDOW_MS / 1000);
return { windowKey, resetAt };
function getRateLimiter(): RateLimiterAbstract {
if (rateLimiter) {
return rateLimiter;
}
const windowMs = parseInt(process.env['RATE_LIMIT_WINDOW_MS'] ?? '60000', 10);
const maxRequests = parseInt(process.env['RATE_LIMIT_MAX_REQUESTS'] ?? '100', 10);
const windowSeconds = Math.ceil(windowMs / 1000);
const redisClient = getRateLimitRedisClient();
if (redisClient !== null) {
// RateLimiterRedis: sliding window backed by ioredis.
// insuranceLimiter provides in-memory fallback when Redis is temporarily down.
rateLimiter = new RateLimiterRedis({
storeClient: redisClient,
keyPrefix: 'rl',
points: maxRequests,
duration: windowSeconds,
blockDuration: 0,
insuranceLimiter: new RateLimiterMemory({
points: maxRequests,
duration: windowSeconds,
}),
});
} else {
// Redis disabled or unreachable — use in-process memory limiter.
rateLimiter = new RateLimiterMemory({
points: maxRequests,
duration: windowSeconds,
});
}
return rateLimiter;
}
/**
 * Resets the memoised rate limiter singleton.
 * Exposed for testing purposes only — do NOT call in production code.
 *
 * After a reset, the next middleware invocation rebuilds the limiter via
 * getRateLimiter(), re-reading the RATE_LIMIT_* env vars — useful when a
 * test changes that configuration.
 *
 * @internal
 */
export function _resetRateLimiterForTests(): void {
rateLimiter = null;
}
/**
 * Derives the rate-limit bucket key for a request: the authenticated
 * `client_id` when present, otherwise the caller's IP address, with a
 * final `'unknown'` fallback when neither is available.
 *
 * @param req - Express request object.
 * @returns String key unique to the client.
 */
function resolveClientKey(req: Request): string {
  const clientId = req.user?.client_id;
  if (clientId != null) {
    return clientId;
  }
  return req.ip ?? 'unknown';
}
/**
* Express middleware that applies sliding-window rate limiting per client.
*
* Sets `X-RateLimit-Limit`, `X-RateLimit-Remaining`, `X-RateLimit-Reset`, and
* `Retry-After` (on rejection) headers. Increments the
* `agentidp_rate_limit_hits_total` Prometheus counter and calls
* `next(RateLimitError)` when the limit is exceeded.
*
* @param req - Express request.
* @param res - Express response.
@@ -39,31 +109,43 @@ export async function rateLimitMiddleware(
res: Response,
next: NextFunction,
): Promise<void> {
const limiter = getRateLimiter();
const key = resolveClientKey(req);
try {
const clientId = req.user?.client_id ?? req.ip ?? 'unknown';
const { windowKey, resetAt } = getWindowInfo();
const redisKey = `rate:${clientId}:${windowKey}`;
const result: RateLimiterRes = await limiter.consume(key);
const redis = await getRedisClient();
// Atomically increment and set TTL
const count = await redis.incr(redisKey);
if (count === 1) {
await redis.expire(redisKey, 60);
}
const remaining = Math.max(0, RATE_LIMIT_MAX - count);
res.setHeader('X-RateLimit-Limit', RATE_LIMIT_MAX);
res.setHeader('X-RateLimit-Remaining', remaining);
res.setHeader('X-RateLimit-Reset', resetAt);
if (count > RATE_LIMIT_MAX) {
throw new RateLimitError();
}
// Headers present on every successful response.
res.setHeader('X-RateLimit-Limit', limiter.points);
res.setHeader('X-RateLimit-Remaining', result.remainingPoints);
res.setHeader(
'X-RateLimit-Reset',
Math.ceil(Date.now() / 1000 + result.msBeforeNext / 1000),
);
next();
} catch (err) {
next(err);
if (err instanceof RateLimiterRes) {
// Rate limit exceeded — err is the RateLimiterRes rejection object.
const retryAfterSeconds = Math.ceil(err.msBeforeNext / 1000);
const endpoint = req.path;
// Prometheus counter — increment on every HTTP 429 (task 1.5).
rateLimitHitsTotal.inc({ endpoint });
// Standard headers on rate-limit rejection (task 1.6).
res.setHeader('X-RateLimit-Limit', limiter.points);
res.setHeader('X-RateLimit-Remaining', 0);
res.setHeader(
'X-RateLimit-Reset',
Math.ceil(Date.now() / 1000 + err.msBeforeNext / 1000),
);
res.setHeader('Retry-After', retryAfterSeconds);
next(new RateLimitError());
} else {
// Unexpected error (e.g. Redis failure not caught by insuranceLimiter).
next(err);
}
}
}

View File

@@ -1,12 +1,16 @@
/**
* Health check route for SentryAgent.ai AgentIdP.
* Returns connectivity status for PostgreSQL and Redis.
* Unauthenticated — safe to call from monitoring systems and the dashboard.
* Health check routes for SentryAgent.ai AgentIdP.
*
* GET /health — quick liveness check (existing)
* GET /health/detailed — full dependency health with latency (task 2.4)
*
* Both endpoints are unauthenticated — safe to call from monitoring systems.
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import { RedisClientType } from 'redis';
import { HealthDetailedController } from '../controllers/HealthDetailedController.js';
/** Response shape for GET /health */
interface HealthResponse {
@@ -20,7 +24,7 @@ interface HealthResponse {
}
/**
* Creates and returns the Express router for the health endpoint.
* Creates and returns the Express router for health endpoints.
*
* @param pool - PostgreSQL connection pool.
* @param redis - Redis client instance.
@@ -29,6 +33,14 @@ interface HealthResponse {
export function createHealthRouter(pool: Pool, redis: RedisClientType): Router {
const router = Router();
// Instantiate the detailed health controller with optional service clients.
const detailedController = new HealthDetailedController({
pool,
redisClient: redis,
vaultAddr: process.env['VAULT_ADDR'] ?? undefined,
opaUrl: process.env['OPA_URL'] ?? undefined,
});
/**
* GET /health
* Returns 200 when all services are healthy, 503 when any are degraded.
@@ -75,5 +87,12 @@ export function createHealthRouter(pool: Pool, redis: RedisClientType): Router {
void check();
});
/**
* GET /health/detailed
* Returns per-service health with latency.
* 200 = all healthy, 207 = any degraded, 503 = any unreachable.
*/
router.get('/detailed', detailedController.handle);
return router;
}