feat(phase-4): WS1 — Production Hardening (Redis rate limiting, DB pool, health endpoint, k6)

Rate limiting:
- Replace in-memory express-rate-limit with ioredis + rate-limiter-flexible (sliding window)
- Graceful fallback to RateLimiterMemory when Redis unreachable
- RATE_LIMIT_WINDOW_MS / RATE_LIMIT_MAX_REQUESTS env var config
- Retry-After header on 429 responses
- agentidp_rate_limit_hits_total Prometheus counter

Database pool:
- Explicit pg.Pool config via DB_POOL_MAX/MIN/IDLE_TIMEOUT_MS/CONNECTION_TIMEOUT_MS
- Defaults: max=20, min=2, idle=30s, conn timeout=5s
- agentidp_db_pool_active_connections + agentidp_db_pool_waiting_requests gauges

Health endpoint:
- GET /health/detailed — per-service status (database, Redis, Vault, OPA)
- healthy / degraded (>1000ms) / unreachable classification
- HTTP 200 (all healthy) / 207 (any degraded) / 503 (any unreachable)

Load tests:
- tests/load/ with k6 scenarios for agent registration (100 VUs), token issuance (1000 VUs), credential rotation (50 VUs)
- npm run load-test script

Tests: 586 passing, zero TypeScript errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
SentryAgent.ai Developer
2026-04-02 04:20:37 +00:00
parent b0f70b7ac4
commit 1b682c22b2
16 changed files with 1467 additions and 113 deletions

View File

@@ -0,0 +1,116 @@
/**
* k6 load test — Credential Rotation
*
* Scenario : POST /api/v1/agents/:agentId/credentials/:credentialId/rotate
* VUs : 50
* Duration : 60 seconds
* Thresholds:
* p95 response time < 500 ms
* HTTP error rate < 1 %
*
* Usage:
* BASE_URL=http://localhost:3000 \
* AGENT_ID=your-agent-id \
* ACCESS_TOKEN=your-access-token \
* k6 run tests/load/credential-rotation.js
*
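* Note: This test requires a pre-provisioned agent with at least one active
* credential. AGENT_ID must be set before running. Supply either ACCESS_TOKEN,
* or CLIENT_ID and CLIENT_SECRET: when ACCESS_TOKEN is not set, setup() requests
* a token from /api/v1/token using the client_credentials grant.
* If CREDENTIAL_ID is not set, the test uses the "active" credential alias.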
*/
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';
// ── Custom metrics ─────────────────────────────────────────────────────────────
// Share of iterations that the default function records as errors.
const errorRate = new Rate('error_rate');
// Duration samples of the rotate request; `true` is Trend's second (isTime) argument.
const rotationDuration = new Trend('rotation_duration_ms', true);

// ── Configuration ──────────────────────────────────────────────────────────────
// Pass/fail criteria: built-in request duration plus the custom error rate above.
const thresholds = {
  http_req_duration: ['p(95)<500'],
  error_rate: ['rate<0.01'],
};

export const options = {
  vus: 50,
  duration: '60s',
  thresholds,
};

// Reads an environment variable, using `fallback` when it is unset or empty.
const envOrDefault = (name, fallback) => __ENV[name] || fallback;

const BASE_URL = envOrDefault('BASE_URL', 'http://localhost:3000');
const AGENT_ID = envOrDefault('AGENT_ID', 'load-test-agent-id');
const CREDENTIAL_ID = envOrDefault('CREDENTIAL_ID', 'active');
// 'load-test-token' doubles as the "no token supplied" sentinel checked in setup().
const ACCESS_TOKEN = envOrDefault('ACCESS_TOKEN', 'load-test-token');
// ── Setup: issue an access token once per test run ────────────────────────────
/**
 * Resolves the bearer token handed to every VU iteration.
 *
 * When ACCESS_TOKEN was supplied (i.e. it differs from the 'load-test-token'
 * placeholder) it is returned as-is. Otherwise a token is requested from
 * `${BASE_URL}/api/v1/token` with the client_credentials grant, using
 * CLIENT_ID / CLIENT_SECRET from the environment.
 *
 * Any failure to obtain a usable token (non-200 status, unparsable body, or a
 * body without a non-empty string `access_token`) logs a warning and falls
 * back to ACCESS_TOKEN, so the run continues instead of aborting in setup.
 *
 * @returns {{ token: string }} Data object passed to the default function.
 */
export function setup() {
  // If an ACCESS_TOKEN was provided, skip token issuance.
  if (ACCESS_TOKEN !== 'load-test-token') {
    return { token: ACCESS_TOKEN };
  }

  const tokenUrl = `${BASE_URL}/api/v1/token`;
  // Passed as a plain object so it is sent as a form body, matching the header below.
  const tokenPayload = {
    grant_type: 'client_credentials',
    client_id: __ENV.CLIENT_ID || '',
    client_secret: __ENV.CLIENT_SECRET || '',
    scope: 'credentials:write',
  };
  const tokenRes = http.post(tokenUrl, tokenPayload, {
    headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
  });

  if (tokenRes.status !== 200) {
    console.warn(`Setup token issuance failed: ${tokenRes.status} — using env ACCESS_TOKEN`);
    return { token: ACCESS_TOKEN };
  }

  // A 200 response is not proof of a usable token: guard the parse and the field.
  let tokenBody = null;
  try {
    tokenBody = JSON.parse(tokenRes.body);
  } catch (err) {
    console.warn(`Setup token response is not valid JSON (${err.message}) — using env ACCESS_TOKEN`);
    return { token: ACCESS_TOKEN };
  }

  const hasUsableToken =
    tokenBody !== null &&
    typeof tokenBody === 'object' &&
    typeof tokenBody.access_token === 'string' &&
    tokenBody.access_token !== '';
  if (!hasUsableToken) {
    console.warn('Setup token response has no access_token — using env ACCESS_TOKEN');
    return { token: ACCESS_TOKEN };
  }

  return { token: tokenBody.access_token };
}
// ── Default function (executed per VU iteration) ───────────────────────────────
// Returns true when `rawBody` parses as JSON carrying a string `credentialId` or `id`.
function bodyHasCredentialId(rawBody) {
  try {
    const body = JSON.parse(rawBody);
    return typeof body.credentialId === 'string' || typeof body.id === 'string';
  } catch {
    // Unparsable body, or a parsed `null` with no properties to read.
    return false;
  }
}

/**
 * Performs one credential rotation request and records its metrics.
 *
 * Sends POST {BASE_URL}/api/v1/agents/{AGENT_ID}/credentials/{CREDENTIAL_ID}/rotate
 * with the bearer token from setup(), adds the request duration to
 * `rotation_duration_ms`, runs the three checks, and feeds `error_rate`.
 *
 * Error-rate accounting is computed explicitly rather than from check()'s
 * combined result, because check() is false as soon as ANY check fails:
 *   - 401/403 are treated as environment misconfiguration and are NOT counted
 *     as errors (a warning is logged instead).
 *   - Any other response counts as an error unless the status is 200/201 and
 *     the body carries a credential id.
 *   - The latency check is reported but does not feed `error_rate`; latency is
 *     already gated by the `http_req_duration` p95 threshold.
 *
 * @param {{ token: string }} data - Object returned by setup().
 */
export default function credentialRotation(data) {
  const { token } = data;
  const url = `${BASE_URL}/api/v1/agents/${AGENT_ID}/credentials/${CREDENTIAL_ID}/rotate`;
  const params = {
    headers: {
      Authorization: `Bearer ${token}`,
      'Content-Type': 'application/json',
      Accept: 'application/json',
    },
    timeout: '10s',
  };

  const response = http.post(url, null, params);
  rotationDuration.add(response.timings.duration);

  const { status } = response;
  const isSuccessStatus = status === 200 || status === 201;
  const isAuthConfigError = status === 401 || status === 403;

  if (isAuthConfigError) {
    console.warn(`Auth error ${status} — check ACCESS_TOKEN / AGENT_ID env vars`);
  }

  // Body is only inspected for non-auth, non-5xx responses.
  const hasNewCredential =
    !isAuthConfigError && status < 500 && bodyHasCredentialId(response.body);

  // Checks are for the end-of-run report; their combined result is not used.
  check(response, {
    'status is 200 or 201': () => isSuccessStatus,
    // Auth misconfiguration passes this check so it is not reported as an app failure.
    'response has new credential': () => isAuthConfigError || hasNewCredential,
    'response time < 500ms': (r) => r.timings.duration < 500,
  });

  const isApplicationError = !isAuthConfigError && !(isSuccessStatus && hasNewCredential);
  errorRate.add(isApplicationError);

  // Think-time between rotations — credential rotation is a lower-frequency op
  sleep(0.2);
}