Files
sentryagent-idp/src/metrics/registry.ts
SentryAgent.ai Developer 1b682c22b2 feat(phase-4): WS1 — Production Hardening (Redis rate limiting, DB pool, health endpoint, k6)
Rate limiting:
- Replace in-memory express-rate-limit with ioredis + rate-limiter-flexible (sliding window)
- Graceful fallback to RateLimiterMemory when Redis unreachable
- RATE_LIMIT_WINDOW_MS / RATE_LIMIT_MAX_REQUESTS env var config
- Retry-After header on 429 responses
- agentidp_rate_limit_hits_total Prometheus counter

Database pool:
- Explicit pg.Pool config via DB_POOL_MAX/MIN/IDLE_TIMEOUT_MS/CONNECTION_TIMEOUT_MS
- Defaults: max=20, min=2, idle=30s, conn timeout=5s
- agentidp_db_pool_active_connections + agentidp_db_pool_waiting_requests gauges

Health endpoint:
- GET /health/detailed — per-service status (database, Redis, Vault, OPA)
- healthy / degraded (>1000ms) / unreachable classification
- HTTP 200 (all healthy) / 207 (any degraded) / 503 (any unreachable)

Load tests:
- tests/load/ with k6 scenarios for agent registration (100 VUs), token issuance (1000 VUs), credential rotation (50 VUs)
- npm run load-test script

Tests: 586 passing, zero TypeScript errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 04:20:37 +00:00

150 lines
4.9 KiB
TypeScript

/**
* Shared Prometheus metrics registry for SentryAgent.ai AgentIdP.
* All metric definitions live here. Import specific metrics in the files that use them.
* This is the ONLY file that defines metrics — all other files import from here.
*/
import { Registry, Counter, Gauge, Histogram } from 'prom-client';
/**
 * Shared registry that the metrics defined in this file attach to through
 * their `registers: [metricsRegistry]` option.
 *
 * Do NOT use prom-client's default global registry instead — it conflicts
 * with tests.
 */
export const metricsRegistry = new Registry();
/**
 * Counter of OAuth 2.0 access tokens that were issued successfully.
 *
 * Label:
 *  - `scope` — the space-separated scope string of the issued token.
 */
export const tokensIssuedTotal = new Counter({
  name: 'agentidp_tokens_issued_total',
  help: 'Total number of OAuth 2.0 access tokens issued successfully.',
  registers: [metricsRegistry],
  labelNames: ['scope'] as const,
});
/**
 * Counter of AI agents that were registered successfully.
 *
 * Label:
 *  - `deployment_env` — deployment environment associated with the agent.
 */
export const agentsRegisteredTotal = new Counter({
  name: 'agentidp_agents_registered_total',
  help: 'Total number of AI agents registered successfully.',
  registers: [metricsRegistry],
  labelNames: ['deployment_env'] as const,
});
/**
 * Counter of HTTP requests received by the service.
 *
 * Labels:
 *  - `method`      — HTTP method.
 *  - `route`       — normalised request path.
 *  - `status_code` — HTTP response status code.
 */
export const httpRequestsTotal = new Counter({
  name: 'agentidp_http_requests_total',
  help: 'Total number of HTTP requests received.',
  registers: [metricsRegistry],
  labelNames: ['method', 'route', 'status_code'] as const,
});
/**
 * Histogram of HTTP request durations, measured in seconds.
 *
 * Labels (same set as `agentidp_http_requests_total`):
 *  - `method`, `route`, `status_code`
 */
export const httpRequestDurationSeconds = new Histogram({
  name: 'agentidp_http_request_duration_seconds',
  help: 'HTTP request duration in seconds.',
  registers: [metricsRegistry],
  labelNames: ['method', 'route', 'status_code'] as const,
  // Upper bounds in seconds, from 5 ms up to 2.5 s; anything slower is only
  // counted in the implicit +Inf bucket.
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
});
/**
 * Histogram of PostgreSQL operation durations, measured in seconds.
 *
 * Label:
 *  - `operation` — kind of database operation (query / connect).
 */
export const dbQueryDurationSeconds = new Histogram({
  name: 'agentidp_db_query_duration_seconds',
  help: 'PostgreSQL query duration in seconds.',
  registers: [metricsRegistry],
  labelNames: ['operation'] as const,
  // Upper bounds in seconds, from 1 ms up to 1 s.
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
});
/**
 * Histogram of Redis command durations, measured in seconds.
 *
 * Label:
 *  - `command` — Redis command name (get / set / incr / expire / ping / etc.).
 */
export const redisCommandDurationSeconds = new Histogram({
  name: 'agentidp_redis_command_duration_seconds',
  help: 'Redis command duration in seconds.',
  registers: [metricsRegistry],
  labelNames: ['command'] as const,
  // Upper bounds in seconds, from 0.5 ms up to 250 ms.
  buckets: [0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25],
});
/**
 * Counter of webhook deliveries that ended up dead-lettered, i.e. every retry
 * attempt was used up without receiving a 2xx response.
 *
 * Label:
 *  - `organization_id` — organization the delivery belongs to.
 *
 * NOTE(review): one time series is created per distinct `organization_id`;
 * confirm the expected number of organizations keeps cardinality acceptable.
 */
export const webhookDeadLettersTotal = new Counter({
  name: 'agentidp_webhook_dead_letters_total',
  help: 'Total number of webhook deliveries that exhausted all retry attempts.',
  registers: [metricsRegistry],
  labelNames: ['organization_id'] as const,
});
/**
 * Counter of agent credentials found to be expiring within the next 7 days.
 * Intended to be incremented by SecretsRotationJob on each scheduled check.
 *
 * Label:
 *  - `agent_id` — agent owning the expiring credential.
 *
 * SOC 2 CC9.2 — Secrets Rotation monitoring.
 *
 * NOTE(review): one time series is created per distinct `agent_id`; confirm
 * the expected number of agents keeps cardinality acceptable.
 */
export const credentialsExpiringSoonTotal = new Counter({
  name: 'agentidp_credentials_expiring_soon_total',
  help: 'Total number of agent credentials detected as expiring within 7 days.',
  registers: [metricsRegistry],
  labelNames: ['agent_id'] as const,
});
/**
 * Binary gauge reporting the outcome of the latest audit chain verification:
 * 1 means it passed, 0 means it failed. Intended to be set by
 * AuditChainVerificationJob.
 *
 * Declares no labels.
 *
 * SOC 2 CC7.2 — Audit Log Integrity monitoring.
 */
export const auditChainIntegrity = new Gauge({
  name: 'agentidp_audit_chain_integrity',
  help: 'Binary gauge: 1 = most recent audit chain verification passed, 0 = failed.',
  registers: [metricsRegistry],
});
/**
 * Counter of HTTP 429 responses produced by the rate limiter.
 *
 * Label:
 *  - `endpoint` — `req.path` at the moment the request was rejected.
 *
 * NOTE(review): if `req.path` is recorded raw (with embedded IDs) rather than
 * as a normalised route, the label set is unbounded — confirm at the call site.
 */
export const rateLimitHitsTotal = new Counter({
  name: 'agentidp_rate_limit_hits_total',
  help: 'Total number of HTTP 429 responses returned by the rate limiter.',
  registers: [metricsRegistry],
  labelNames: ['endpoint'] as const,
});
/**
 * Gauge of PostgreSQL pool connections that are currently active
 * (checked out). Declares no labels.
 *
 * Intended to be refreshed on the pool's `acquire` and `remove` events.
 *
 * NOTE(review): a checked-out client is normally handed back via release
 * rather than removal — confirm the code that updates this gauge also
 * accounts for released clients.
 */
export const dbPoolActiveConnections = new Gauge({
  name: 'agentidp_db_pool_active_connections',
  help: 'Current number of active (checked-out) PostgreSQL pool connections.',
  registers: [metricsRegistry],
});
/**
 * Gauge of client requests currently queued while waiting for a PostgreSQL
 * pool connection. Declares no labels.
 *
 * Intended to be refreshed every time the pool's queue length changes.
 */
export const dbPoolWaitingRequests = new Gauge({
  name: 'agentidp_db_pool_waiting_requests',
  help: 'Current number of requests waiting for a PostgreSQL connection.',
  registers: [metricsRegistry],
});