Rate limiting:
- Replace in-memory express-rate-limit with ioredis + rate-limiter-flexible (sliding window)
- Graceful fallback to RateLimiterMemory when Redis is unreachable
- RATE_LIMIT_WINDOW_MS / RATE_LIMIT_MAX_REQUESTS env var config
- Retry-After header on 429 responses
- agentidp_rate_limit_hits_total Prometheus counter

Database pool:
- Explicit pg.Pool config via DB_POOL_MAX/MIN/IDLE_TIMEOUT_MS/CONNECTION_TIMEOUT_MS
- Defaults: max=20, min=2, idle=30s, conn timeout=5s
- agentidp_db_pool_active_connections + agentidp_db_pool_waiting_requests gauges

Health endpoint:
- GET /health/detailed — per-service status (database, Redis, Vault, OPA)
- healthy / degraded (>1000ms) / unreachable classification
- HTTP 200 (all healthy) / 207 (any degraded) / 503 (any unreachable)

Load tests:
- tests/load/ with k6 scenarios for agent registration (100 VUs), token issuance (1000 VUs), credential rotation (50 VUs)
- npm run load-test script

Tests: 586 passing, zero TypeScript errors

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
150 lines
4.9 KiB
TypeScript
150 lines
4.9 KiB
TypeScript
/**
 * Shared Prometheus metrics registry for SentryAgent.ai AgentIdP.
 *
 * This is the ONLY file that defines metrics: every metric definition lives
 * here, and all other files import the specific metrics they use from it.
 */
|
|
|
|
import { Registry, Counter, Gauge, Histogram } from 'prom-client';
|
|
|
|
/**
 * Application-wide metrics registry that every metric in this module registers with.
 *
 * Do NOT use prom-client's default global registry instead — it conflicts with tests.
 */
export const metricsRegistry = new Registry();
|
|
|
|
/**
 * Counter of OAuth 2.0 access tokens that were issued successfully.
 *
 * Label:
 * - `scope` — the space-separated scope string.
 */
export const tokensIssuedTotal = new Counter({
  registers: [metricsRegistry],
  name: 'agentidp_tokens_issued_total',
  help: 'Total number of OAuth 2.0 access tokens issued successfully.',
  labelNames: ['scope'] as const,
});
|
|
|
|
/**
 * Counter of AI agents that were registered successfully.
 *
 * Label:
 * - `deployment_env`
 */
export const agentsRegisteredTotal = new Counter({
  registers: [metricsRegistry],
  name: 'agentidp_agents_registered_total',
  help: 'Total number of AI agents registered successfully.',
  labelNames: ['deployment_env'] as const,
});
|
|
|
|
/**
 * Counter of HTTP requests received by the service.
 *
 * Labels:
 * - `method`
 * - `route` — the normalised path
 * - `status_code`
 */
export const httpRequestsTotal = new Counter({
  registers: [metricsRegistry],
  name: 'agentidp_http_requests_total',
  help: 'Total number of HTTP requests received.',
  labelNames: ['method', 'route', 'status_code'] as const,
});
|
|
|
|
/**
 * Histogram of HTTP request durations, observed in seconds.
 *
 * Labels: `method`, `route`, `status_code`.
 * Bucket upper bounds run from 5 ms (0.005) up to 2.5 s.
 */
export const httpRequestDurationSeconds = new Histogram({
  registers: [metricsRegistry],
  name: 'agentidp_http_request_duration_seconds',
  help: 'HTTP request duration in seconds.',
  labelNames: ['method', 'route', 'status_code'] as const,
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
});
|
|
|
|
/**
 * Histogram of PostgreSQL query durations, observed in seconds.
 *
 * Label:
 * - `operation` — query/connect
 *
 * Bucket upper bounds run from 1 ms (0.001) up to 1 s.
 */
export const dbQueryDurationSeconds = new Histogram({
  registers: [metricsRegistry],
  name: 'agentidp_db_query_duration_seconds',
  help: 'PostgreSQL query duration in seconds.',
  labelNames: ['operation'] as const,
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
});
|
|
|
|
/**
 * Histogram of Redis command durations, observed in seconds.
 *
 * Label:
 * - `command` — get/set/incr/expire/ping/etc.
 *
 * Bucket upper bounds run from 0.5 ms (0.0005) up to 250 ms (0.25).
 */
export const redisCommandDurationSeconds = new Histogram({
  registers: [metricsRegistry],
  name: 'agentidp_redis_command_duration_seconds',
  help: 'Redis command duration in seconds.',
  labelNames: ['command'] as const,
  buckets: [0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25],
});
|
|
|
|
/**
 * Counter of webhook deliveries that ended up in the dead-letter state,
 * i.e. every retry attempt was used up without receiving a 2xx response.
 *
 * Label:
 * - `organization_id`
 */
export const webhookDeadLettersTotal = new Counter({
  registers: [metricsRegistry],
  name: 'agentidp_webhook_dead_letters_total',
  help: 'Total number of webhook deliveries that exhausted all retry attempts.',
  labelNames: ['organization_id'] as const,
});
|
|
|
|
/**
 * Counter of agent credentials detected as expiring within 7 days.
 *
 * Incremented by SecretsRotationJob on each scheduled check.
 *
 * Label:
 * - `agent_id`
 *
 * SOC 2 CC9.2 — Secrets Rotation monitoring.
 *
 * NOTE(review): a per-agent label creates one time series per agent —
 * confirm the resulting cardinality is acceptable for the expected fleet size.
 */
export const credentialsExpiringSoonTotal = new Counter({
  registers: [metricsRegistry],
  name: 'agentidp_credentials_expiring_soon_total',
  help: 'Total number of agent credentials detected as expiring within 7 days.',
  labelNames: ['agent_id'] as const,
});
|
|
|
|
/**
 * Binary gauge reporting the outcome of the most recent audit chain verification.
 *
 * AuditChainVerificationJob sets it to 1 when verification passes and 0 when it fails.
 * The gauge has no labels.
 *
 * SOC 2 CC7.2 — Audit Log Integrity monitoring.
 */
export const auditChainIntegrity = new Gauge({
  registers: [metricsRegistry],
  name: 'agentidp_audit_chain_integrity',
  help: 'Binary gauge: 1 = most recent audit chain verification passed, 0 = failed.',
});
|
|
|
|
/**
 * Counter of HTTP 429 responses returned by the rate limiter.
 *
 * Label:
 * - `endpoint` — req.path at the time of rejection
 *
 * NOTE(review): if req.path contains identifiers (e.g. resource IDs), each
 * distinct path becomes its own time series — confirm whether a normalised
 * route should be recorded instead.
 */
export const rateLimitHitsTotal = new Counter({
  registers: [metricsRegistry],
  name: 'agentidp_rate_limit_hits_total',
  help: 'Total number of HTTP 429 responses returned by the rate limiter.',
  labelNames: ['endpoint'] as const,
});
|
|
|
|
/**
 * Gauge of PostgreSQL pool connections that are currently active (checked out).
 *
 * Expected to be refreshed on the pool's `acquire` and `remove` events by the
 * module that owns the pool. The gauge has no labels.
 */
export const dbPoolActiveConnections = new Gauge({
  registers: [metricsRegistry],
  name: 'agentidp_db_pool_active_connections',
  help: 'Current number of active (checked-out) PostgreSQL pool connections.',
});
|
|
|
|
/**
 * Gauge of client requests currently queued waiting for a PostgreSQL pool connection.
 *
 * Expected to be refreshed whenever the pool queue length changes.
 * The gauge has no labels.
 */
export const dbPoolWaitingRequests = new Gauge({
  registers: [metricsRegistry],
  name: 'agentidp_db_pool_waiting_requests',
  help: 'Current number of requests waiting for a PostgreSQL connection.',
});
|