feat(phase-2): workstream 7 — Prometheus + Grafana Monitoring

- Add prom-client 15; shared registry in src/metrics/registry.ts (7 metrics)
- HTTP request counter + duration histogram via metricsMiddleware
- DB query duration histogram wrapping pg Pool.query
- Redis command duration histogram via typed instrumentRedisMethod wrapper
- agentidp_tokens_issued_total in OAuth2Service
- agentidp_agents_registered_total in AgentService
- GET /metrics unauthenticated endpoint (Prometheus text format)
- docker-compose.monitoring.yml overlay (Prometheus + Grafana)
- Grafana auto-provisioned datasource + pre-built AgentIdP dashboard
- docs/devops/operations.md monitoring section added
- 36/36 unit tests passing, 100% coverage on new metrics code
- Fix pre-existing unused import in tests/integration/agents.test.ts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
SentryAgent.ai Developer
2026-03-29 06:13:41 +00:00
parent 7d6e248a14
commit a504964e5f
21 changed files with 1053 additions and 15 deletions

View File

@@ -32,9 +32,11 @@ import { createTokenRouter } from './routes/token.js';
import { createCredentialsRouter } from './routes/credentials.js';
import { createAuditRouter } from './routes/audit.js';
import { createHealthRouter } from './routes/health.js';
import { createMetricsRouter } from './routes/metrics.js';
import { errorHandler } from './middleware/errorHandler.js';
import { createOpaMiddleware } from './middleware/opa.js';
import { metricsMiddleware } from './middleware/metrics.js';
import { createVaultClientFromEnv } from './vault/VaultClient.js';
import { RedisClientType } from 'redis';
import path from 'path';
@@ -75,6 +77,11 @@ export async function createApp(): Promise<Application> {
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
// ────────────────────────────────────────────────────────────────
// Prometheus HTTP metrics middleware — must be before all routes
// ────────────────────────────────────────────────────────────────
app.use(metricsMiddleware);
// ────────────────────────────────────────────────────────────────
// Infrastructure singletons
// ────────────────────────────────────────────────────────────────
@@ -144,6 +151,9 @@ export async function createApp(): Promise<Application> {
// Health check — unauthenticated, no OPA
app.use('/health', createHealthRouter(pool, redis as RedisClientType));
// Prometheus metrics — unauthenticated, internal scraping only
app.use('/metrics', createMetricsRouter());
app.use(`${API_BASE}/agents`, createAgentsRouter(agentController, opaMiddleware));
app.use(
`${API_BASE}/agents/:agentId/credentials`,

32
src/cache/redis.ts vendored
View File

@@ -4,6 +4,31 @@
*/
import { createClient, RedisClientType } from 'redis';
import { redisCommandDurationSeconds } from '../metrics/registry.js';
/**
 * Decorates an async Redis client method so that every invocation is timed
 * and recorded in the `redisCommandDurationSeconds` histogram.
 *
 * The timer starts immediately before the wrapped method is invoked and is
 * stopped in a `finally` clause, so fulfilled and rejected calls are both
 * observed. The resolved value (or the thrown error) reaches the caller
 * unchanged.
 *
 * @param fn - The bound Redis method to wrap.
 * @param command - Value of the `command` label attached to each observation.
 * @returns A function with the same parameters and resolved type as `fn`.
 */
function instrumentRedisMethod<TArgs extends unknown[], TReturn>(
  fn: (...args: TArgs) => Promise<TReturn>,
  command: string,
): (...args: TArgs) => Promise<TReturn> {
  return async (...callArgs: TArgs): Promise<TReturn> => {
    const stopTimer = redisCommandDurationSeconds.startTimer({ command });
    try {
      const reply = await fn(...callArgs);
      return reply;
    } finally {
      // Runs on success and on failure, so errors are timed as well.
      stopTimer();
    }
  };
}
let redisClient: RedisClientType | null = null;
@@ -29,6 +54,13 @@ export async function getRedisClient(): Promise<RedisClientType> {
});
await redisClient.connect();
// Wrap high-frequency commands to record durations in Prometheus
redisClient.get = instrumentRedisMethod(redisClient.get.bind(redisClient), 'get');
redisClient.set = instrumentRedisMethod(redisClient.set.bind(redisClient), 'set');
redisClient.incr = instrumentRedisMethod(redisClient.incr.bind(redisClient), 'incr');
redisClient.expire = instrumentRedisMethod(redisClient.expire.bind(redisClient), 'expire');
redisClient.ping = instrumentRedisMethod(redisClient.ping.bind(redisClient), 'ping');
}
return redisClient;
}

View File

@@ -4,6 +4,7 @@
*/
import { Pool } from 'pg';
import { dbQueryDurationSeconds } from '../metrics/registry.js';
let pool: Pool | null = null;
@@ -26,6 +27,24 @@ export function getPool(): Pool {
// eslint-disable-next-line no-console
console.error('Unexpected pg pool error', err);
});
// Wrap pool.query to record duration in Prometheus.
// The pg Pool.query method is heavily overloaded — the only safe approach
// without TypeScript errors is a typed-any wrapper on the shim itself.
// We capture originalQuery as `(...args: any[]) => Promise<any>` to satisfy
// TypeScript's spread-into-rest constraint; this is the one sanctioned use of
// `any` in this file.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const originalQuery = pool.query.bind(pool) as (...args: any[]) => Promise<any>;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(pool as any).query = async (...args: any[]): Promise<any> => {
const end = dbQueryDurationSeconds.startTimer({ operation: 'query' });
try {
return await originalQuery(...args);
} finally {
end();
}
};
}
return pool;
}

79
src/metrics/registry.ts Normal file
View File

@@ -0,0 +1,79 @@
/**
* Shared Prometheus metrics registry for SentryAgent.ai AgentIdP.
* All metric definitions live here. Import specific metrics in the files that use them.
* This is the ONLY file that defines metrics — all other files import from here.
*/
import { Registry, Counter, Histogram } from 'prom-client';
/**
* Shared registry that every metric defined in this module registers itself with.
* Do NOT use the default global registry (conflicts with tests).
* The /metrics route serialises this registry's contents for Prometheus scraping.
*/
export const metricsRegistry = new Registry();
/**
* Total number of OAuth 2.0 tokens successfully issued.
* Labels: scope (space-separated scope string)
*
* Incremented by OAuth2Service after a token has been issued, with the token's
* scope as the label value.
* NOTE(review): every distinct scope string becomes its own time series —
* confirm the set of scope combinations is bounded.
*/
export const tokensIssuedTotal = new Counter({
name: 'agentidp_tokens_issued_total',
help: 'Total number of OAuth 2.0 access tokens issued successfully.',
labelNames: ['scope'] as const,
registers: [metricsRegistry],
});
/**
* Total number of agents successfully registered.
* Labels: deployment_env
*
* Incremented by AgentService after a registration succeeds; the label value is
* the `deploymentEnv` field of the registration data.
*/
export const agentsRegisteredTotal = new Counter({
name: 'agentidp_agents_registered_total',
help: 'Total number of AI agents registered successfully.',
labelNames: ['deployment_env'] as const,
registers: [metricsRegistry],
});
/**
* Total HTTP requests received.
* Labels: method, route (normalised path), status_code
*
* Incremented by metricsMiddleware when a response emits 'finish', so a request
* is counted once its response has been sent.
*/
export const httpRequestsTotal = new Counter({
name: 'agentidp_http_requests_total',
help: 'Total number of HTTP requests received.',
labelNames: ['method', 'route', 'status_code'] as const,
registers: [metricsRegistry],
});
/**
* HTTP request duration in seconds.
* Labels: method, route, status_code
*
* Observed by metricsMiddleware with the same label set as the request counter.
* Bucket upper bounds range from 5 ms to 2.5 s; slower requests are only
* counted in the implicit +Inf bucket.
*/
export const httpRequestDurationSeconds = new Histogram({
name: 'agentidp_http_request_duration_seconds',
help: 'HTTP request duration in seconds.',
labelNames: ['method', 'route', 'status_code'] as const,
buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
registers: [metricsRegistry],
});
/**
* PostgreSQL query duration in seconds.
* Labels: operation (query/connect)
*
* Observed by the wrapper installed around the pg pool's `query` method, which
* records with operation 'query'.
* NOTE(review): no code recording operation 'connect' is visible here — confirm
* whether that label value is actually emitted.
* Bucket upper bounds range from 1 ms to 1 s.
*/
export const dbQueryDurationSeconds = new Histogram({
name: 'agentidp_db_query_duration_seconds',
help: 'PostgreSQL query duration in seconds.',
labelNames: ['operation'] as const,
buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
registers: [metricsRegistry],
});
/**
* Redis command duration in seconds.
* Labels: command (get/set/incr/expire/ping/etc.)
*
* Observed by the instrumentRedisMethod wrapper in the Redis cache module, which
* is applied to the client's get, set, incr, expire and ping methods.
* Bucket upper bounds range from 0.5 ms to 250 ms.
*/
export const redisCommandDurationSeconds = new Histogram({
name: 'agentidp_redis_command_duration_seconds',
help: 'Redis command duration in seconds.',
labelNames: ['command'] as const,
buckets: [0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25],
registers: [metricsRegistry],
});

51
src/middleware/metrics.ts Normal file
View File

@@ -0,0 +1,51 @@
/**
* Prometheus HTTP metrics middleware for SentryAgent.ai AgentIdP.
* Records request count and duration for every HTTP request.
*/
import { Request, Response, NextFunction } from 'express';
import { httpRequestsTotal, httpRequestDurationSeconds } from '../metrics/registry.js';
/**
 * Normalises an Express request path to a stable route label.
 *
 * When Express matched a route, the route pattern (for example
 * `/api/v1/agents/:agentId`) is used because it is already parameterised.
 * Otherwise the concrete request path is used, with UUIDs and purely numeric
 * path segments replaced by `:id` so that identifiers do not create a new
 * time series per request.
 *
 * @param req - The Express request object.
 * @returns A normalised route string.
 */
function normalisePath(req: Request): string {
  // Use matched route pattern if available (most accurate).
  const route = req.route?.path as string | undefined;
  if (route) {
    return `${req.baseUrl}${route}`;
  }

  // No route matched: fall back to the path (query string already excluded by
  // `req.path`) and collapse identifier-like segments.
  return req.path
    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, ':id')
    // Only whole segments made of digits are replaced, so `/api/v1` is untouched.
    .replace(/\/\d+(?=\/|$)/g, '/:id');
}
/**
 * Express middleware that records Prometheus HTTP metrics for every request.
 * Must be registered BEFORE routes in app.ts.
 *
 * A listener on the response's 'finish' event increments the request counter
 * and observes the request duration, both labelled with the HTTP method, the
 * normalised route and the final status code. Labels are computed inside the
 * listener because the matched route and status code are only known once the
 * request has been handled.
 *
 * Elapsed time is taken from `process.hrtime.bigint()`, a monotonic clock, so
 * system clock adjustments cannot produce negative or inflated durations and
 * sub-millisecond requests are not rounded to zero.
 *
 * @param req - Express request.
 * @param res - Express response.
 * @param next - Express next function.
 */
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
  const startedAt = process.hrtime.bigint();

  res.on('finish', () => {
    // Nanoseconds -> seconds; request durations are far below the range where
    // the bigint-to-number conversion loses precision.
    const elapsedSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;
    const labels = {
      method: req.method,
      route: normalisePath(req),
      status_code: String(res.statusCode),
    };

    httpRequestsTotal.inc(labels);
    httpRequestDurationSeconds.observe(labels, elapsedSeconds);
  });

  next();
}

25
src/routes/metrics.ts Normal file
View File

@@ -0,0 +1,25 @@
/**
* Prometheus metrics endpoint for SentryAgent.ai AgentIdP.
* Unauthenticated — intended for internal Prometheus scraping only.
* Do NOT expose this endpoint on a public-facing network interface.
*/
import { Router, Request, Response } from 'express';
import { metricsRegistry } from '../metrics/registry.js';
/**
 * Creates and returns the Express router for the Prometheus metrics endpoint.
 * Returns metrics in Prometheus text exposition format.
 *
 * If collecting the metrics fails, the handler answers with HTTP 500 and a
 * short plain-text message instead of leaving the request unanswered.
 *
 * @returns Configured Express router.
 */
export function createMetricsRouter(): Router {
  const router = Router();

  router.get('/', async (_req: Request, res: Response): Promise<void> => {
    try {
      const metrics = await metricsRegistry.metrics();
      res.set('Content-Type', metricsRegistry.contentType);
      res.end(metrics);
    } catch (err: unknown) {
      // A rejection escaping an async handler is not turned into a response by
      // Express 4, so the scraper would wait until its own timeout. Answer
      // explicitly so the failure is visible as a failed scrape.
      const reason = err instanceof Error ? err.message : String(err);
      res.status(500).end(`Failed to collect metrics: ${reason}`);
    }
  });

  return router;
}

View File

@@ -19,6 +19,7 @@ import {
AgentAlreadyDecommissionedError,
FreeTierLimitError,
} from '../utils/errors.js';
import { agentsRegisteredTotal } from '../metrics/registry.js';
const FREE_TIER_MAX_AGENTS = 100;
@@ -81,6 +82,9 @@ export class AgentService {
{ agentType: agent.agentType, owner: agent.owner },
);
// Instrument: count successful agent registrations
agentsRegisteredTotal.inc({ deployment_env: data.deploymentEnv });
return agent;
}

View File

@@ -22,6 +22,7 @@ import {
import { signToken, verifyToken, decodeToken, getTokenExpiresIn } from '../utils/jwt.js';
import { verifySecret } from '../utils/crypto.js';
import { v4 as uuidv4 } from 'uuid';
import { tokensIssuedTotal } from '../metrics/registry.js';
const FREE_TIER_MAX_MONTHLY_TOKENS = 10000;
@@ -202,6 +203,9 @@ export class OAuth2Service {
{ scope, expiresAt: expiresAtDate.toISOString() },
);
// Instrument: count successful token issuances
tokensIssuedTotal.inc({ scope });
return {
access_token: accessToken,
token_type: 'Bearer',