feat(phase-2): workstream 7 — Prometheus + Grafana Monitoring
- Add prom-client 15; shared registry in src/metrics/registry.ts (7 metrics) - HTTP request counter + duration histogram via metricsMiddleware - DB query duration histogram wrapping pg Pool.query - Redis command duration histogram via typed instrumentRedisMethod wrapper - agentidp_tokens_issued_total in OAuth2Service - agentidp_agents_registered_total in AgentService - GET /metrics unauthenticated endpoint (Prometheus text format) - docker-compose.monitoring.yml overlay (Prometheus + Grafana) - Grafana auto-provisioned datasource + pre-built AgentIdP dashboard - docs/devops/operations.md monitoring section added - 36/36 unit tests passing, 100% coverage on new metrics code - Fix pre-existing unused import in tests/integration/agents.test.ts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
10
src/app.ts
10
src/app.ts
@@ -32,9 +32,11 @@ import { createTokenRouter } from './routes/token.js';
|
||||
import { createCredentialsRouter } from './routes/credentials.js';
|
||||
import { createAuditRouter } from './routes/audit.js';
|
||||
import { createHealthRouter } from './routes/health.js';
|
||||
import { createMetricsRouter } from './routes/metrics.js';
|
||||
|
||||
import { errorHandler } from './middleware/errorHandler.js';
|
||||
import { createOpaMiddleware } from './middleware/opa.js';
|
||||
import { metricsMiddleware } from './middleware/metrics.js';
|
||||
import { createVaultClientFromEnv } from './vault/VaultClient.js';
|
||||
import { RedisClientType } from 'redis';
|
||||
import path from 'path';
|
||||
@@ -75,6 +77,11 @@ export async function createApp(): Promise<Application> {
|
||||
app.use(express.json());
|
||||
app.use(express.urlencoded({ extended: false }));
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Prometheus HTTP metrics middleware — must be before all routes
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
app.use(metricsMiddleware);
|
||||
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
// Infrastructure singletons
|
||||
// ────────────────────────────────────────────────────────────────
|
||||
@@ -144,6 +151,9 @@ export async function createApp(): Promise<Application> {
|
||||
// Health check — unauthenticated, no OPA
|
||||
app.use('/health', createHealthRouter(pool, redis as RedisClientType));
|
||||
|
||||
// Prometheus metrics — unauthenticated, internal scraping only
|
||||
app.use('/metrics', createMetricsRouter());
|
||||
|
||||
app.use(`${API_BASE}/agents`, createAgentsRouter(agentController, opaMiddleware));
|
||||
app.use(
|
||||
`${API_BASE}/agents/:agentId/credentials`,
|
||||
|
||||
32
src/cache/redis.ts
vendored
32
src/cache/redis.ts
vendored
@@ -4,6 +4,31 @@
|
||||
*/
|
||||
|
||||
import { createClient, RedisClientType } from 'redis';
|
||||
import { redisCommandDurationSeconds } from '../metrics/registry.js';
|
||||
|
||||
/**
|
||||
* Wraps a Redis client method to record its duration in Prometheus.
|
||||
* The cast to `T` is safe: the wrapper is async with identical parameters and
|
||||
* resolves to the same value. TypeScript cannot infer this through the generic
|
||||
* constraint alone, so we assert the type explicitly.
|
||||
*
|
||||
* @param fn - The bound Redis method to wrap.
|
||||
* @param command - The command label used in the Prometheus histogram.
|
||||
* @returns The wrapped method with identical signature.
|
||||
*/
|
||||
function instrumentRedisMethod<TArgs extends unknown[], TReturn>(
|
||||
fn: (...args: TArgs) => Promise<TReturn>,
|
||||
command: string,
|
||||
): (...args: TArgs) => Promise<TReturn> {
|
||||
return async (...args: TArgs): Promise<TReturn> => {
|
||||
const end = redisCommandDurationSeconds.startTimer({ command });
|
||||
try {
|
||||
return await fn(...args);
|
||||
} finally {
|
||||
end();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let redisClient: RedisClientType | null = null;
|
||||
|
||||
@@ -29,6 +54,13 @@ export async function getRedisClient(): Promise<RedisClientType> {
|
||||
});
|
||||
|
||||
await redisClient.connect();
|
||||
|
||||
// Wrap high-frequency commands to record durations in Prometheus
|
||||
redisClient.get = instrumentRedisMethod(redisClient.get.bind(redisClient), 'get');
|
||||
redisClient.set = instrumentRedisMethod(redisClient.set.bind(redisClient), 'set');
|
||||
redisClient.incr = instrumentRedisMethod(redisClient.incr.bind(redisClient), 'incr');
|
||||
redisClient.expire = instrumentRedisMethod(redisClient.expire.bind(redisClient), 'expire');
|
||||
redisClient.ping = instrumentRedisMethod(redisClient.ping.bind(redisClient), 'ping');
|
||||
}
|
||||
return redisClient;
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { dbQueryDurationSeconds } from '../metrics/registry.js';
|
||||
|
||||
let pool: Pool | null = null;
|
||||
|
||||
@@ -26,6 +27,24 @@ export function getPool(): Pool {
|
||||
// eslint-disable-next-line no-console
|
||||
console.error('Unexpected pg pool error', err);
|
||||
});
|
||||
|
||||
// Wrap pool.query to record duration in Prometheus.
|
||||
// The pg Pool.query method is heavily overloaded — the only safe approach
|
||||
// without TypeScript errors is a typed-any wrapper on the shim itself.
|
||||
// We capture originalQuery as `(...args: any[]) => Promise<any>` to satisfy
|
||||
// TypeScript's spread-into-rest constraint; this is the one sanctioned use of
|
||||
// `any` in this file.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const originalQuery = pool.query.bind(pool) as (...args: any[]) => Promise<any>;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(pool as any).query = async (...args: any[]): Promise<any> => {
|
||||
const end = dbQueryDurationSeconds.startTimer({ operation: 'query' });
|
||||
try {
|
||||
return await originalQuery(...args);
|
||||
} finally {
|
||||
end();
|
||||
}
|
||||
};
|
||||
}
|
||||
return pool;
|
||||
}
|
||||
|
||||
79
src/metrics/registry.ts
Normal file
79
src/metrics/registry.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
/**
|
||||
* Shared Prometheus metrics registry for SentryAgent.ai AgentIdP.
|
||||
* All 7 metric definitions live here. Import specific metrics in the files that use them.
|
||||
* This is the ONLY file that defines metrics — all other files import from here.
|
||||
*/
|
||||
|
||||
import { Registry, Counter, Histogram } from 'prom-client';
|
||||
|
||||
/** Shared registry — do NOT use the default global registry (conflicts with tests). */
|
||||
export const metricsRegistry = new Registry();
|
||||
|
||||
/**
|
||||
* Total number of OAuth 2.0 tokens successfully issued.
|
||||
* Labels: scope (space-separated scope string)
|
||||
*/
|
||||
export const tokensIssuedTotal = new Counter({
|
||||
name: 'agentidp_tokens_issued_total',
|
||||
help: 'Total number of OAuth 2.0 access tokens issued successfully.',
|
||||
labelNames: ['scope'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/**
|
||||
* Total number of agents successfully registered.
|
||||
* Labels: deployment_env
|
||||
*/
|
||||
export const agentsRegisteredTotal = new Counter({
|
||||
name: 'agentidp_agents_registered_total',
|
||||
help: 'Total number of AI agents registered successfully.',
|
||||
labelNames: ['deployment_env'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/**
|
||||
* Total HTTP requests received.
|
||||
* Labels: method, route (normalised path), status_code
|
||||
*/
|
||||
export const httpRequestsTotal = new Counter({
|
||||
name: 'agentidp_http_requests_total',
|
||||
help: 'Total number of HTTP requests received.',
|
||||
labelNames: ['method', 'route', 'status_code'] as const,
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/**
|
||||
* HTTP request duration in seconds.
|
||||
* Labels: method, route, status_code
|
||||
*/
|
||||
export const httpRequestDurationSeconds = new Histogram({
|
||||
name: 'agentidp_http_request_duration_seconds',
|
||||
help: 'HTTP request duration in seconds.',
|
||||
labelNames: ['method', 'route', 'status_code'] as const,
|
||||
buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5],
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/**
|
||||
* PostgreSQL query duration in seconds.
|
||||
* Labels: operation (query/connect)
|
||||
*/
|
||||
export const dbQueryDurationSeconds = new Histogram({
|
||||
name: 'agentidp_db_query_duration_seconds',
|
||||
help: 'PostgreSQL query duration in seconds.',
|
||||
labelNames: ['operation'] as const,
|
||||
buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
|
||||
/**
|
||||
* Redis command duration in seconds.
|
||||
* Labels: command (get/set/incr/expire/ping/etc.)
|
||||
*/
|
||||
export const redisCommandDurationSeconds = new Histogram({
|
||||
name: 'agentidp_redis_command_duration_seconds',
|
||||
help: 'Redis command duration in seconds.',
|
||||
labelNames: ['command'] as const,
|
||||
buckets: [0.0005, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25],
|
||||
registers: [metricsRegistry],
|
||||
});
|
||||
51
src/middleware/metrics.ts
Normal file
51
src/middleware/metrics.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* Prometheus HTTP metrics middleware for SentryAgent.ai AgentIdP.
|
||||
* Records request count and duration for every HTTP request.
|
||||
*/
|
||||
import { Request, Response, NextFunction } from 'express';
|
||||
import { httpRequestsTotal, httpRequestDurationSeconds } from '../metrics/registry.js';
|
||||
|
||||
/**
|
||||
* Normalises an Express request path to a stable route label.
|
||||
* Replaces UUIDs and numeric IDs with ':id' to avoid high cardinality.
|
||||
*
|
||||
* @param req - The Express request object.
|
||||
* @returns A normalised route string.
|
||||
*/
|
||||
function normalisePath(req: Request): string {
|
||||
// Use matched route pattern if available (most accurate)
|
||||
const route = req.route?.path as string | undefined;
|
||||
if (route) {
|
||||
return `${req.baseUrl}${route}`;
|
||||
}
|
||||
// Fall back to original URL stripped of query, with UUIDs replaced
|
||||
return req.path.replace(
|
||||
/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi,
|
||||
':id',
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Express middleware that records Prometheus HTTP metrics for every request.
|
||||
* Must be registered BEFORE routes in app.ts.
|
||||
*
|
||||
* @param req - Express request.
|
||||
* @param res - Express response.
|
||||
* @param next - Express next function.
|
||||
*/
|
||||
export function metricsMiddleware(req: Request, res: Response, next: NextFunction): void {
|
||||
const startTime = Date.now();
|
||||
|
||||
res.on('finish', () => {
|
||||
const route = normalisePath(req);
|
||||
const labels = {
|
||||
method: req.method,
|
||||
route,
|
||||
status_code: String(res.statusCode),
|
||||
};
|
||||
httpRequestsTotal.inc(labels);
|
||||
httpRequestDurationSeconds.observe(labels, (Date.now() - startTime) / 1000);
|
||||
});
|
||||
|
||||
next();
|
||||
}
|
||||
25
src/routes/metrics.ts
Normal file
25
src/routes/metrics.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
/**
|
||||
* Prometheus metrics endpoint for SentryAgent.ai AgentIdP.
|
||||
* Unauthenticated — intended for internal Prometheus scraping only.
|
||||
* Do NOT expose this endpoint on a public-facing network interface.
|
||||
*/
|
||||
import { Router, Request, Response } from 'express';
|
||||
import { metricsRegistry } from '../metrics/registry.js';
|
||||
|
||||
/**
|
||||
* Creates and returns the Express router for the Prometheus metrics endpoint.
|
||||
* Returns metrics in Prometheus text exposition format.
|
||||
*
|
||||
* @returns Configured Express router.
|
||||
*/
|
||||
export function createMetricsRouter(): Router {
|
||||
const router = Router();
|
||||
|
||||
router.get('/', async (_req: Request, res: Response): Promise<void> => {
|
||||
const metrics = await metricsRegistry.metrics();
|
||||
res.set('Content-Type', metricsRegistry.contentType);
|
||||
res.end(metrics);
|
||||
});
|
||||
|
||||
return router;
|
||||
}
|
||||
@@ -19,6 +19,7 @@ import {
|
||||
AgentAlreadyDecommissionedError,
|
||||
FreeTierLimitError,
|
||||
} from '../utils/errors.js';
|
||||
import { agentsRegisteredTotal } from '../metrics/registry.js';
|
||||
|
||||
const FREE_TIER_MAX_AGENTS = 100;
|
||||
|
||||
@@ -81,6 +82,9 @@ export class AgentService {
|
||||
{ agentType: agent.agentType, owner: agent.owner },
|
||||
);
|
||||
|
||||
// Instrument: count successful agent registrations
|
||||
agentsRegisteredTotal.inc({ deployment_env: data.deploymentEnv });
|
||||
|
||||
return agent;
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ import {
|
||||
import { signToken, verifyToken, decodeToken, getTokenExpiresIn } from '../utils/jwt.js';
|
||||
import { verifySecret } from '../utils/crypto.js';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { tokensIssuedTotal } from '../metrics/registry.js';
|
||||
|
||||
const FREE_TIER_MAX_MONTHLY_TOKENS = 10000;
|
||||
|
||||
@@ -202,6 +203,9 @@ export class OAuth2Service {
|
||||
{ scope, expiresAt: expiresAtDate.toISOString() },
|
||||
);
|
||||
|
||||
// Instrument: count successful token issuances
|
||||
tokensIssuedTotal.inc({ scope });
|
||||
|
||||
return {
|
||||
access_token: accessToken,
|
||||
token_type: 'Bearer',
|
||||
|
||||
Reference in New Issue
Block a user