Observability is the ability to understand a system's internal state from its external outputs. Metrics, logs, and traces form its three pillars.
The Three Pillars#
Metrics:
- Numerical measurements over time
- CPU, memory, request counts, latencies
- Aggregated and sampled
Logs:
- Discrete events with context
- Errors, requests, business events
- Detailed but voluminous
Traces:
- Request flow across services
- Timing and dependencies
- End-to-end visibility
Metrics with Prometheus#
import { Registry, Counter, Histogram, Gauge } from 'prom-client';
import type { Request, Response, NextFunction } from 'express';

const register = new Registry();

// Request counter
const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
  registers: [register],
});

// Request duration histogram
const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'path', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
  registers: [register],
});

// Active connections gauge
const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register],
});

// Middleware: record a count and a duration for every request
function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const start = Date.now();

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    const labels = {
      method: req.method,
      // Prefer the route pattern (/users/:id) over the raw path
      // to keep label cardinality bounded
      path: req.route?.path || req.path,
      status: res.statusCode.toString(),
    };

    httpRequestsTotal.inc(labels);
    httpRequestDuration.observe(labels, duration);
  });

  next();
}

// Expose metrics endpoint
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  res.send(await register.metrics());
});
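prom-client can also collect the default Node.js runtime metrics (CPU, memory, event-loop lag, GC) with a single call; this is where queries like process_resident_memory_bytes in the dashboard section below come from:

import { collectDefaultMetrics } from 'prom-client';

// Register the default process metrics on our custom registry
// so they are served from the same /metrics endpoint
collectDefaultMetrics({ register });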
Business Metrics#
// Track business-specific metrics. Note: pass registers explicitly,
// otherwise prom-client adds these to its global default registry and
// they never appear on the /metrics endpoint above.
const ordersCreated = new Counter({
  name: 'orders_created_total',
  help: 'Total orders created',
  labelNames: ['plan', 'country'],
  registers: [register],
});

const orderValue = new Histogram({
  name: 'order_value_dollars',
  help: 'Order value in dollars',
  buckets: [10, 50, 100, 500, 1000, 5000],
  registers: [register],
});

const activeUsers = new Gauge({
  name: 'active_users',
  help: 'Currently active users',
  registers: [register],
});

// Usage
async function createOrder(order: Order) {
  const result = await db.order.create({ data: order });

  ordersCreated.inc({
    plan: order.plan,
    country: order.country,
  });
  orderValue.observe(order.total);

  return result;
}
Structured Logging#
import pino from 'pino';

const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    level: (label) => ({ level: label }),
  },
  base: {
    service: 'api-server',
    version: process.env.APP_VERSION,
    environment: process.env.NODE_ENV,
  },
});

// Create a child logger with per-request context
// (assumes Request is augmented with id/user/log fields,
// e.g. via a request-id middleware and declaration merging)
function createRequestLogger(req: Request) {
  return logger.child({
    requestId: req.id,
    userId: req.user?.id,
    path: req.path,
    method: req.method,
  });
}

// Usage
app.use((req, res, next) => {
  const start = Date.now();
  req.log = createRequestLogger(req);
  req.log.info('Request started');

  res.on('finish', () => {
    req.log.info({
      statusCode: res.statusCode,
      duration: Date.now() - start,
    }, 'Request completed');
  });

  next();
});

// Log levels, from most to least verbose
logger.trace('Detailed debugging');
logger.debug('Debugging information');
logger.info('Normal operations');
logger.warn('Warning conditions');
logger.error({ err: error }, 'Error occurred');
logger.fatal('System unusable');
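A corollary of structured logging is that sensitive fields become machine-readable too, so redact them at the logger rather than hoping every call site remembers. pino supports this via the redact option, added to the pino() config above; the paths here are illustrative and should match your own payload shapes:

const logger = pino({
  // ...options from above, plus:
  // Censor known-sensitive fields before they reach the log sink
  redact: {
    paths: ['req.headers.authorization', 'req.headers.cookie', 'password'],
    censor: '[REDACTED]',
  },
});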
Distributed Tracing#
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';

// Initialize OpenTelemetry
const sdk = new NodeSDK({
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
  }),
  instrumentations: [getNodeAutoInstrumentations()],
  serviceName: 'api-server',
});

sdk.start();

// Manual spans
import { trace, SpanStatusCode } from '@opentelemetry/api';

const tracer = trace.getTracer('api-server');

async function processOrder(orderId: string) {
  return tracer.startActiveSpan('process-order', async (span) => {
    try {
      span.setAttribute('order.id', orderId);

      // Nested span; end it in finally so it closes even if validation throws
      await tracer.startActiveSpan('validate-order', async (validateSpan) => {
        try {
          await validateOrder(orderId);
        } finally {
          validateSpan.end();
        }
      });

      await tracer.startActiveSpan('charge-payment', async (paymentSpan) => {
        try {
          const result = await chargePayment(orderId);
          paymentSpan.setAttribute('payment.amount', result.amount);
        } finally {
          paymentSpan.end();
        }
      });

      span.setStatus({ code: SpanStatusCode.OK });
      return { success: true };
    } catch (error) {
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: (error as Error).message,
      });
      span.recordException(error as Error);
      throw error;
    } finally {
      span.end();
    }
  });
}
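The auto-instrumentations propagate trace context across HTTP calls for you; for hops they don't cover (a custom queue, say) you can carry the W3C trace context yourself with the propagation API. A sketch, where queue, message, and payload are hypothetical stand-ins:

import { context, propagation } from '@opentelemetry/api';

// Producer: inject the active trace context into the message headers
const headers: Record<string, string> = {};
propagation.inject(context.active(), headers);
await queue.publish({ body: payload, headers }); // hypothetical client

// Consumer: extract the context so new spans join the same trace
const parentContext = propagation.extract(context.active(), message.headers);
tracer.startActiveSpan('handle-message', {}, parentContext, (span) => {
  // ...process the message...
  span.end();
});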
Error Tracking#
import * as Sentry from '@sentry/node';

// Sentry SDK v7-style setup
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.APP_VERSION,
  tracesSampleRate: 0.1,
  integrations: [
    new Sentry.Integrations.Http({ tracing: true }),
    new Sentry.Integrations.Express({ app }),
    new Sentry.Integrations.Prisma({ client: prisma }),
  ],
});

// Request and tracing handlers must come before any routes
app.use(Sentry.Handlers.requestHandler());
app.use(Sentry.Handlers.tracingHandler());

// Error handler must come after the routes
app.use(Sentry.Handlers.errorHandler());

// Manual error capture
try {
  await riskyOperation();
} catch (error) {
  Sentry.captureException(error, {
    tags: { operation: 'risky-operation' },
    extra: { userId: user.id },
  });
  throw error;
}

// Capture message
Sentry.captureMessage('Important event occurred', {
  level: 'info',
  tags: { feature: 'checkout' },
});
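Sentry events can include request bodies, headers, and cookies, so it is worth scrubbing them before they leave your infrastructure. A sketch using the beforeSend hook, an option you would add to the Sentry.init call above (the fields removed here are examples):

Sentry.init({
  dsn: process.env.SENTRY_DSN,
  // Strip sensitive request data before each event is sent
  beforeSend(event) {
    if (event.request) {
      delete event.request.cookies;
      if (event.request.headers) {
        delete event.request.headers['authorization'];
      }
    }
    return event; // return null to drop the event entirely
  },
});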
Health Checks#
interface HealthCheck {
  name: string;
  check: () => Promise<{ healthy: boolean; details?: string }>;
}

const healthChecks: HealthCheck[] = [
  {
    name: 'database',
    check: async () => {
      try {
        await db.$queryRaw`SELECT 1`;
        return { healthy: true };
      } catch (error) {
        return { healthy: false, details: (error as Error).message };
      }
    },
  },
  {
    name: 'redis',
    check: async () => {
      try {
        await redis.ping();
        return { healthy: true };
      } catch (error) {
        return { healthy: false, details: (error as Error).message };
      }
    },
  },
  {
    name: 'external-api',
    check: async () => {
      try {
        const response = await fetch('https://api.external.com/health');
        return { healthy: response.ok };
      } catch (error) {
        return { healthy: false, details: (error as Error).message };
      }
    },
  },
];

app.get('/health', async (req, res) => {
  const results = await Promise.all(
    healthChecks.map(async (check) => ({
      name: check.name,
      ...(await check.check()),
    }))
  );

  const healthy = results.every((r) => r.healthy);

  res.status(healthy ? 200 : 503).json({
    status: healthy ? 'healthy' : 'unhealthy',
    checks: results,
    timestamp: new Date().toISOString(),
  });
});
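On an orchestrator like Kubernetes it often pays to split this endpoint into a liveness probe (is the process alive?) and a readiness probe (can it serve traffic?), so a flaky dependency drains traffic instead of restarting the process. A minimal sketch reusing the checks above:

// Liveness: process is up; no dependency checks, so a slow database
// never causes a restart loop
app.get('/health/live', (req, res) => {
  res.status(200).json({ status: 'alive' });
});

// Readiness: dependencies are reachable; a 503 removes the instance
// from the load balancer until the checks pass again
app.get('/health/ready', async (req, res) => {
  const results = await Promise.all(
    healthChecks.map(async (c) => ({ name: c.name, ...(await c.check()) }))
  );
  const ready = results.every((r) => r.healthy);
  res.status(ready ? 200 : 503).json({ ready, checks: results });
});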
Alerting Rules#
# prometheus-rules.yaml
groups:
  - name: api-alerts
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is {{ $value | humanizePercentage }}

      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High latency detected
          description: P95 latency is {{ $value }}s

      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Service is down
Dashboard Queries#
# Request rate
sum(rate(http_requests_total[5m])) by (path)

# Error rate
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))

# P50, P95, P99 latency
histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))

# Active connections
active_connections

# Memory usage
process_resident_memory_bytes
Best Practices#
Metrics:
✓ Use consistent naming conventions
✓ Add relevant labels
✓ Set appropriate buckets
✓ Monitor cardinality (see the sketch below)
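Cardinality is the one that bites hardest: every distinct label combination creates a new time series, so raw paths or user IDs as label values can explode storage. A common defense is to normalize dynamic path segments before labeling; the regexes here are illustrative:

// Collapse dynamic path segments so /users/123 and /users/456
// become one series under /users/:id
function normalizePath(path: string): string {
  return path
    // UUIDs first, so their digit runs aren't caught by the numeric rule
    .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, ':uuid')
    // then purely numeric segments like /orders/42
    .replace(/\/\d+(?=\/|$)/g, '/:id');
}

// e.g. in the metrics middleware, when req.route is unavailable:
// path: normalizePath(req.path)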
Logging:
✓ Use structured logging
✓ Include correlation IDs
✓ Log at appropriate levels
✓ Don't log sensitive data
Tracing:
✓ Propagate context across services
✓ Add meaningful span names
✓ Include relevant attributes
✓ Sample appropriately
Alerting:
✓ Alert on symptoms, not causes
✓ Include runbooks
✓ Avoid alert fatigue
✓ Test alerts regularly
Conclusion#
Observability requires metrics, logs, and traces working together. Start with basic health checks and metrics, add structured logging, then implement tracing for complex systems. Good observability reduces mean time to resolution and improves system reliability.