Observability is the ability to understand a system's internal behavior from its external outputs. This guide covers implementing the three pillars of observability: logs, metrics, and traces.
The Three Pillars#
┌─────────────────────────────────────────────────────┐
│ OBSERVABILITY │
├─────────────────┬─────────────────┬─────────────────┤
│ LOGS │ METRICS │ TRACES │
│ │ │ │
│ What happened │ How much/ │ Request flow │
│ in detail │ how often │ across │
│ │ │ services │
└─────────────────┴─────────────────┴─────────────────┘
Structured Logging#
Logger Implementation#
import pino from 'pino';

// Base logger: structured JSON output with service metadata attached to
// every line. Exported so other modules (e.g. the error handler) can fall
// back to it when no per-request logger exists.
export const logger = pino({
  // Allow runtime tuning via LOG_LEVEL; default to 'info'.
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level name ('info') instead of pino's numeric level (30).
    level: (label) => ({ level: label }),
  },
  // Fields included in every log line for cross-service filtering.
  base: {
    service: process.env.SERVICE_NAME,
    version: process.env.APP_VERSION,
    environment: process.env.NODE_ENV,
  },
  // ISO-8601 timestamps are human-readable and sortable in log queries.
  timestamp: pino.stdTimeFunctions.isoTime,
});

/**
 * Create a child logger that stamps the given context (e.g. requestId,
 * userId) onto every subsequent log line.
 */
export function createLogger(context: Record<string, unknown>) {
  return logger.child(context);
}

// Usage — inside a request handler, where `req` is in scope:
//
//   const requestLogger = createLogger({
//     requestId: req.id,
//     userId: req.user?.id,
//   });
//
//   requestLogger.info({ path: req.path, method: req.method }, 'Request received');

Request Logging Middleware#
import { Request, Response, NextFunction } from 'express';
import { randomUUID } from 'node:crypto';
import { createLogger } from './logger';

/**
 * Express middleware that attaches a context-rich child logger to each
 * request and logs request start/completion with timing.
 *
 * Honors an incoming `x-request-id` header so IDs correlate across
 * services; generates a fresh UUID otherwise.
 */
export function requestLogging() {
  return (req: Request, res: Response, next: NextFunction) => {
    const startTime = Date.now();

    // Node parses repeated headers as string[]; take the first value.
    const incomingId = req.headers['x-request-id'];
    const requestId =
      (Array.isArray(incomingId) ? incomingId[0] : incomingId) || randomUUID();

    const log = createLogger({
      requestId,
      method: req.method,
      path: req.path,
      userAgent: req.headers['user-agent'],
      ip: req.ip,
    });

    // Expose the logger to downstream handlers (requires augmenting
    // Express's Request type with a `log` property).
    req.log = log;

    log.info('Request started');

    // 'finish' fires once the response is fully handed to the OS, so the
    // duration covers the whole handler pipeline.
    res.on('finish', () => {
      const duration = Date.now() - startTime;

      log.info(
        {
          statusCode: res.statusCode,
          duration,
          contentLength: res.get('content-length'),
        },
        'Request completed'
      );
    });

    next();
  };
}

Error Logging#
1export function errorHandler(
2 err: Error,
3 req: Request,
4 res: Response,
5 next: NextFunction
6) {
7 const log = req.log || logger;
8
9 log.error({
10 error: {
11 name: err.name,
12 message: err.message,
13 stack: err.stack,
14 },
15 request: {
16 method: req.method,
17 path: req.path,
18 query: req.query,
19 body: sanitizeBody(req.body),
20 },
21 }, 'Unhandled error');
22
23 res.status(500).json({ error: 'Internal server error' });
24}
25
/**
 * Return a shallow copy of a request body with known sensitive fields
 * masked, so the body can be logged safely.
 *
 * Note: only top-level keys are redacted; nested objects are not scanned.
 */
function sanitizeBody(body: unknown): unknown {
  // Non-object bodies (undefined, strings, buffers, arrays) cannot carry
  // the fields we redact — pass them through untouched rather than
  // spreading them into a malformed object.
  if (body === null || typeof body !== 'object' || Array.isArray(body)) {
    return body;
  }

  const sensitiveFields = ['password', 'token', 'creditCard'];
  const sanitized: Record<string, unknown> = {
    ...(body as Record<string, unknown>),
  };

  for (const field of sensitiveFields) {
    if (field in sanitized) {
      sanitized[field] = '[REDACTED]';
    }
  }

  return sanitized;
}

Metrics with Prometheus#
Custom Metrics#
import client from 'prom-client';

// Enable collection of default Node.js process metrics
// (event loop lag, GC pauses, heap usage, open fds, etc.).
client.collectDefaultMetrics();

// Request latency distribution. Buckets span 10ms–5s, covering both fast
// cache hits and slow upstream calls; histogram_quantile() in Prometheus
// derives percentiles from these buckets server-side.
export const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
});

// Request throughput, labeled for RED-style rate/error-rate queries.
export const httpRequestTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status'],
});

// Point-in-time connection count (a gauge may go up and down).
export const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
});

// Client-side percentiles for order values. A Summary is used instead of
// a Histogram because dollar amounts have no natural bucket boundaries.
export const orderValue = new client.Summary({
  name: 'order_value_dollars',
  help: 'Value of orders in dollars',
  percentiles: [0.5, 0.9, 0.99],
});

Metrics Middleware#
/**
 * Express middleware that records duration and count for every request.
 */
export function metricsMiddleware() {
  return (req: Request, res: Response, next: NextFunction) => {
    // process.hrtime gives monotonic, sub-millisecond resolution
    // (Date.now can jump with wall-clock adjustments).
    const start = process.hrtime();

    res.on('finish', () => {
      const [seconds, nanoseconds] = process.hrtime(start);
      const duration = seconds + nanoseconds / 1e9;

      // Use the route pattern ('/users/:id') when Express matched one.
      // NOTE(review): the raw req.path fallback can explode label
      // cardinality for unmatched URLs — consider a fixed 'unmatched'
      // value instead.
      const route = req.route?.path || req.path;
      const labels = {
        method: req.method,
        route,
        status: res.statusCode.toString(),
      };

      httpRequestDuration.observe(labels, duration);
      httpRequestTotal.inc(labels);
    });

    next();
  };
}

// Expose metrics for Prometheus to scrape (`app` is your Express app).
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.send(await client.register.metrics());
});

Business Metrics#
// Business-level metrics: these answer product questions ("are orders
// dropping after the release?") rather than infrastructure questions.
export const businessMetrics = {
  orderCreated: new client.Counter({
    name: 'orders_created_total',
    help: 'Total orders created',
    labelNames: ['payment_method', 'customer_type'],
  }),

  checkoutAbandoned: new client.Counter({
    name: 'checkout_abandoned_total',
    help: 'Total abandoned checkouts',
    labelNames: ['step'],
  }),

  inventoryLevel: new client.Gauge({
    name: 'inventory_level',
    help: 'Current inventory level',
    // NOTE(review): product_id may be high-cardinality — watch series
    // growth if the catalog is large.
    labelNames: ['product_id', 'warehouse'],
  }),
};

// Usage
businessMetrics.orderCreated.inc({
  payment_method: 'credit_card',
  customer_type: 'returning',
});

Distributed Tracing#
OpenTelemetry Setup#
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';

const sdk = new NodeSDK({
  // Resource attributes identify this service on every exported span.
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'order-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV,
  }),
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTLP_ENDPOINT,
  }),
  instrumentations: [
    getNodeAutoInstrumentations({
      // fs instrumentation is extremely noisy (a span per file read).
      '@opentelemetry/instrumentation-fs': { enabled: false },
    }),
  ],
});

sdk.start();

// Flush buffered spans before the process exits; without this, the last
// few seconds of traces are silently dropped on every deploy.
process.on('SIGTERM', () => {
  sdk.shutdown().finally(() => process.exit(0));
});

Custom Spans#
import { trace, SpanStatusCode, context } from '@opentelemetry/api';

const tracer = trace.getTracer('order-service');

/**
 * Process an order under a parent span, with child spans for each
 * sub-operation so slow steps show up individually in the trace.
 */
async function processOrder(orderId: string) {
  return tracer.startActiveSpan('processOrder', async (span) => {
    try {
      span.setAttribute('order.id', orderId);

      // Child span for payment. end() runs in finally so the span is
      // closed even when charge() throws — otherwise the span leaks and
      // never appears complete in the trace.
      await tracer.startActiveSpan('chargePayment', async (paymentSpan) => {
        try {
          const result = await paymentService.charge(orderId);
          paymentSpan.setAttribute('payment.id', result.paymentId);
        } finally {
          paymentSpan.end();
        }
      });

      // Child span for inventory, with the same end-on-failure guarantee.
      await tracer.startActiveSpan('reserveInventory', async (inventorySpan) => {
        try {
          await inventoryService.reserve(orderId);
        } finally {
          inventorySpan.end();
        }
      });

      span.setStatus({ code: SpanStatusCode.OK });
    } catch (error) {
      // Catch variables are `unknown` under strict mode — narrow before
      // touching .message.
      const message = error instanceof Error ? error.message : String(error);
      span.setStatus({ code: SpanStatusCode.ERROR, message });
      if (error instanceof Error) {
        span.recordException(error);
      }
      throw error;
    } finally {
      span.end();
    }
  });
}

Context Propagation#
import { propagation, context } from '@opentelemetry/api';

// Extract trace context (traceparent/tracestate headers) from an incoming
// request so this service's spans join the caller's trace.
function extractContext(req: Request) {
  return propagation.extract(context.active(), req.headers);
}

// Inject the active trace context into outgoing headers so the downstream
// service can continue the same trace.
function injectContext(headers: Record<string, string>) {
  propagation.inject(context.active(), headers);
  return headers;
}

// fetch() wrapper that propagates the current trace context.
async function fetchWithTracing(url: string, options: RequestInit = {}) {
  // Normalize HeadersInit (Headers instance | entries array | plain
  // object) into a plain record — a naive object spread of a Headers
  // instance would silently drop every entry.
  const headers: Record<string, string> = {};
  new Headers(options.headers).forEach((value, name) => {
    headers[name] = value;
  });
  injectContext(headers);

  return fetch(url, { ...options, headers });
}

Alerting#
Alert Rules (Prometheus)#
# alerts.yml
groups:
  - name: service-alerts
    rules:
      # Page when >5% of requests fail over 5 minutes — a symptom users feel.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is {{ $value | humanizePercentage }}

      # Warn when p95 latency stays above 1s for 10 minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: High latency detected
          description: P95 latency is {{ $value }}s

      # Scrape target has been unreachable for a full minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Service is down
          description: "{{ $labels.instance }} has been unreachable for 1 minute"

Dashboards#
Key Metrics to Display#
// RED metrics (Rate, Errors, Duration) — the standard service-level view.
const redMetrics = {
  rate: 'sum(rate(http_requests_total[5m]))',
  errors: 'sum(rate(http_requests_total{status=~"5.."}[5m]))',
  duration: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))',
};

// USE metrics (Utilization, Saturation, Errors) — the resource-level view.
const useMetrics = {
  // rate() converts the raw CPU-seconds counter into cores in use;
  // the bare counter only ever grows and is not a utilization.
  utilization: 'rate(process_cpu_seconds_total[5m])',
  saturation: 'process_open_fds / process_max_fds',
  // Processes expose few direct error counters; restarts (changes in the
  // start-time gauge) are the closest resource-level failure signal.
  errors: 'changes(process_start_time_seconds[1h])',
};

Best Practices#
- Use structured logging: JSON logs are easier to query
- Include correlation IDs: Track requests across services
- Alert on symptoms, not causes: Focus on user impact
- Use high-cardinality data wisely: Put unbounded values (user IDs, URLs) in traces and logs, never in metric labels
- Set up SLOs: Define and monitor service level objectives
Conclusion#
Observability requires investment across all three pillars. Start with structured logging, add key metrics, then implement tracing for request flow visibility. The goal is reducing time to detect and resolve issues.