Observability is the ability to understand a system's internal behavior from its external outputs. This guide covers implementing the three pillars of observability: logs, metrics, and traces.
The Three Pillars#
┌─────────────────────────────────────────────────────┐
│ OBSERVABILITY │
├─────────────────┬─────────────────┬─────────────────┤
│ LOGS │ METRICS │ TRACES │
│ │ │ │
│ What happened │ How much/ │ Request flow │
│ in detail │ how often │ across │
│ │ │ services │
└─────────────────┴─────────────────┴─────────────────┘
Structured Logging#
Logger Implementation#
import pino from 'pino';

// Base logger: structured JSON output with service metadata attached to
// every line. Exported so other modules (e.g. the error handler) can fall
// back to it when no per-request logger exists.
export const logger = pino({
  // Allow runtime tuning via LOG_LEVEL; default to 'info'.
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level name ('info') instead of pino's numeric level (30).
    level: (label) => ({ level: label }),
  },
  // Fields included in every log line for cross-service filtering.
  base: {
    service: process.env.SERVICE_NAME,
    version: process.env.APP_VERSION,
    environment: process.env.NODE_ENV,
  },
  // ISO-8601 timestamps are human-readable and sortable in log queries.
  timestamp: pino.stdTimeFunctions.isoTime,
});

/**
 * Create a child logger that stamps the given context (e.g. requestId,
 * userId) onto every subsequent log line.
 */
export function createLogger(context: Record<string, unknown>) {
  return logger.child(context);
}

// Usage — inside a request handler, where `req` is in scope:
//
//   const requestLogger = createLogger({
//     requestId: req.id,
//     userId: req.user?.id,
//   });
//
//   requestLogger.info({ path: req.path, method: req.method }, 'Request received');

Request Logging Middleware#
import { Request, Response, NextFunction } from 'express';
import { randomUUID } from 'node:crypto';
import { createLogger } from './logger';

/**
 * Express middleware that attaches a context-rich child logger to each
 * request and logs request start/completion with timing.
 *
 * Honors an incoming `x-request-id` header so IDs correlate across
 * services; generates a fresh UUID otherwise.
 */
export function requestLogging() {
  return (req: Request, res: Response, next: NextFunction) => {
    const startTime = Date.now();

    // Node parses repeated headers as string[]; take the first value.
    const incomingId = req.headers['x-request-id'];
    const requestId =
      (Array.isArray(incomingId) ? incomingId[0] : incomingId) || randomUUID();

    const log = createLogger({
      requestId,
      method: req.method,
      path: req.path,
      userAgent: req.headers['user-agent'],
      ip: req.ip,
    });

    // Expose the logger to downstream handlers (requires augmenting
    // Express's Request type with a `log` property).
    req.log = log;

    log.info('Request started');

    // 'finish' fires once the response is fully handed to the OS, so the
    // duration covers the whole handler pipeline.
    res.on('finish', () => {
      const duration = Date.now() - startTime;

      log.info(
        {
          statusCode: res.statusCode,
          duration,
          contentLength: res.get('content-length'),
        },
        'Request completed'
      );
    });

    next();
  };
}

Error Logging#
1export function errorHandler(
2 err: Error,
3 req: Request,
4 res: Response,
5 next: NextFunction
6) {
7 const log = req.log || logger;
8
9 log.error({
10 error: {
11 name: err.name,
12 message: err.message,
13 stack: err.stack,
14 },
15 request: {
16 method: req.method,
17 path: req.path,
18 query: req.query,
19 body: sanitizeBody(req.body),
20 },
21 }, 'Unhandled error');
22
23 res.status(500).json({ error: 'Internal server error' });
24}
25
/**
 * Return a shallow copy of a request body with known sensitive fields
 * masked, so the body can be logged safely.
 *
 * Note: only top-level keys are redacted; nested objects are not scanned.
 */
function sanitizeBody(body: unknown): unknown {
  // Non-object bodies (undefined, strings, buffers, arrays) cannot carry
  // the fields we redact — pass them through untouched rather than
  // spreading them into a malformed object.
  if (body === null || typeof body !== 'object' || Array.isArray(body)) {
    return body;
  }

  const sensitiveFields = ['password', 'token', 'creditCard'];
  const sanitized: Record<string, unknown> = {
    ...(body as Record<string, unknown>),
  };

  for (const field of sensitiveFields) {
    if (field in sanitized) {
      sanitized[field] = '[REDACTED]';
    }
  }

  return sanitized;
}

Metrics with Prometheus#
Custom Metrics#
import client from 'prom-client';

// Enable collection of default Node.js process metrics
// (event loop lag, GC pauses, heap usage, open fds, etc.).
client.collectDefaultMetrics();

// Request latency distribution. Buckets span 10ms–5s, covering both fast
// cache hits and slow upstream calls; histogram_quantile() in Prometheus
// derives percentiles from these buckets server-side.
export const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
});

// Request throughput, labeled for RED-style rate/error-rate queries.
export const httpRequestTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status'],
});

// Point-in-time connection count (a gauge may go up and down).
export const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
});

// Client-side percentiles for order values. A Summary is used instead of
// a Histogram because dollar amounts have no natural bucket boundaries.
export const orderValue = new client.Summary({
  name: 'order_value_dollars',
  help: 'Value of orders in dollars',
  percentiles: [0.5, 0.9, 0.99],
});

Metrics Middleware#
/**
 * Express middleware that records duration and count for every request.
 */
export function metricsMiddleware() {
  return (req: Request, res: Response, next: NextFunction) => {
    // process.hrtime gives monotonic, sub-millisecond resolution
    // (Date.now can jump with wall-clock adjustments).
    const start = process.hrtime();

    res.on('finish', () => {
      const [seconds, nanoseconds] = process.hrtime(start);
      const duration = seconds + nanoseconds / 1e9;

      // Use the route pattern ('/users/:id') when Express matched one.
      // NOTE(review): the raw req.path fallback can explode label
      // cardinality for unmatched URLs — consider a fixed 'unmatched'
      // value instead.
      const route = req.route?.path || req.path;
      const labels = {
        method: req.method,
        route,
        status: res.statusCode.toString(),
      };

      httpRequestDuration.observe(labels, duration);
      httpRequestTotal.inc(labels);
    });

    next();
  };
}

// Expose metrics for Prometheus to scrape (`app` is your Express app).
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.send(await client.register.metrics());
});

Business Metrics#
// Business-level metrics: these answer product questions ("are orders
// dropping after the release?") rather than infrastructure questions.
export const businessMetrics = {
  orderCreated: new client.Counter({
    name: 'orders_created_total',
    help: 'Total orders created',
    labelNames: ['payment_method', 'customer_type'],
  }),

  checkoutAbandoned: new client.Counter({
    name: 'checkout_abandoned_total',
    help: 'Total abandoned checkouts',
    labelNames: ['step'],
  }),

  inventoryLevel: new client.Gauge({
    name: 'inventory_level',
    help: 'Current inventory level',
    // NOTE(review): product_id may be high-cardinality — watch series
    // growth if the catalog is large.
    labelNames: ['product_id', 'warehouse'],
  }),
};

// Usage
businessMetrics.orderCreated.inc({
  payment_method: 'credit_card',
  customer_type: 'returning',
});

Distributed Tracing#
OpenTelemetry Setup#
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';

const sdk = new NodeSDK({
  // Resource attributes identify this service on every exported span.
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'order-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV,
  }),
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTLP_ENDPOINT,
  }),
  instrumentations: [
    getNodeAutoInstrumentations({
      // fs instrumentation is extremely noisy (a span per file read).
      '@opentelemetry/instrumentation-fs': { enabled: false },
    }),
  ],
});

sdk.start();

// Flush buffered spans before the process exits; without this, the last
// few seconds of traces are silently dropped on every deploy.
process.on('SIGTERM', () => {
  sdk.shutdown().finally(() => process.exit(0));
});

Custom Spans#
import { trace, SpanStatusCode, context } from '@opentelemetry/api';

const tracer = trace.getTracer('order-service');

/**
 * Process an order under a parent span, with child spans for each
 * sub-operation so slow steps show up individually in the trace.
 */
async function processOrder(orderId: string) {
  return tracer.startActiveSpan('processOrder', async (span) => {
    try {
      span.setAttribute('order.id', orderId);

      // Child span for payment. end() runs in finally so the span is
      // closed even when charge() throws — otherwise the span leaks and
      // never appears complete in the trace.
      await tracer.startActiveSpan('chargePayment', async (paymentSpan) => {
        try {
          const result = await paymentService.charge(orderId);
          paymentSpan.setAttribute('payment.id', result.paymentId);
        } finally {
          paymentSpan.end();
        }
      });

      // Child span for inventory, with the same end-on-failure guarantee.
      await tracer.startActiveSpan('reserveInventory', async (inventorySpan) => {
        try {
          await inventoryService.reserve(orderId);
        } finally {
          inventorySpan.end();
        }
      });

      span.setStatus({ code: SpanStatusCode.OK });
    } catch (error) {
      // Catch variables are `unknown` under strict mode — narrow before
      // touching .message.
      const message = error instanceof Error ? error.message : String(error);
      span.setStatus({ code: SpanStatusCode.ERROR, message });
      if (error instanceof Error) {
        span.recordException(error);
      }
      throw error;
    } finally {
      span.end();
    }
  });
}

Context Propagation#
import { propagation, context } from '@opentelemetry/api';

// Extract trace context (traceparent/tracestate headers) from an incoming
// request so this service's spans join the caller's trace.
function extractContext(req: Request) {
  return propagation.extract(context.active(), req.headers);
}

// Inject the active trace context into outgoing headers so the downstream
// service can continue the same trace.
function injectContext(headers: Record<string, string>) {
  propagation.inject(context.active(), headers);
  return headers;
}

// fetch() wrapper that propagates the current trace context.
async function fetchWithTracing(url: string, options: RequestInit = {}) {
  // Normalize HeadersInit (Headers instance | entries array | plain
  // object) into a plain record — a naive object spread of a Headers
  // instance would silently drop every entry.
  const headers: Record<string, string> = {};
  new Headers(options.headers).forEach((value, name) => {
    headers[name] = value;
  });
  injectContext(headers);

  return fetch(url, { ...options, headers });
}

Alerting#
Alert Rules (Prometheus)#
# alerts.yml
groups:
  - name: service-alerts
    rules:
      # Page when >5% of requests fail over 5 minutes — a symptom users feel.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is {{ $value | humanizePercentage }}

      # Warn when p95 latency stays above 1s for 10 minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: High latency detected
          description: P95 latency is {{ $value }}s

      # Scrape target has been unreachable for a full minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Service is down
          description: "{{ $labels.instance }} has been unreachable for 1 minute"

Dashboards#
Key Metrics to Display#
// RED metrics (Rate, Errors, Duration) — the standard service-level view.
const redMetrics = {
  rate: 'sum(rate(http_requests_total[5m]))',
  errors: 'sum(rate(http_requests_total{status=~"5.."}[5m]))',
  duration: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))',
};

// USE metrics (Utilization, Saturation, Errors) — the resource-level view.
const useMetrics = {
  // rate() converts the raw CPU-seconds counter into cores in use;
  // the bare counter only ever grows and is not a utilization.
  utilization: 'rate(process_cpu_seconds_total[5m])',
  saturation: 'process_open_fds / process_max_fds',
  // Processes expose few direct error counters; restarts (changes in the
  // start-time gauge) are the closest resource-level failure signal.
  errors: 'changes(process_start_time_seconds[1h])',
};

Best Practices#
- Use structured logging: JSON logs are easier to query
- Include correlation IDs: Track requests across services
- Alert on symptoms, not causes: Focus on user impact
- Use high-cardinality data wisely: Put unbounded values (user IDs, URLs) in traces and logs, never in metric labels
- Set up SLOs: Define and monitor service level objectives
Conclusion#
Observability requires investment across all three pillars. Start with structured logging, add key metrics, then implement tracing for request flow visibility. The goal is reducing time to detect and resolve issues.