Logging · Observability · ELK · DevOps

Log Aggregation and Analysis at Scale

Collect, store, and analyze logs from distributed systems, from the ELK stack to cloud solutions to effective log queries.

Bootspring Team
Engineering
September 20, 2024
5 min read

Logs are essential for debugging, auditing, and understanding system behavior. At scale, you need centralized logging that's searchable, analyzable, and cost-effective.

Log Aggregation Architecture#

┌─────────────┐  ┌─────────────┐  ┌─────────────┐
│  Service A  │  │  Service B  │  │  Service C  │
└──────┬──────┘  └──────┬──────┘  └──────┬──────┘
       │                │                │
       └────────────────┼────────────────┘
                        │
                        ▼
               ┌─────────────────┐
               │   Log Shipper   │
               │    (Fluentd/    │
               │    Filebeat)    │
               └────────┬────────┘
                        │
                        ▼
               ┌─────────────────┐
               │  Message Queue  │
               │     (Kafka)     │
               └────────┬────────┘
                        │
                        ▼
               ┌─────────────────┐
               │   Log Storage   │
               │ (Elasticsearch/ │
               │   Loki/Cloud)   │
               └────────┬────────┘
                        │
                        ▼
               ┌─────────────────┐
               │  Visualization  │
               │(Kibana/Grafana) │
               └─────────────────┘
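A minimal way to stand up the Elasticsearch leg of this pipeline locally is Docker Compose. The sketch below assumes a single-node Elasticsearch, skips Kafka, and uses a hypothetical ./fluentd build that bakes in fluent-plugin-elasticsearch; image tags are illustrative:

# docker-compose.yml (sketch)
services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.14.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
    ports:
      - "9200:9200"
  kibana:
    image: docker.elastic.co/kibana/kibana:8.14.0
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    ports:
      - "5601:5601"
    depends_on: [elasticsearch]
  fluentd:
    build: ./fluentd   # hypothetical image with fluent-plugin-elasticsearch installed
    volumes:
      - ./fluent.conf:/fluentd/etc/fluent.conf:ro
      - /var/log/app:/var/log/app:ro
    depends_on: [elasticsearch]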

Structured Logging#

Log Format#

// Structured JSON logs
interface LogEntry {
  timestamp: string;     // ISO 8601
  level: string;         // info, warn, error
  message: string;       // Human-readable message
  service: string;       // Service name
  environment: string;   // prod, staging, dev
  traceId?: string;      // Distributed tracing
  spanId?: string;
  userId?: string;       // Context
  requestId?: string;
  error?: {
    name: string;
    message: string;
    stack: string;
  };
  metadata?: Record<string, unknown>;
}

// Example output
{
  "timestamp": "2024-09-20T10:30:00.000Z",
  "level": "error",
  "message": "Failed to process payment",
  "service": "payment-service",
  "environment": "production",
  "traceId": "abc123",
  "requestId": "req-456",
  "userId": "user-789",
  "error": {
    "name": "PaymentError",
    "message": "Card declined",
    "stack": "Error: Card declined\n at processPayment..."
  },
  "metadata": {
    "orderId": "order-111",
    "amount": 99.99
  }
}

Implementation#

import pino from 'pino';

const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  base: {
    service: process.env.SERVICE_NAME,
    environment: process.env.NODE_ENV,
    version: process.env.APP_VERSION,
  },
  timestamp: () => `,"timestamp":"${new Date().toISOString()}"`,
});

// Create child logger with request context
function createRequestLogger(req: Request) {
  return logger.child({
    requestId: req.headers['x-request-id'],
    traceId: req.headers['x-trace-id'],
    userId: req.user?.id,
  });
}

// Usage
app.use((req, res, next) => {
  req.log = createRequestLogger(req);
  req.log.info({ path: req.path, method: req.method }, 'Request started');
  next();
});
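When logging caught exceptions, pino's standard error serializer produces the name/message/stack shape from the schema above. A sketch, where processPayment and the order type are hypothetical stand-ins:

import pino from 'pino';

declare function processPayment(order: { id: string }): Promise<void>; // hypothetical

const logger = pino({
  // pino.stdSerializers.err emits { type, message, stack } for Error values
  serializers: { err: pino.stdSerializers.err },
});

async function handlePayment(order: { id: string }) {
  try {
    await processPayment(order);
  } catch (err) {
    // Put the Error under the `err` key so the serializer runs
    logger.error({ err, orderId: order.id }, 'Failed to process payment');
    throw err;
  }
}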

Log Collection#

Fluentd Configuration#

# fluent.conf
<source>
  @type tail
  path /var/log/app/*.log
  pos_file /var/log/fluentd/app.log.pos
  tag app.logs
  <parse>
    @type json
    time_key timestamp
    time_format %Y-%m-%dT%H:%M:%S.%L%z
  </parse>
</source>

<filter app.logs>
  @type record_transformer
  <record>
    hostname "#{Socket.gethostname}"
    environment "#{ENV['ENVIRONMENT']}"
  </record>
</filter>

<match app.logs>
  @type elasticsearch
  host elasticsearch
  port 9200
  index_name logs-${tag}-%Y.%m.%d
  # tag and time chunk keys are required for the ${tag} and %Y.%m.%d
  # placeholders in index_name to resolve
  <buffer tag, time>
    @type memory
    timekey 1d
    flush_interval 5s
    chunk_limit_size 5MB
    retry_max_interval 30
  </buffer>
</match>
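Filebeat, the other shipper named in the diagram, handles the same tail-and-parse job. A rough equivalent sketch, assuming the same log path and NDJSON-formatted output (the input id is illustrative):

# filebeat.yml (sketch)
filebeat.inputs:
  - type: filestream
    id: app-logs
    paths:
      - /var/log/app/*.log
    parsers:
      - ndjson:
          target: ""           # merge parsed JSON into the root of the event
          overwrite_keys: true

output.elasticsearch:
  hosts: ["elasticsearch:9200"]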

Kubernetes Logging#

# DaemonSet for log collection
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluentd
spec:
  selector:
    matchLabels:
      app: fluentd
  template:
    metadata:
      labels:
        app: fluentd   # must match the selector above
    spec:
      containers:
        - name: fluentd
          image: fluent/fluentd-kubernetes-daemonset:v1.16-debian-elasticsearch8
          env:
            - name: FLUENT_ELASTICSEARCH_HOST
              value: "elasticsearch"
          volumeMounts:
            - name: varlog
              mountPath: /var/log
            - name: containers
              mountPath: /var/lib/docker/containers
              readOnly: true
      volumes:
        - name: varlog
          hostPath:
            path: /var/log
        - name: containers
          hostPath:
            path: /var/lib/docker/containers
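The fluentd-kubernetes-daemonset image also bundles the kubernetes_metadata filter, which enriches each record with pod, namespace, and label context; enabling it is one stanza in the Fluentd config:

# Enrich container logs with pod/namespace/label metadata
<filter kubernetes.**>
  @type kubernetes_metadata
</filter>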

Storage Solutions#

Elasticsearch#

# Common queries
# Find errors in last hour
GET logs-*/_search
{
  "query": {
    "bool": {
      "filter": [
        { "term": { "level": "error" } },
        { "range": { "timestamp": { "gte": "now-1h" } } }
      ]
    }
  },
  "sort": [{ "timestamp": "desc" }],
  "size": 100
}

# Aggregate errors by service
GET logs-*/_search
{
  "size": 0,
  "query": {
    "range": { "timestamp": { "gte": "now-24h" } }
  },
  "aggs": {
    "by_service": {
      "terms": { "field": "service" },
      "aggs": {
        "error_count": {
          "filter": { "term": { "level": "error" } }
        }
      }
    }
  }
}
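The term filter and terms aggregation above assume level and service are mapped as keyword fields, which default dynamic mapping won't give you on the top-level field name. One way to pin that down is an index template; a sketch with illustrative names:

# Map log fields explicitly so term queries and aggregations behave
PUT _index_template/logs-mappings
{
  "index_patterns": ["logs-*"],
  "template": {
    "mappings": {
      "properties": {
        "timestamp": { "type": "date" },
        "level":     { "type": "keyword" },
        "service":   { "type": "keyword" },
        "message":   { "type": "text" }
      }
    }
  }
}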

Grafana Loki#

# Loki config
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1

schema_config:
  configs:
    - from: 2024-01-01
      store: boltdb-shipper
      object_store: s3
      schema: v12
      index:
        prefix: loki_index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    cache_location: /loki/cache
  aws:
    s3: s3://region/bucket-name
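Logs reach Loki through an agent such as Promtail. A minimal scrape sketch, with illustrative labels and paths:

# promtail.yml (sketch)
server:
  http_listen_port: 9080

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: app
    static_configs:
      - targets: [localhost]
        labels:
          service: api
          __path__: /var/log/app/*.log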
# LogQL queries
# Find errors
{service="api"} |= "error"

# JSON parsing
{service="api"} | json | level="error"

# With metrics
sum(rate({service="api"} | json | level="error" [5m]))
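The metric form also feeds alerting through the Loki ruler, which evaluates Prometheus-style rules over LogQL expressions; a sketch with an illustrative threshold:

# Loki ruler rule: fire when the API logs errors at a sustained rate
groups:
  - name: api-errors
    rules:
      - alert: HighErrorLogRate
        expr: sum(rate({service="api"} | json | level="error" [5m])) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: api service logging errors above 1/s for 10 minutes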

Log Retention#

# Elasticsearch Index Lifecycle Management
PUT _ilm/policy/logs-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_size": "50GB",
            "max_age": "1d"
          }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "shrink": { "number_of_shards": 1 },
          "forcemerge": { "max_num_segments": 1 }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "freeze": {}
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
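A policy does nothing until it's attached to indices, typically via an index template (this could be merged with the mapping template shown earlier); the rollover alias here is illustrative:

PUT _index_template/logs-ilm
{
  "index_patterns": ["logs-*"],
  "template": {
    "settings": {
      "index.lifecycle.name": "logs-policy",
      "index.lifecycle.rollover_alias": "logs"
    }
  }
}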

Best Practices#

What to Log#

// ✅ Log these (pino argument order: context object first, message second)
logger.info({ path, method, userId }, 'Request received');
logger.info({ event: 'order_placed', orderId, total }, 'Business event');
logger.error({ operation, error, context }, 'Operation failed');
logger.warn({ latency, threshold }, 'Degraded performance');

// ❌ Don't log these
logger.info('Password: ' + password);    // Sensitive data
logger.info({ body }, 'Request body');   // May contain PII
logger.debug('Loop iteration');          // Too verbose for production
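Call-site discipline is fragile, so it's worth a backstop: pino can redact known sensitive paths at the logger level. A sketch with illustrative paths:

import pino from 'pino';

const logger = pino({
  redact: {
    // fast-redact path syntax; the wildcard matches any parent key
    paths: ['password', 'req.headers.authorization', '*.creditCard'],
    censor: '[REDACTED]',
  },
});

logger.info({ user: 'user-789', password: 'hunter2' }, 'Login attempt');
// => ...,"password":"[REDACTED]","msg":"Login attempt"...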

Log Sampling#

// Sample verbose logs
function shouldLog(sampleRate: number): boolean {
  return Math.random() < sampleRate;
}

// Log 10% of successful requests
if (response.ok && shouldLog(0.1)) {
  logger.debug({ details }, 'Request succeeded');
}

// Always log errors
if (!response.ok) {
  logger.error({ details }, 'Request failed');
}
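One caveat: Math.random() decides per log line, so a single request's logs can end up half kept, half dropped. Hashing the trace ID instead makes the decision deterministic per trace; a sketch:

import { createHash } from 'node:crypto';

// Hash the trace ID into [0, 1) so sampling is deterministic per trace:
// every log line for a given trace is either kept or dropped together.
function shouldLogTrace(traceId: string, sampleRate: number): boolean {
  const hash = createHash('sha1').update(traceId).digest();
  return hash.readUInt32BE(0) / 0x100000000 < sampleRate;
}

// Keep ~10% of traces, each with all of its logs
if (response.ok && shouldLogTrace(traceId, 0.1)) {
  logger.debug({ details }, 'Request succeeded');
}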

Conclusion#

Effective log aggregation requires structured logging, efficient collection, and smart retention. Use structured JSON logs for parseability, centralize logs for searchability, and implement retention policies for cost control.

Logs complement metrics and traces—together they provide complete observability into your systems.
