Back to Blog
Node.js · Cluster · Performance · Scaling

Node.js Cluster Module Guide

Master the Node.js cluster module for utilizing multiple CPU cores and improving application performance.

B
Bootspring Team
Engineering
June 24, 2019
7 min read

The cluster module enables creating child processes that share server ports, allowing Node.js applications to utilize multiple CPU cores.

Basic Setup#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4 5const numCPUs = os.cpus().length; 6 7if (cluster.isPrimary) { 8 console.log(`Primary ${process.pid} is running`); 9 10 // Fork workers 11 for (let i = 0; i < numCPUs; i++) { 12 cluster.fork(); 13 } 14 15 cluster.on('exit', (worker, code, signal) => { 16 console.log(`Worker ${worker.process.pid} died`); 17 }); 18} else { 19 // Workers share the TCP connection 20 http.createServer((req, res) => { 21 res.writeHead(200); 22 res.end(`Hello from worker ${process.pid}\n`); 23 }).listen(8000); 24 25 console.log(`Worker ${process.pid} started`); 26}

Worker Management#

1import cluster from 'node:cluster'; 2import os from 'node:os'; 3 4if (cluster.isPrimary) { 5 const numCPUs = os.cpus().length; 6 7 // Fork workers 8 for (let i = 0; i < numCPUs; i++) { 9 const worker = cluster.fork(); 10 11 // Listen for messages from worker 12 worker.on('message', (message) => { 13 console.log(`Message from worker ${worker.id}:`, message); 14 }); 15 } 16 17 // Get all workers 18 for (const id in cluster.workers) { 19 console.log(`Worker ${id} is running`); 20 } 21 22 // Listen for worker events 23 cluster.on('online', (worker) => { 24 console.log(`Worker ${worker.id} is online`); 25 }); 26 27 cluster.on('listening', (worker, address) => { 28 console.log(`Worker ${worker.id} listening on ${address.port}`); 29 }); 30 31 cluster.on('disconnect', (worker) => { 32 console.log(`Worker ${worker.id} disconnected`); 33 }); 34 35 cluster.on('exit', (worker, code, signal) => { 36 if (signal) { 37 console.log(`Worker ${worker.id} killed by signal ${signal}`); 38 } else if (code !== 0) { 39 console.log(`Worker ${worker.id} exited with code ${code}`); 40 } 41 }); 42}

Auto-Restart Workers#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4 5if (cluster.isPrimary) { 6 const numCPUs = os.cpus().length; 7 8 for (let i = 0; i < numCPUs; i++) { 9 cluster.fork(); 10 } 11 12 // Auto-restart dead workers 13 cluster.on('exit', (worker, code, signal) => { 14 console.log(`Worker ${worker.process.pid} died`); 15 16 // Don't restart if intentionally killed 17 if (!worker.exitedAfterDisconnect) { 18 console.log('Starting a new worker'); 19 cluster.fork(); 20 } 21 }); 22} else { 23 http.createServer((req, res) => { 24 // Simulate occasional crash 25 if (Math.random() < 0.01) { 26 throw new Error('Random failure'); 27 } 28 res.end('Hello World'); 29 }).listen(8000); 30}

Graceful Shutdown#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4 5if (cluster.isPrimary) { 6 const numCPUs = os.cpus().length; 7 8 for (let i = 0; i < numCPUs; i++) { 9 cluster.fork(); 10 } 11 12 // Graceful shutdown 13 process.on('SIGTERM', () => { 14 console.log('Primary received SIGTERM'); 15 16 for (const id in cluster.workers) { 17 cluster.workers[id].send('shutdown'); 18 } 19 20 // Wait for workers to finish 21 setTimeout(() => { 22 process.exit(0); 23 }, 5000); 24 }); 25} else { 26 const server = http.createServer((req, res) => { 27 res.end('Hello World'); 28 }); 29 30 server.listen(8000); 31 32 process.on('message', (msg) => { 33 if (msg === 'shutdown') { 34 console.log(`Worker ${process.pid} shutting down`); 35 36 server.close(() => { 37 process.exit(0); 38 }); 39 40 // Force exit after timeout 41 setTimeout(() => { 42 process.exit(1); 43 }, 5000); 44 } 45 }); 46}

Inter-Process Communication#

1import cluster from 'node:cluster'; 2import os from 'node:os'; 3 4if (cluster.isPrimary) { 5 const workers = []; 6 7 for (let i = 0; i < os.cpus().length; i++) { 8 const worker = cluster.fork(); 9 workers.push(worker); 10 11 // Receive messages from workers 12 worker.on('message', (message) => { 13 console.log(`Primary received: ${JSON.stringify(message)}`); 14 15 // Broadcast to all workers 16 workers.forEach((w) => { 17 w.send({ type: 'broadcast', data: message }); 18 }); 19 }); 20 } 21 22 // Send task to specific worker 23 setTimeout(() => { 24 workers[0].send({ type: 'task', data: 'Process this' }); 25 }, 1000); 26} else { 27 // Worker process 28 process.on('message', (message) => { 29 console.log(`Worker ${process.pid} received:`, message); 30 31 if (message.type === 'task') { 32 // Process task and respond 33 process.send({ type: 'result', worker: process.pid, data: 'Done' }); 34 } 35 }); 36 37 // Send heartbeat 38 setInterval(() => { 39 process.send({ type: 'heartbeat', worker: process.pid }); 40 }, 5000); 41}

Load Balancing#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4 5// Node.js uses round-robin by default on Linux 6// On Windows, it's handled by the OS 7 8if (cluster.isPrimary) { 9 // Custom scheduling policy 10 cluster.schedulingPolicy = cluster.SCHED_RR; // Round-robin 11 12 const numCPUs = os.cpus().length; 13 14 for (let i = 0; i < numCPUs; i++) { 15 cluster.fork(); 16 } 17 18 // Track requests per worker 19 const requestCounts = {}; 20 21 cluster.on('message', (worker, message) => { 22 if (message.type === 'request') { 23 requestCounts[worker.id] = (requestCounts[worker.id] || 0) + 1; 24 } 25 }); 26 27 setInterval(() => { 28 console.log('Request distribution:', requestCounts); 29 }, 5000); 30} else { 31 http.createServer((req, res) => { 32 // Notify primary of request 33 process.send({ type: 'request' }); 34 35 res.writeHead(200); 36 res.end(`Handled by worker ${process.pid}`); 37 }).listen(8000); 38}

Zero-Downtime Restart#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4 5if (cluster.isPrimary) { 6 const numCPUs = os.cpus().length; 7 8 for (let i = 0; i < numCPUs; i++) { 9 cluster.fork(); 10 } 11 12 // Reload workers one by one 13 process.on('SIGHUP', () => { 14 console.log('Reloading workers...'); 15 16 const workers = Object.values(cluster.workers); 17 let index = 0; 18 19 const reloadNext = () => { 20 if (index >= workers.length) { 21 console.log('All workers reloaded'); 22 return; 23 } 24 25 const worker = workers[index++]; 26 console.log(`Reloading worker ${worker.id}`); 27 28 // Fork new worker 29 const newWorker = cluster.fork(); 30 31 // Wait for new worker to be ready 32 newWorker.on('listening', () => { 33 // Gracefully shutdown old worker 34 worker.disconnect(); 35 36 worker.on('disconnect', () => { 37 console.log(`Worker ${worker.id} disconnected`); 38 reloadNext(); 39 }); 40 }); 41 }; 42 43 reloadNext(); 44 }); 45 46 cluster.on('exit', (worker, code, signal) => { 47 if (!worker.exitedAfterDisconnect) { 48 console.log(`Worker ${worker.id} crashed, restarting...`); 49 cluster.fork(); 50 } 51 }); 52} else { 53 const server = http.createServer((req, res) => { 54 res.end(`Worker ${process.pid} - Version 1.0`); 55 }); 56 57 server.listen(8000); 58}

Shared State with External Store#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4import { createClient } from 'redis'; 5 6if (cluster.isPrimary) { 7 for (let i = 0; i < os.cpus().length; i++) { 8 cluster.fork(); 9 } 10} else { 11 // Each worker connects to Redis 12 const redis = createClient(); 13 await redis.connect(); 14 15 http.createServer(async (req, res) => { 16 if (req.url === '/increment') { 17 const count = await redis.incr('counter'); 18 res.end(`Count: ${count} (Worker ${process.pid})`); 19 } else if (req.url === '/count') { 20 const count = await redis.get('counter'); 21 res.end(`Count: ${count}`); 22 } else { 23 res.end('Hello'); 24 } 25 }).listen(8000); 26}

Worker Health Checks#

1import cluster from 'node:cluster'; 2import http from 'node:http'; 3import os from 'node:os'; 4 5if (cluster.isPrimary) { 6 const workerHealth = new Map(); 7 8 for (let i = 0; i < os.cpus().length; i++) { 9 const worker = cluster.fork(); 10 workerHealth.set(worker.id, { lastHeartbeat: Date.now(), healthy: true }); 11 12 worker.on('message', (msg) => { 13 if (msg.type === 'heartbeat') { 14 workerHealth.set(worker.id, { 15 lastHeartbeat: Date.now(), 16 healthy: true, 17 }); 18 } 19 }); 20 } 21 22 // Check health periodically 23 setInterval(() => { 24 const now = Date.now(); 25 26 for (const [id, health] of workerHealth) { 27 if (now - health.lastHeartbeat > 10000) { 28 console.log(`Worker ${id} is unresponsive`); 29 const worker = cluster.workers[id]; 30 31 if (worker) { 32 worker.kill(); 33 workerHealth.delete(id); 34 } 35 } 36 } 37 }, 5000); 38 39 cluster.on('exit', (worker) => { 40 workerHealth.delete(worker.id); 41 const newWorker = cluster.fork(); 42 workerHealth.set(newWorker.id, { 43 lastHeartbeat: Date.now(), 44 healthy: true, 45 }); 46 }); 47} else { 48 // Send heartbeat 49 setInterval(() => { 50 process.send({ type: 'heartbeat' }); 51 }, 3000); 52 53 http.createServer((req, res) => { 54 res.end('OK'); 55 }).listen(8000); 56}

Best Practices#

Setup:
✓ Fork based on CPU count
✓ Implement auto-restart
✓ Handle graceful shutdown
✓ Use external state storage

Communication:
✓ Keep messages small
✓ Use structured message types
✓ Handle message errors
✓ Implement timeouts

Reliability:
✓ Health checks
✓ Zero-downtime restarts
✓ Crash recovery
✓ Resource limits

Avoid:
✗ Sharing memory between workers
✗ Too many workers
✗ Ignoring worker crashes
✗ Long-running synchronous code

Conclusion#

The cluster module enables horizontal scaling by utilizing multiple CPU cores. The primary process manages worker processes that share server ports. Implement auto-restart for crashed workers, graceful shutdown for deployments, and health checks for reliability. Use external stores like Redis for shared state since workers have separate memory spaces. For production, consider process managers like PM2 that handle clustering automatically.

Share this article

Help spread the word about Bootspring