Back to Blog
Node.js · Cluster · Scaling · Performance

Scaling Node.js with Cluster Module

Scale Node.js across CPU cores. From basic clustering to load balancing to graceful shutdown patterns.

B
Bootspring Team
Engineering
September 25, 2021
7 min read

Node.js runs on a single thread by default. Here's how to utilize all CPU cores with clustering.

Basic Cluster Setup#

1import cluster from 'cluster'; 2import http from 'http'; 3import os from 'os'; 4 5const numCPUs = os.cpus().length; 6 7if (cluster.isPrimary) { 8 console.log(`Primary ${process.pid} is running`); 9 console.log(`Forking ${numCPUs} workers...`); 10 11 // Fork workers 12 for (let i = 0; i < numCPUs; i++) { 13 cluster.fork(); 14 } 15 16 cluster.on('exit', (worker, code, signal) => { 17 console.log(`Worker ${worker.process.pid} died`); 18 // Optionally restart 19 cluster.fork(); 20 }); 21} else { 22 // Workers share the TCP connection 23 http.createServer((req, res) => { 24 res.writeHead(200); 25 res.end(`Hello from worker ${process.pid}\n`); 26 }).listen(3000); 27 28 console.log(`Worker ${process.pid} started`); 29}

Express with Clustering#

1// cluster.ts 2import cluster from 'cluster'; 3import os from 'os'; 4 5export function startCluster(workerFn: () => void) { 6 const numCPUs = os.cpus().length; 7 8 if (cluster.isPrimary) { 9 console.log(`Primary process ${process.pid}`); 10 11 // Fork workers 12 for (let i = 0; i < numCPUs; i++) { 13 const worker = cluster.fork(); 14 console.log(`Started worker ${worker.process.pid}`); 15 } 16 17 // Handle worker events 18 cluster.on('exit', (worker, code, signal) => { 19 console.log(`Worker ${worker.process.pid} exited (${signal || code})`); 20 21 // Restart worker unless shutting down 22 if (!worker.exitedAfterDisconnect) { 23 console.log('Starting replacement worker...'); 24 cluster.fork(); 25 } 26 }); 27 28 cluster.on('online', (worker) => { 29 console.log(`Worker ${worker.process.pid} is online`); 30 }); 31 } else { 32 workerFn(); 33 } 34} 35 36// app.ts 37import express from 'express'; 38import { startCluster } from './cluster'; 39 40function startServer() { 41 const app = express(); 42 43 app.get('/', (req, res) => { 44 res.json({ 45 message: 'Hello', 46 pid: process.pid, 47 }); 48 }); 49 50 app.get('/heavy', (req, res) => { 51 // Simulate CPU-intensive work 52 let sum = 0; 53 for (let i = 0; i < 1e8; i++) { 54 sum += i; 55 } 56 res.json({ sum, pid: process.pid }); 57 }); 58 59 app.listen(3000, () => { 60 console.log(`Worker ${process.pid} listening on port 3000`); 61 }); 62} 63 64startCluster(startServer);

Graceful Shutdown#

1import cluster from 'cluster'; 2import http from 'http'; 3import os from 'os'; 4 5const numCPUs = os.cpus().length; 6 7if (cluster.isPrimary) { 8 const workers: cluster.Worker[] = []; 9 10 // Fork workers 11 for (let i = 0; i < numCPUs; i++) { 12 workers.push(cluster.fork()); 13 } 14 15 // Graceful shutdown handler 16 async function shutdown(signal: string) { 17 console.log(`\n${signal} received. Starting graceful shutdown...`); 18 19 // Stop accepting new connections 20 for (const worker of Object.values(cluster.workers || {})) { 21 worker?.send('shutdown'); 22 } 23 24 // Wait for workers to finish 25 const timeout = setTimeout(() => { 26 console.log('Forcing shutdown after timeout'); 27 process.exit(1); 28 }, 30000); 29 30 cluster.on('exit', () => { 31 const remaining = Object.keys(cluster.workers || {}).length; 32 if (remaining === 0) { 33 clearTimeout(timeout); 34 console.log('All workers stopped. Exiting.'); 35 process.exit(0); 36 } 37 }); 38 } 39 40 process.on('SIGTERM', () => shutdown('SIGTERM')); 41 process.on('SIGINT', () => shutdown('SIGINT')); 42 43 cluster.on('exit', (worker, code, signal) => { 44 if (!worker.exitedAfterDisconnect) { 45 console.log(`Worker ${worker.process.pid} crashed. Restarting...`); 46 cluster.fork(); 47 } 48 }); 49} else { 50 const server = http.createServer((req, res) => { 51 res.writeHead(200); 52 res.end(`Worker ${process.pid}\n`); 53 }); 54 55 server.listen(3000); 56 57 // Handle shutdown message from primary 58 process.on('message', (msg) => { 59 if (msg === 'shutdown') { 60 console.log(`Worker ${process.pid} shutting down...`); 61 62 server.close(() => { 63 console.log(`Worker ${process.pid} closed all connections`); 64 process.exit(0); 65 }); 66 67 // Force close after timeout 68 setTimeout(() => { 69 console.log(`Worker ${process.pid} forcing close`); 70 process.exit(0); 71 }, 10000); 72 } 73 }); 74}

Zero-Downtime Restart#

1import cluster from 'cluster'; 2import http from 'http'; 3import os from 'os'; 4 5const numCPUs = os.cpus().length; 6 7if (cluster.isPrimary) { 8 const workers: Set<cluster.Worker> = new Set(); 9 10 function forkWorker() { 11 const worker = cluster.fork(); 12 workers.add(worker); 13 14 worker.on('exit', () => { 15 workers.delete(worker); 16 }); 17 18 return worker; 19 } 20 21 // Initial fork 22 for (let i = 0; i < numCPUs; i++) { 23 forkWorker(); 24 } 25 26 // Zero-downtime restart 27 async function restartWorkers() { 28 console.log('Starting zero-downtime restart...'); 29 30 const currentWorkers = [...workers]; 31 32 for (const worker of currentWorkers) { 33 // Fork new worker first 34 const newWorker = forkWorker(); 35 36 // Wait for new worker to be ready 37 await new Promise<void>((resolve) => { 38 newWorker.on('listening', () => { 39 console.log(`New worker ${newWorker.process.pid} is ready`); 40 41 // Gracefully shutdown old worker 42 worker.send('shutdown'); 43 worker.disconnect(); 44 45 // Force kill after timeout 46 setTimeout(() => { 47 if (!worker.isDead()) { 48 worker.kill(); 49 } 50 }, 10000); 51 52 resolve(); 53 }); 54 }); 55 56 // Small delay between restarts 57 await new Promise(r => setTimeout(r, 1000)); 58 } 59 60 console.log('All workers restarted'); 61 } 62 63 // Trigger restart on SIGUSR2 64 process.on('SIGUSR2', restartWorkers); 65 66 cluster.on('exit', (worker, code, signal) => { 67 if (!worker.exitedAfterDisconnect && code !== 0) { 68 console.log(`Worker ${worker.process.pid} crashed. Restarting...`); 69 forkWorker(); 70 } 71 }); 72} else { 73 // Worker code 74 const server = http.createServer((req, res) => { 75 res.writeHead(200); 76 res.end(`Worker ${process.pid}\n`); 77 }); 78 79 server.listen(3000, () => { 80 console.log(`Worker ${process.pid} listening`); 81 }); 82 83 process.on('message', (msg) => { 84 if (msg === 'shutdown') { 85 server.close(() => { 86 process.exit(0); 87 }); 88 } 89 }); 90}

Shared State with IPC#

1import cluster from 'cluster'; 2import http from 'http'; 3import os from 'os'; 4 5interface Message { 6 type: string; 7 data?: any; 8} 9 10if (cluster.isPrimary) { 11 // Shared state in primary process 12 const stats = { 13 totalRequests: 0, 14 requestsByWorker: {} as Record<number, number>, 15 }; 16 17 for (let i = 0; i < os.cpus().length; i++) { 18 const worker = cluster.fork(); 19 20 worker.on('message', (msg: Message) => { 21 if (msg.type === 'request') { 22 stats.totalRequests++; 23 stats.requestsByWorker[worker.process.pid!] = 24 (stats.requestsByWorker[worker.process.pid!] || 0) + 1; 25 } 26 27 if (msg.type === 'getStats') { 28 worker.send({ type: 'stats', data: stats }); 29 } 30 }); 31 } 32 33 // Log stats periodically 34 setInterval(() => { 35 console.log('Stats:', stats); 36 }, 5000); 37} else { 38 const server = http.createServer((req, res) => { 39 // Notify primary of request 40 process.send?.({ type: 'request' }); 41 42 if (req.url === '/stats') { 43 process.send?.({ type: 'getStats' }); 44 45 const handler = (msg: Message) => { 46 if (msg.type === 'stats') { 47 res.writeHead(200, { 'Content-Type': 'application/json' }); 48 res.end(JSON.stringify(msg.data)); 49 process.off('message', handler); 50 } 51 }; 52 53 process.on('message', handler); 54 } else { 55 res.writeHead(200); 56 res.end(`Worker ${process.pid}\n`); 57 } 58 }); 59 60 server.listen(3000); 61}

Sticky Sessions#

1import cluster from 'cluster'; 2import http from 'http'; 3import net from 'net'; 4import os from 'os'; 5 6const numCPUs = os.cpus().length; 7 8if (cluster.isPrimary) { 9 const workers: cluster.Worker[] = []; 10 11 // Fork workers 12 for (let i = 0; i < numCPUs; i++) { 13 workers.push(cluster.fork()); 14 } 15 16 // Create proxy server 17 const server = net.createServer({ pauseOnConnect: true }, (connection) => { 18 // Get client IP for sticky routing 19 const remoteAddress = connection.remoteAddress || ''; 20 21 // Simple hash to pick worker 22 let hash = 0; 23 for (let i = 0; i < remoteAddress.length; i++) { 24 hash = ((hash << 5) - hash) + remoteAddress.charCodeAt(i); 25 hash = hash & hash; 26 } 27 28 const workerIndex = Math.abs(hash) % workers.length; 29 const worker = workers[workerIndex]; 30 31 // Send connection to worker 32 worker.send('connection', connection); 33 }); 34 35 server.listen(3000, () => { 36 console.log('Primary listening on port 3000'); 37 }); 38} else { 39 const server = http.createServer((req, res) => { 40 res.writeHead(200); 41 res.end(`Handled by worker ${process.pid}\n`); 42 }); 43 44 // Don't bind to port, receive connections from primary 45 server.listen(0, 'localhost'); 46 47 process.on('message', (message, connection) => { 48 if (message === 'connection') { 49 server.emit('connection', connection); 50 (connection as net.Socket).resume(); 51 } 52 }); 53}

PM2 Alternative#

1// ecosystem.config.js for PM2 2module.exports = { 3 apps: [{ 4 name: 'api', 5 script: './dist/server.js', 6 instances: 'max', // Use all CPUs 7 exec_mode: 'cluster', 8 watch: false, 9 max_memory_restart: '1G', 10 env: { 11 NODE_ENV: 'production', 12 }, 13 // Zero-downtime restart 14 wait_ready: true, 15 listen_timeout: 10000, 16 kill_timeout: 5000, 17 }], 18}; 19 20// In your app, signal ready 21process.send?.('ready'); 22 23// Handle graceful shutdown 24process.on('SIGINT', () => { 25 // Close connections, etc. 26 process.exit(0); 27});

Load Testing#

# Install autocannon (HTTP/1.1 benchmarking tool)
npm install -g autocannon

# Test single instance
# -c 100: 100 concurrent connections, -d 10: run for 10 seconds
node app-single.js &
autocannon -c 100 -d 10 http://localhost:3000

# Test cluster (same load, compare requests/sec against the single run)
node app-cluster.js &
autocannon -c 100 -d 10 http://localhost:3000

Best Practices#

Scaling: ✓ Match workers to CPU cores ✓ Implement graceful shutdown ✓ Handle worker crashes ✓ Use zero-downtime restarts

State: ✓ Keep workers stateless ✓ Use external storage (Redis) ✓ IPC for coordination only ✓ Avoid shared mutable state

Monitoring: ✓ Track per-worker metrics ✓ Monitor memory usage ✓ Log worker lifecycle events ✓ Set up health checks

Production: ✓ Consider PM2 or similar ✓ Use process managers ✓ Configure restart limits ✓ Set memory limits

Conclusion#

Clustering scales Node.js across all CPU cores. Implement graceful shutdown for production reliability, use IPC sparingly for coordination, and keep workers stateless. For production, consider PM2 or Kubernetes for advanced orchestration. Always test with load to verify scaling benefits.

Share this article

Help spread the word about Bootspring