Distributed systems are inherently complex. Understanding core patterns helps you build systems that handle failures gracefully and scale effectively.
The Challenges#
Distributed systems must handle:
- Network failures (partitions, latency)
- Node failures (crashes, restarts)
- Timing issues (clock skew, ordering)
- Partial failures (some nodes fail, others don't)
The CAP theorem:
It states that a distributed system can guarantee at most two of the following three properties:
- Consistency: All nodes see same data
- Availability: Every request gets a response
- Partition tolerance: System works despite network issues
In practice, you must handle partitions, so choose between CP or AP.
Consensus Patterns#
Leader Election#
// Simple leader election with Redis.
// NOTE(review): assumes an ioredis-style `redis` client is in scope — confirm.
class LeaderElection {
  private lockKey = 'leader-lock';
  private lockTTL = 30000; // 30 seconds
  private heartbeatInterval = 10000; // 10 seconds
  // Handle for the renewal timer so it can be cleared when leadership is lost.
  private heartbeatTimer: ReturnType<typeof setInterval> | null = null;

  /**
   * Attempt to acquire the leader lock. Returns true (and starts the
   * renewal heartbeat) when this node became the leader.
   */
  async tryBecomeLeader(nodeId: string): Promise<boolean> {
    const acquired = await redis.set(
      this.lockKey,
      nodeId,
      'NX', // Only if not exists
      'PX', this.lockTTL
    );

    if (acquired) {
      this.startHeartbeat(nodeId);
      return true;
    }
    return false;
  }

  /**
   * Periodically extend the lock TTL while we still hold it.
   * Fix: the original created an interval that was never stored or
   * cleared, so a node that lost leadership kept polling Redis forever.
   * The timer is now kept and cleared once another node owns the lock.
   * NOTE(review): the GET + PEXPIRE pair is not atomic — another node
   * could acquire the lock between the two calls. A Lua script would
   * make the renewal atomic; verify against the Redis client in use.
   */
  private startHeartbeat(nodeId: string): void {
    this.stopHeartbeat(); // avoid stacking timers on repeated acquisitions
    this.heartbeatTimer = setInterval(async () => {
      const currentLeader = await redis.get(this.lockKey);
      if (currentLeader === nodeId) {
        await redis.pexpire(this.lockKey, this.lockTTL);
      } else {
        // Lost leadership (TTL expired or another node took over): stop renewing.
        this.stopHeartbeat();
      }
    }, this.heartbeatInterval);
  }

  /** Stop the renewal timer, if one is running. */
  private stopHeartbeat(): void {
    if (this.heartbeatTimer !== null) {
      clearInterval(this.heartbeatTimer);
      this.heartbeatTimer = null;
    }
  }

  /** Return the node id currently holding the lock, or null. */
  async getCurrentLeader(): Promise<string | null> {
    return redis.get(this.lockKey);
  }
}
Distributed Locking#
// Redlock algorithm for distributed locks
import Redlock from 'redlock';

// Quorum is taken across independent Redis instances, so a single
// Redis failure does not invalidate a held lock.
const redlock = new Redlock(
  [redis1, redis2, redis3], // Multiple Redis instances
  {
    driftFactor: 0.01,
    retryCount: 10,
    retryDelay: 200,
    retryJitter: 200,
  }
);

/**
 * Run `doWork` while holding an exclusive cross-process lock on the
 * resource. The lock is released even when the work throws.
 */
async function processExclusively(resourceId: string): Promise<void> {
  const resourceLock = await redlock.acquire([`lock:${resourceId}`], 30000);
  try {
    await doWork(resourceId);
  } finally {
    await resourceLock.release();
  }
}
Consistency Patterns#
Eventual Consistency#
1// Accept eventual consistency with conflict resolution
2interface Document {
3 id: string;
4 content: string;
5 vectorClock: Map<string, number>;
6 lastModified: Date;
7}
8
9function resolveConflict(a: Document, b: Document): Document {
10 // Last-writer-wins
11 if (a.lastModified > b.lastModified) return a;
12 if (b.lastModified > a.lastModified) return b;
13
14 // If same timestamp, use deterministic tiebreaker
15 return a.id < b.id ? a : b;
16}
// Or use CRDTs for automatic resolution:
// Conflict-free Replicated Data Types
Read-Your-Writes Consistency#
// Ensure user sees their own writes.
// Minimal contracts for the injected backends — the original referenced
// this.storage / this.cache without ever declaring them.
interface VersionedStorage {
  write(key: string, value: unknown): Promise<number>;
  readWithMinVersion(key: string, minVersion: number): Promise<any>;
}

interface VersionCache {
  get(key: string): Promise<number | null | undefined>;
  set(key: string, version: number): Promise<void>;
}

class SessionConsistency {
  // Fix: storage and cache are now declared and injected; the original
  // class used them without declaring them and could not compile.
  constructor(
    private readonly storage: VersionedStorage,
    private readonly cache: VersionCache,
  ) {}

  /** Write a value and remember the resulting version for this user. */
  async write(key: string, value: unknown, userId: string): Promise<void> {
    const version = await this.storage.write(key, value);
    await this.cache.set(`lastWrite:${userId}:${key}`, version);
  }

  /**
   * Read a value, guaranteeing the caller sees at least the version of
   * their own most recent write (read-your-writes consistency).
   * With no recorded write for this user, any replica version (>= 0) is fine.
   */
  async read(key: string, userId: string): Promise<any> {
    // ?? instead of ||: defaults only when no version was recorded.
    const minVersion = (await this.cache.get(`lastWrite:${userId}:${key}`)) ?? 0;

    // Read from replica that has at least this version
    return this.storage.readWithMinVersion(key, minVersion);
  }
}
Partitioning Patterns#
Consistent Hashing#
import { createHash } from 'crypto';

/**
 * Consistent hash ring mapping keys to nodes. Each node is inserted at
 * `replicas` virtual points so keys spread evenly and only ~1/N of the
 * keys move when a node joins or leaves.
 */
class ConsistentHash {
  private ring: Map<number, string> = new Map();
  private sortedKeys: number[] = [];
  private replicas = 150;

  /** Insert a node at `replicas` virtual positions on the ring. */
  addNode(node: string): void {
    for (let i = 0; i < this.replicas; i++) {
      const hash = this.hash(`${node}:${i}`);
      this.ring.set(hash, node);
      this.sortedKeys.push(hash);
    }
    this.sortedKeys.sort((a, b) => a - b);
  }

  /**
   * Remove all of a node's virtual positions.
   * Fix: the original ran a full array filter per virtual node
   * (O(replicas * n)); one pass with a Set of removed hashes suffices.
   */
  removeNode(node: string): void {
    const removed = new Set<number>();
    for (let i = 0; i < this.replicas; i++) {
      const hash = this.hash(`${node}:${i}`);
      this.ring.delete(hash);
      removed.add(hash);
    }
    this.sortedKeys = this.sortedKeys.filter(k => !removed.has(k));
  }

  /**
   * Map a key to the first node clockwise from the key's hash.
   * Fix: the original returned `undefined` (hidden by `!`) when the ring
   * was empty; now it throws explicitly. Lookup uses binary search
   * instead of a linear findIndex scan.
   */
  getNode(key: string): string {
    if (this.sortedKeys.length === 0) {
      throw new Error('ConsistentHash: no nodes in the ring');
    }
    const hash = this.hash(key);
    const idx = this.firstAtOrAbove(hash);
    const nodeHash =
      idx === this.sortedKeys.length ? this.sortedKeys[0] : this.sortedKeys[idx];
    return this.ring.get(nodeHash)!;
  }

  /** Binary search: index of the first ring position >= hash (or length). */
  private firstAtOrAbove(hash: number): number {
    let lo = 0;
    let hi = this.sortedKeys.length;
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (this.sortedKeys[mid] < hash) lo = mid + 1;
      else hi = mid;
    }
    return lo;
  }

  /** First 32 bits of MD5 as an unsigned integer position on the ring. */
  private hash(key: string): number {
    return parseInt(createHash('md5').update(key).digest('hex').slice(0, 8), 16);
  }
}
Failure Handling Patterns#
Circuit Breaker#
enum CircuitState {
  Closed = 'closed',
  Open = 'open',
  HalfOpen = 'half-open',
}

/**
 * Circuit breaker: fail fast once a dependency has produced `threshold`
 * consecutive failures, then probe it again after `timeout` ms.
 */
class CircuitBreaker {
  private state = CircuitState.Closed;
  private failures = 0;
  private lastFailure: Date | null = null;
  // Consecutive successes observed while half-open.
  private halfOpenSuccesses = 0;

  constructor(
    private threshold: number = 5,
    private timeout: number = 60000,
    private halfOpenRequests: number = 3,
  ) {}

  /**
   * Run `fn` through the breaker. Throws immediately while the circuit
   * is open; otherwise forwards the call and records the outcome.
   */
  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === CircuitState.Open) {
      if (Date.now() - this.lastFailure!.getTime() > this.timeout) {
        this.state = CircuitState.HalfOpen;
        this.halfOpenSuccesses = 0;
      } else {
        throw new Error('Circuit breaker is open');
      }
    }

    try {
      const result = await fn();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  private onSuccess(): void {
    if (this.state === CircuitState.HalfOpen) {
      // Fix: the original closed the circuit on the first half-open
      // success, leaving the halfOpenRequests parameter unused. Require
      // `halfOpenRequests` consecutive successes before fully closing.
      this.halfOpenSuccesses++;
      if (this.halfOpenSuccesses < this.halfOpenRequests) {
        return;
      }
    }
    this.failures = 0;
    this.state = CircuitState.Closed;
  }

  private onFailure(): void {
    this.failures++;
    this.lastFailure = new Date();
    // A half-open probe failure reopens immediately; otherwise open
    // only once the failure threshold is reached.
    if (this.state === CircuitState.HalfOpen || this.failures >= this.threshold) {
      this.state = CircuitState.Open;
    }
  }
}
Bulkhead#
1// Isolate failures to prevent cascade
2class Bulkhead {
3 private semaphores: Map<string, number> = new Map();
4
5 constructor(private limits: Map<string, number>) {
6 for (const [name, limit] of limits) {
7 this.semaphores.set(name, limit);
8 }
9 }
10
11 async execute<T>(partition: string, fn: () => Promise<T>): Promise<T> {
12 const available = this.semaphores.get(partition) || 0;
13
14 if (available <= 0) {
15 throw new Error(`Bulkhead ${partition} is full`);
16 }
17
18 this.semaphores.set(partition, available - 1);
19
20 try {
21 return await fn();
22 } finally {
23 this.semaphores.set(partition, this.semaphores.get(partition)! + 1);
24 }
25 }
26}
// Usage: give each downstream dependency its own concurrency budget.
const bulkhead = new Bulkhead(new Map([
  ['payment-service', 10],
  ['email-service', 5],
  ['analytics', 20],
]));

// Payment service failure won't affect email service
await bulkhead.execute('payment-service', () => processPayment());
await bulkhead.execute('email-service', () => sendEmail());
Retry with Exponential Backoff#
/**
 * Retry an async operation with exponential backoff.
 *
 * The delay before attempt k+1 is min(baseDelay * 2^k, maxDelay); with
 * `jitter` enabled it is scaled by a random factor in [0.5, 1.5) to
 * spread out retry storms.
 *
 * @throws RangeError when maxRetries < 1 (the original fell through the
 *         loop and threw `undefined` in that case).
 * @throws the last error from `fn` once all attempts are exhausted.
 */
async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  options: {
    maxRetries: number;
    baseDelay: number;
    maxDelay: number;
    jitter: boolean;
  }
): Promise<T> {
  if (options.maxRetries < 1) {
    throw new RangeError(`maxRetries must be >= 1, got ${options.maxRetries}`);
  }

  let lastError: Error;

  for (let attempt = 0; attempt < options.maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error as Error;

      // No delay after the final attempt — rethrow immediately below.
      if (attempt < options.maxRetries - 1) {
        let delay = Math.min(
          options.baseDelay * Math.pow(2, attempt),
          options.maxDelay
        );

        if (options.jitter) {
          delay = delay * (0.5 + Math.random());
        }

        await new Promise(r => setTimeout(r, delay));
      }
    }
  }

  throw lastError!;
}
Saga Pattern#
1// Manage distributed transactions
2interface SagaStep {
3 execute(): Promise<void>;
4 compensate(): Promise<void>;
5}
6
7class Saga {
8 private completedSteps: SagaStep[] = [];
9
10 async execute(steps: SagaStep[]): Promise<void> {
11 for (const step of steps) {
12 try {
13 await step.execute();
14 this.completedSteps.push(step);
15 } catch (error) {
16 await this.rollback();
17 throw error;
18 }
19 }
20 }
21
22 private async rollback(): Promise<void> {
23 // Compensate in reverse order
24 for (const step of this.completedSteps.reverse()) {
25 try {
26 await step.compensate();
27 } catch (error) {
28 console.error('Compensation failed:', error);
29 // Log for manual intervention
30 }
31 }
32 }
33}
// Usage: order fulfillment as a saga — when any step throws, the steps
// that already ran are compensated in reverse order.
const orderSaga = new Saga();
await orderSaga.execute([
  {
    execute: () => reserveInventory(orderId),
    compensate: () => releaseInventory(orderId),
  },
  {
    execute: () => chargePayment(orderId),
    compensate: () => refundPayment(orderId),
  },
  {
    execute: () => shipOrder(orderId),
    compensate: () => cancelShipment(orderId),
  },
]);
Conclusion#
Distributed systems require thinking about failures at every level. Use consensus for coordination, understand consistency tradeoffs, and implement resilience patterns like circuit breakers and bulkheads.
Test failure scenarios aggressively. The patterns that seem theoretical become critical when systems fail at 3 AM.