Data loss can destroy a business. A solid backup strategy protects against hardware failures, human errors, and disasters. Here's how to implement bulletproof backups.
Backup Types#
Full Backup:
- Complete copy of database
- Longest to create
- Fastest to restore
- Use for: Weekly/monthly backups
Incremental Backup:
- Only changes since last backup
- Fastest to create
- Requires all increments to restore
- Use for: Daily/hourly backups
Differential Backup:
- Changes since last full backup
- Medium creation time
- Only needs full + latest differential
- Use for: Daily backups
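To make the trade-off concrete, here's a hypothetical TypeScript sketch (the `Backup` type and `restoreChain` helper are illustrative, not from any library) that computes which files a restore needs: a full plus every incremental after it, versus a full plus only the latest differential.

```typescript
// Illustrative types: a backup is a file with a kind and a timestamp.
type BackupKind = 'full' | 'incremental' | 'differential';

interface Backup {
  id: string;
  kind: BackupKind;
  takenAt: number; // epoch millis
}

// Restore chain for a target point in time: the latest full backup at or
// before `target`, plus either every incremental taken after it, or only
// the latest differential taken after it.
function restoreChain(history: Backup[], target: number): Backup[] {
  const sorted = [...history]
    .filter((b) => b.takenAt <= target)
    .sort((a, b) => a.takenAt - b.takenAt);

  const lastFullIdx = sorted.map((b) => b.kind).lastIndexOf('full');
  if (lastFullIdx === -1) throw new Error('no full backup before target');

  const after = sorted.slice(lastFullIdx + 1);
  const incrementals = after.filter((b) => b.kind === 'incremental');
  const differentials = after.filter((b) => b.kind === 'differential');

  // A differential already contains all changes since the full,
  // so when one exists only the latest differential is needed.
  if (differentials.length > 0) {
    return [sorted[lastFullIdx], differentials[differentials.length - 1]];
  }
  return [sorted[lastFullIdx], ...incrementals];
}
```

The incremental chain grows with every backup taken, which is why a corrupted increment anywhere in the chain can break the restore; the differential chain stays at two files.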
PostgreSQL Backup#
pg_dump (Logical Backup)#
# Full database backup (custom format)
pg_dump -h localhost -U postgres -d mydb -F c -f backup.dump

# Specific tables
pg_dump -h localhost -U postgres -d mydb -t users -t orders -F c -f partial.dump

# Schema only
pg_dump -h localhost -U postgres -d mydb --schema-only -f schema.sql

# Data only
pg_dump -h localhost -U postgres -d mydb --data-only -f data.sql

# Compressed plain-text backup
pg_dump -h localhost -U postgres -d mydb | gzip > backup.sql.gz

# Restore (the target database must already exist)
pg_restore -h localhost -U postgres -d mydb backup.dump

# Restore with options
pg_restore -h localhost -U postgres -d mydb \
  --clean --if-exists \
  --no-owner --no-privileges \
  backup.dump
pg_basebackup (Physical Backup)#
# Physical backup for point-in-time recovery
pg_basebackup -h localhost -U replication -D /backup/base \
  -Fp -Xs -P -R

# With compression (tar format)
pg_basebackup -h localhost -U replication -D /backup/base \
  -Ft -z -Xs -P
Continuous Archiving (WAL)#
# postgresql.conf
archive_mode = on
archive_command = 'cp %p /archive/wal/%f'
wal_level = replica

# Point-in-time recovery (PostgreSQL 12+: set these in postgresql.conf
# and create an empty recovery.signal file in the data directory)
restore_command = 'cp /archive/wal/%f %p'
recovery_target_time = '2024-01-15 14:30:00'
Automated Backup Script#
#!/bin/bash
# backup.sh

set -e

# Configuration
DB_HOST="${DB_HOST:-localhost}"
DB_NAME="${DB_NAME:-mydb}"
DB_USER="${DB_USER:-postgres}"
BACKUP_DIR="/backups"
RETENTION_DAYS=30
S3_BUCKET="my-backups"
DATE=$(date +%Y%m%d_%H%M%S)

# Create backup
BACKUP_FILE="${BACKUP_DIR}/${DB_NAME}_${DATE}.dump"
echo "Creating backup: ${BACKUP_FILE}"

pg_dump -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" \
  -F c -f "$BACKUP_FILE"

# Compress
gzip "$BACKUP_FILE"
BACKUP_FILE="${BACKUP_FILE}.gz"

# Upload to S3
echo "Uploading to S3..."
aws s3 cp "$BACKUP_FILE" "s3://${S3_BUCKET}/postgres/${DB_NAME}/"

# Cleanup old local backups
echo "Cleaning up old backups..."
find "$BACKUP_DIR" -name "*.dump.gz" -mtime +"$RETENTION_DAYS" -delete

# Cleanup old S3 backups (date -d requires GNU date)
aws s3 ls "s3://${S3_BUCKET}/postgres/${DB_NAME}/" | \
  while read -r line; do
    createDate=$(echo "$line" | awk '{print $1" "$2}')
    createDate=$(date -d "$createDate" +%s)
    olderThan=$(date -d "-${RETENTION_DAYS} days" +%s)
    if [[ $createDate -lt $olderThan ]]; then
      fileName=$(echo "$line" | awk '{print $4}')
      aws s3 rm "s3://${S3_BUCKET}/postgres/${DB_NAME}/$fileName"
    fi
  done

echo "Backup complete: ${BACKUP_FILE}"
Kubernetes CronJob#
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
spec:
  schedule: "0 2 * * *" # Daily at 2 AM
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: backup
              # Note: the stock postgres image does not include the AWS CLI;
              # use an image with both the postgres client tools and awscli.
              image: postgres:15
              command:
                - /bin/bash
                - -c
                - |
                  pg_dump -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -F c | \
                    gzip | \
                    aws s3 cp - "s3://$S3_BUCKET/backups/$(date +%Y%m%d).dump.gz"
              env:
                - name: DB_HOST
                  value: postgres-service
                - name: DB_NAME
                  value: mydb
                - name: S3_BUCKET
                  value: my-backups
                - name: DB_USER
                  valueFrom:
                    secretKeyRef:
                      name: postgres-secret
                      key: username
                - name: PGPASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: postgres-secret
                      key: password
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: aws-secret
                      key: access-key
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: aws-secret
                      key: secret-key
          restartPolicy: OnFailure
Recovery Procedures#
// Recovery runbook
const recoveryProcedure = {
  steps: [
    {
      name: 'Assess the situation',
      actions: [
        'Identify what data was lost',
        'Determine recovery point needed',
        'Notify stakeholders',
      ],
    },
    {
      name: 'Prepare recovery environment',
      actions: [
        'Provision new database server if needed',
        'Download backup from S3',
        'Verify backup integrity',
      ],
    },
    {
      name: 'Restore database',
      commands: [
        'createdb -h localhost -U postgres mydb_restored',
        'pg_restore -h localhost -U postgres -d mydb_restored backup.dump',
      ],
    },
    {
      name: 'Verify data',
      actions: [
        'Run data integrity checks',
        'Compare row counts',
        'Test application connectivity',
      ],
    },
    {
      name: 'Switch traffic',
      actions: [
        'Update connection strings',
        'Restart application servers',
        'Monitor for errors',
      ],
    },
  ],
};
Backup Testing#
// Automated backup verification
import { exec as execCb } from 'child_process';
import { promisify } from 'util';

const exec = promisify(execCb);

interface CheckResult {
  passed: boolean;
  details: unknown[];
}

async function verifyBackup(backupPath: string): Promise<boolean> {
  const tempDb = `verify_${Date.now()}`;

  try {
    // Create temp database
    await exec(`createdb ${tempDb}`);

    // Restore backup
    await exec(`pg_restore -d ${tempDb} ${backupPath}`);

    // Run integrity checks (verifyConstraints, verifyIndexes, and
    // runSampleQueries are app-specific helpers like verifyTableCounts below)
    const checks = await Promise.all([
      verifyTableCounts(tempDb),
      verifyConstraints(tempDb),
      verifyIndexes(tempDb),
      runSampleQueries(tempDb),
    ]);

    return checks.every((c) => c.passed);
  } finally {
    // Cleanup
    await exec(`dropdb ${tempDb}`);
  }
}

async function verifyTableCounts(db: string): Promise<CheckResult> {
  const result = await query(db, `
    SELECT schemaname, relname, n_live_tup
    FROM pg_stat_user_tables
    ORDER BY n_live_tup DESC
  `);

  // Compare with expected counts (n_live_tup is an estimate, so allow drift)
  const expected = await getExpectedCounts();
  const mismatches = result.filter(
    (row) => Math.abs(row.n_live_tup - expected[row.relname]) > 100
  );

  return {
    passed: mismatches.length === 0,
    details: mismatches,
  };
}
Disaster Recovery Plan#
## DR Checklist

### RTO (Recovery Time Objective): 4 hours
### RPO (Recovery Point Objective): 1 hour

### Primary Failure
1. [ ] Detect failure (monitoring alerts)
2. [ ] Assess impact
3. [ ] Initiate failover to replica
4. [ ] Update DNS/load balancer
5. [ ] Verify application connectivity
6. [ ] Notify stakeholders

### Regional Failure
1. [ ] Activate DR region
2. [ ] Restore from cross-region backup
3. [ ] Update global DNS
4. [ ] Scale infrastructure
5. [ ] Verify all services operational

### Data Corruption
1. [ ] Identify affected tables/timeframe
2. [ ] Stop writes to affected tables
3. [ ] Restore from point-in-time backup
4. [ ] Reconcile any lost transactions
5. [ ] Resume normal operations

### Contact List
- DBA On-Call: +1-xxx-xxx-xxxx
- Infrastructure: +1-xxx-xxx-xxxx
- CTO: +1-xxx-xxx-xxxx
Best Practices#
DO:
✓ Test backups regularly (monthly)
✓ Store backups in multiple regions
✓ Encrypt backups at rest
✓ Monitor backup job success
✓ Document recovery procedures
✓ Practice recovery drills
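Monitoring backup job success pairs naturally with watching your RPO. A minimal sketch, assuming you can fetch the epoch-millis timestamps of completed backups (the function names and the one-hour budget are illustrative, not from any library):

```typescript
// RPO budget: maximum tolerable data loss, here 1 hour (illustrative).
const RPO_MS = 60 * 60 * 1000;

// If the newest completed backup is older than the RPO window,
// a failure right now would lose more data than the budget allows.
function rpoViolated(backupTimestamps: number[], now: number): boolean {
  if (backupTimestamps.length === 0) return true;
  const newest = Math.max(...backupTimestamps);
  return now - newest > RPO_MS;
}
```

Alert on this condition rather than on individual job failures alone: a job that silently stops being scheduled violates RPO without ever "failing".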
DON'T:
✗ Store backups only on same server
✗ Skip backup verification
✗ Ignore backup failures
✗ Keep backups indefinitely (storage costs)
✗ Forget to backup WAL for PITR
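The retention point can be expressed as a tiny policy function, mirroring the age-based cleanup in the bash script earlier (the names and the 30-day window are illustrative assumptions):

```typescript
// Age-based retention: keep everything newer than the window, expire the rest.
const RETENTION_DAYS = 30; // illustrative window
const DAY_MS = 24 * 60 * 60 * 1000;

function partitionByRetention(timestamps: number[], now: number) {
  const cutoff = now - RETENTION_DAYS * DAY_MS;
  return {
    keep: timestamps.filter((t) => t >= cutoff),
    expire: timestamps.filter((t) => t < cutoff),
  };
}
```

Separating the decision (which backups to expire) from the deletion itself makes the policy easy to unit-test and to dry-run before pointing it at real storage.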
Conclusion#
Backups are insurance: you hope never to need them, but when you do, they're invaluable. Automate backups, test recovery regularly, and document procedures for when disaster strikes.
The time to prepare for data loss is before it happens.