Slow queries can cripple application performance. This guide covers techniques to analyze, understand, and optimize SQL queries for PostgreSQL and other databases.
## Understanding Query Execution
### EXPLAIN ANALYZE
Always start with understanding how your query executes:
```sql
EXPLAIN ANALYZE
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
WHERE u.created_at > '2024-01-01'
GROUP BY u.id;

-- Output:
-- HashAggregate (cost=1234.56..1234.78 rows=100 width=40) (actual time=45.123..45.456 rows=98 loops=1)
--   Group Key: u.id
--   ->  Hash Right Join (cost=123.45..1200.00 rows=5000 width=36) (actual time=1.234..40.567 rows=4532 loops=1)
--         Hash Cond: (o.user_id = u.id)
--         ->  Seq Scan on orders o (cost=0.00..800.00 rows=50000 width=8) (actual time=0.012..20.345 rows=50000 loops=1)
--         ->  Hash (cost=100.00..100.00 rows=1000 width=36) (actual time=1.000..1.000 rows=1000 loops=1)
--               ->  Seq Scan on users u (cost=0.00..100.00 rows=1000 width=36) (actual time=0.010..0.800 rows=1000 loops=1)
--                     Filter: (created_at > '2024-01-01'::date)
-- Planning Time: 0.234 ms
-- Execution Time: 45.789 ms
```

### Key Metrics to Watch
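One of the most actionable checks in an EXPLAIN ANALYZE plan is comparing estimated row counts against actual ones. As a small sketch of how you might automate that check, here is a Python parser for the plan-line format shown above (the sample line is copied from that output; the regex is my own, not part of any PostgreSQL tooling):

```python
import re

# Matches "... rows=EST width=N) (actual time=A..B rows=ACT ..." in a plan line
PLAN_RE = re.compile(r"rows=(\d+) width=\d+\) \(actual time=[\d.]+\.\.[\d.]+ rows=(\d+)")

def row_estimate_ratio(plan_line: str) -> float:
    """Return actual/estimated row ratio; values far from 1.0 suggest stale statistics."""
    m = PLAN_RE.search(plan_line)
    if not m:
        raise ValueError("no row counts found in plan line")
    estimated, actual = int(m.group(1)), int(m.group(2))
    return actual / max(estimated, 1)

line = ("Hash Right Join (cost=123.45..1200.00 rows=5000 width=36) "
        "(actual time=1.234..40.567 rows=4532 loops=1)")
print(round(row_estimate_ratio(line), 2))  # 4532 estimated vs 5000 actual -> ~0.91
```

A ratio near 1.0 means the planner's statistics are accurate; an order-of-magnitude gap is a hint to run `ANALYZE` on the table.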
```sql
-- Cost: Estimated work (arbitrary units)
-- Rows: Estimated vs actual row counts
-- Time: Planning + Execution time
-- Loops: Number of times the operation executed

-- Look for:
-- 1. Seq Scan on large tables (missing index?)
-- 2. Large difference between estimated and actual rows
-- 3. Nested loops with high loop counts
-- 4. Sort operations on large datasets
```

## Common Optimizations
### 1. Use Appropriate Indexes
```sql
-- Before: Seq Scan
SELECT * FROM orders WHERE customer_email = 'john@example.com';

-- Add index
CREATE INDEX idx_orders_customer_email ON orders(customer_email);

-- After: Index Scan (much faster)

-- Composite index for multiple columns
CREATE INDEX idx_orders_status_date ON orders(status, created_at);

-- Covers queries like:
SELECT * FROM orders WHERE status = 'pending' AND created_at > '2024-01-01';
```

### 2. Avoid SELECT *
```sql
-- Bad: Fetches all columns
SELECT * FROM users WHERE id = 123;

-- Good: Only fetch what you need
SELECT id, name, email FROM users WHERE id = 123;

-- Even better with a covering index
CREATE INDEX idx_users_id_covering ON users(id) INCLUDE (name, email);
-- Enables an index-only scan
```

### 3. Optimize JOINs
```sql
-- Ensure join columns are indexed
CREATE INDEX idx_orders_user_id ON orders(user_id);

-- Use the appropriate join type
-- INNER JOIN when you need matches in both tables
SELECT u.name, o.total
FROM users u
INNER JOIN orders o ON o.user_id = u.id;

-- LEFT JOIN when you need all rows from the left table
-- (COUNT(o.id) is already 0 for users with no orders)
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
GROUP BY u.id;
```

### 4. Limit Result Sets
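This section's "better" option, keyset pagination, is easiest to see as a loop that carries a cursor forward. A runnable sketch using Python's built-in sqlite3 as a stand-in for PostgreSQL (the table, data, and page size are made up); note the `id` tie-breaker, which keeps pages stable when `created_at` values repeat:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE products (id INTEGER PRIMARY KEY, category TEXT, created_at TEXT)")
rows = [(i, "electronics", f"2024-01-{i % 28 + 1:02d}") for i in range(1, 101)]
conn.executemany("INSERT INTO products VALUES (?, ?, ?)", rows)

def fetch_page(cursor, page_size=20):
    """cursor is the (id, created_at) of the last row seen, or None for page 1."""
    if cursor is None:
        sql = """SELECT id, created_at FROM products WHERE category = ?
                 ORDER BY created_at DESC, id DESC LIMIT ?"""
        return conn.execute(sql, ("electronics", page_size)).fetchall()
    last_id, ts = cursor
    # Strictly "before" the cursor row, with id breaking created_at ties
    sql = """SELECT id, created_at FROM products
             WHERE category = ?
               AND (created_at < ? OR (created_at = ? AND id < ?))
             ORDER BY created_at DESC, id DESC LIMIT ?"""
    return conn.execute(sql, ("electronics", ts, ts, last_id, page_size)).fetchall()

page1 = fetch_page(None)
page2 = fetch_page(page1[-1])  # last row of page 1 becomes the next cursor
print(len(page1), len(page2), set(page1) & set(page2))  # two full pages, no overlap
```

Unlike OFFSET, each page costs the same regardless of how deep into the result set it is, because the index seek starts at the cursor.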
```sql
-- Bad: Fetch everything, filter in the app
SELECT * FROM products WHERE category = 'electronics';

-- Good: Paginate in the database
SELECT * FROM products
WHERE category = 'electronics'
ORDER BY created_at DESC
LIMIT 20 OFFSET 0;

-- Better: Keyset pagination for large datasets
SELECT * FROM products
WHERE category = 'electronics'
  AND created_at < '2024-01-15 10:30:00'
ORDER BY created_at DESC
LIMIT 20;
```

### 5. Optimize Subqueries
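To confirm that the rewrite below is safe, here is a check (sqlite3 as a stand-in for PostgreSQL, made-up data) that the correlated-subquery form and the JOIN + GROUP BY form return identical results:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT);
    CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER);
    INSERT INTO users VALUES (1, 'ann'), (2, 'bob'), (3, 'cal');
    INSERT INTO orders (user_id) VALUES (1), (1), (1), (2);
""")

# Correlated subquery: re-executed once per users row
correlated = conn.execute("""
    SELECT u.name,
           (SELECT COUNT(*) FROM orders WHERE user_id = u.id) AS order_count
    FROM users u ORDER BY u.id
""").fetchall()

# Rewritten as a single LEFT JOIN + GROUP BY: one aggregation pass
joined = conn.execute("""
    SELECT u.name, COUNT(o.id) AS order_count
    FROM users u
    LEFT JOIN orders o ON o.user_id = u.id
    GROUP BY u.id, u.name ORDER BY u.id
""").fetchall()

print(correlated)           # [('ann', 3), ('bob', 1), ('cal', 0)]
print(correlated == joined) # True
```

The LEFT JOIN preserves users with zero orders, so the two forms agree even on the empty case.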
```sql
-- Bad: Correlated subquery (runs once per row)
SELECT u.name,
  (SELECT COUNT(*) FROM orders WHERE user_id = u.id) as order_count
FROM users u;

-- Good: JOIN with aggregation
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
GROUP BY u.id, u.name;

-- Alternative: LATERAL join for complex subqueries
SELECT u.name, recent.total
FROM users u
LEFT JOIN LATERAL (
  SELECT SUM(total) as total
  FROM orders
  WHERE user_id = u.id
    AND created_at > NOW() - INTERVAL '30 days'
) recent ON true;
```

### 6. Use EXISTS Instead of IN
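A caveat worth stating up front: modern PostgreSQL usually plans `IN (subquery)` and `EXISTS` as the same semi-join, so measure before rewriting; the advice matters more on older planners and other engines. Either way, the two forms are interchangeable for non-NULL keys, as this sqlite3 sketch (made-up data) verifies:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT);
    CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER, total REAL);
    INSERT INTO users VALUES (1, 'ann'), (2, 'bob'), (3, 'cal');
    INSERT INTO orders (user_id, total) VALUES (1, 1500), (1, 200), (3, 2400);
""")

in_rows = conn.execute("""
    SELECT id, name FROM users
    WHERE id IN (SELECT user_id FROM orders WHERE total > 1000)
    ORDER BY id
""").fetchall()

exists_rows = conn.execute("""
    SELECT u.id, u.name FROM users u
    WHERE EXISTS (SELECT 1 FROM orders o
                  WHERE o.user_id = u.id AND o.total > 1000)
    ORDER BY u.id
""").fetchall()

print(in_rows)                 # users 1 and 3 have a qualifying order
print(in_rows == exists_rows)  # True
```

The one place the forms genuinely diverge is `NOT IN` versus `NOT EXISTS`: a single NULL in the subquery result makes `NOT IN` return no rows, so prefer `NOT EXISTS` for anti-joins.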
```sql
-- Can be slower with large subquery results
SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders WHERE total > 1000);

-- Often faster: EXISTS can stop at the first match
-- (recent PostgreSQL versions typically plan both as a semi-join)
SELECT * FROM users u
WHERE EXISTS (
  SELECT 1 FROM orders o
  WHERE o.user_id = u.id AND o.total > 1000
);
```

### 7. Optimize GROUP BY
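The day-bucketing pattern shown below with `date_trunc` can be tried out locally with sqlite3, which lacks `date_trunc`; in this made-up example `strftime('%Y-%m-%d', ...)` plays the same role:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, created_at TEXT)")
conn.executemany("INSERT INTO orders (created_at) VALUES (?)", [
    ("2024-01-01 09:00:00",), ("2024-01-01 17:30:00",),
    ("2024-01-02 08:15:00",),
])

# SQLite has no date_trunc; strftime truncates timestamps to day granularity here
daily = conn.execute("""
    SELECT strftime('%Y-%m-%d', created_at) AS day, COUNT(*)
    FROM orders
    GROUP BY day
    ORDER BY day
""").fetchall()
print(daily)  # [('2024-01-01', 2), ('2024-01-02', 1)]
```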
```sql
-- Add an index on the grouped columns
CREATE INDEX idx_orders_status ON orders(status);

-- Filter before grouping
SELECT status, COUNT(*)
FROM orders
WHERE created_at > '2024-01-01'  -- Filter first
GROUP BY status;

-- Bucket by day for time-series aggregation
SELECT
  date_trunc('day', created_at) as day,
  COUNT(*)
FROM orders
WHERE created_at > NOW() - INTERVAL '30 days'
GROUP BY date_trunc('day', created_at)
ORDER BY day;
```

## Advanced Techniques
### Materialized Views
```sql
-- Create a materialized view for expensive aggregations
CREATE MATERIALIZED VIEW daily_sales AS
SELECT
  date_trunc('day', created_at) as day,
  COUNT(*) as order_count,
  SUM(total) as revenue
FROM orders
GROUP BY date_trunc('day', created_at);

-- A UNIQUE index is required for REFRESH ... CONCURRENTLY
CREATE UNIQUE INDEX idx_daily_sales_day ON daily_sales(day);

-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY daily_sales;

-- Query the view instead
SELECT * FROM daily_sales WHERE day > '2024-01-01';
```

### Partitioning
```sql
-- Partition large tables by time
CREATE TABLE orders (
  id SERIAL,
  user_id INTEGER,
  total DECIMAL,
  created_at TIMESTAMP
) PARTITION BY RANGE (created_at);

-- Create partitions
CREATE TABLE orders_2024_q1 PARTITION OF orders
  FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');

CREATE TABLE orders_2024_q2 PARTITION OF orders
  FOR VALUES FROM ('2024-04-01') TO ('2024-07-01');

-- Queries automatically use partition pruning
SELECT * FROM orders WHERE created_at > '2024-03-01';
-- Only scans orders_2024_q1 and later partitions
```

### Common Table Expressions (CTEs)
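CTEs work the same way across engines, so the query below can be exercised locally. A runnable miniature of it (sqlite3 stand-in, made-up data, date filters dropped for brevity):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT);
    CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER, total REAL);
    INSERT INTO users VALUES (1, 'ann'), (2, 'bob');
    INSERT INTO orders (user_id, total) VALUES (1, 10.0), (1, 25.0);
""")

# Same shape as the article's query: aggregate in a CTE, then LEFT JOIN
result = conn.execute("""
    WITH user_orders AS (
        SELECT user_id, SUM(total) AS total_spent
        FROM orders
        GROUP BY user_id
    )
    SELECT u.name, COALESCE(uo.total_spent, 0) AS total_spent
    FROM users u
    LEFT JOIN user_orders uo ON uo.user_id = u.id
    ORDER BY total_spent DESC
""").fetchall()
print(result)  # [('ann', 35.0), ('bob', 0)]
```

One PostgreSQL-specific note: before version 12, CTEs were optimization fences (always materialized); newer versions inline them unless you write `WITH ... AS MATERIALIZED`.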
```sql
-- Use CTEs for readability and reuse
WITH active_users AS (
  SELECT id, name
  FROM users
  WHERE last_login > NOW() - INTERVAL '30 days'
),
user_orders AS (
  SELECT user_id, SUM(total) as total_spent
  FROM orders
  WHERE created_at > NOW() - INTERVAL '30 days'
  GROUP BY user_id
)
SELECT
  au.name,
  COALESCE(uo.total_spent, 0) as total_spent
FROM active_users au
LEFT JOIN user_orders uo ON uo.user_id = au.id
ORDER BY total_spent DESC;
```

## Query Anti-Patterns
### 1. Functions on Indexed Columns
```sql
-- Bad: Can't use a plain index on email
SELECT * FROM users WHERE LOWER(email) = 'john@example.com';

-- Good: Expression index
CREATE INDEX idx_users_email_lower ON users(LOWER(email));

-- Or store normalized data
ALTER TABLE users ADD COLUMN email_normalized VARCHAR(255);
CREATE INDEX idx_users_email_normalized ON users(email_normalized);
```

### 2. Implicit Type Conversions
```sql
-- Bad: user_id is INTEGER, comparing to a string
SELECT * FROM orders WHERE user_id = '123';

-- Good: Use the correct type
SELECT * FROM orders WHERE user_id = 123;
```

### 3. OR Conditions
```sql
-- Bad: OR can prevent index usage
SELECT * FROM orders
WHERE status = 'pending' OR status = 'processing';

-- Good: Use IN
SELECT * FROM orders
WHERE status IN ('pending', 'processing');

-- Or UNION for conditions on different columns
-- (UNION, not UNION ALL, so rows matching both conditions aren't duplicated)
SELECT * FROM orders WHERE status = 'pending'
UNION
SELECT * FROM orders WHERE customer_id = 123;
```

### 4. LIKE with Leading Wildcard
```sql
-- Bad: Can't use a B-tree index
SELECT * FROM products WHERE name LIKE '%phone%';

-- Good: Full-text search
CREATE INDEX idx_products_search ON products USING GIN(to_tsvector('english', name));

SELECT * FROM products
WHERE to_tsvector('english', name) @@ to_tsquery('phone');
```

## Monitoring Query Performance
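When pg_stat_statements isn't available (or you're not on PostgreSQL), the same idea can be approximated in the application: per-query call counts and cumulative execution time. A minimal sketch using sqlite3; the wrapper and its names are my own, not a real library:

```python
import sqlite3
import time
from collections import defaultdict

# Tiny application-side stand-in for pg_stat_statements:
# accumulate calls and total wall-clock time per SQL string
stats = defaultdict(lambda: {"calls": 0, "total_time": 0.0})

def timed_execute(conn, sql, params=()):
    start = time.perf_counter()
    rows = conn.execute(sql, params).fetchall()
    entry = stats[sql]
    entry["calls"] += 1
    entry["total_time"] += time.perf_counter() - start
    return rows

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE t (x INTEGER)")
conn.executemany("INSERT INTO t VALUES (?)", [(i,) for i in range(100)])

for _ in range(3):
    timed_execute(conn, "SELECT COUNT(*) FROM t")

# Equivalent of "ORDER BY total_exec_time DESC LIMIT 1"
slowest = max(stats.items(), key=lambda kv: kv[1]["total_time"])
print(slowest[0], slowest[1]["calls"])
```

This catches hot queries at the application layer; the pg_stat_statements queries below do the same server-side, across all clients.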
```sql
-- Enable query statistics
-- (requires pg_stat_statements in shared_preload_libraries, then:)
CREATE EXTENSION pg_stat_statements;

-- Find the slowest queries
SELECT
  query,
  calls,
  total_exec_time / 1000 as total_seconds,
  mean_exec_time / 1000 as avg_seconds,
  rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;

-- Find queries reading the most blocks
SELECT
  query,
  shared_blks_read + shared_blks_hit as total_blocks,
  rows
FROM pg_stat_statements
ORDER BY shared_blks_read DESC
LIMIT 20;
```

## Conclusion
Query optimization is iterative. Start with EXPLAIN ANALYZE, identify bottlenecks, add appropriate indexes, and restructure queries. Monitor production queries continuously to catch regressions early.