Slow queries impact user experience and server resources. Learn to analyze and optimize SQL for better database performance.
## Understanding EXPLAIN
```sql
-- PostgreSQL EXPLAIN
EXPLAIN ANALYZE
SELECT u.name, COUNT(o.id) AS order_count
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
WHERE u.created_at > '2024-01-01'
GROUP BY u.id;

-- Output interpretation
Aggregate  (cost=1234.56..1234.78 rows=100 width=40)
           (actual time=45.123..45.234 rows=100 loops=1)
  ->  Hash Join  (cost=100.00..1200.00 rows=1000 width=24)
                 (actual time=5.123..40.456 rows=1000 loops=1)
        Hash Cond: (o.user_id = u.id)
        ->  Seq Scan on orders o  -- Sequential scan (slow!)
        ->  Hash
              ->  Index Scan on users u
                    Filter: (created_at > '2024-01-01')
Planning Time: 0.5 ms
Execution Time: 45.5 ms
```

## Common Performance Issues
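The first anti-pattern below, N+1, usually originates in application code: a loop that issues one query per row. Here is what it looks like, and its single-query fix, sketched with Python's stdlib sqlite3 as a stand-in database (the schema and data are invented for the demo):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT);
    CREATE TABLE orders (id INTEGER PRIMARY KEY, user_id INTEGER, total REAL);
    INSERT INTO users VALUES (1, 'Ada'), (2, 'Linus');
    INSERT INTO orders VALUES (1, 1, 10.0), (2, 1, 20.0), (3, 2, 5.0);
""")

# ❌ N+1: one query (and, over a network, one round trip) per user
def order_counts_n_plus_one(conn):
    counts = {}
    for (user_id,) in conn.execute("SELECT id FROM users"):
        (n,) = conn.execute(
            "SELECT COUNT(*) FROM orders WHERE user_id = ?", (user_id,)
        ).fetchone()
        counts[user_id] = n
    return counts

# ✅ One query with a join
def order_counts_joined(conn):
    rows = conn.execute("""
        SELECT u.id, COUNT(o.id)
        FROM users u
        LEFT JOIN orders o ON o.user_id = u.id
        GROUP BY u.id
    """)
    return dict(rows)

assert order_counts_n_plus_one(conn) == order_counts_joined(conn) == {1: 2, 2: 1}
```

Both functions return the same counts, but the joined version does it in one statement regardless of how many users exist.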
```sql
-- ❌ N+1 problem
SELECT * FROM users;
-- Then for each user:
SELECT * FROM orders WHERE user_id = ?;

-- ✅ Join in a single query
SELECT u.*, o.*
FROM users u
LEFT JOIN orders o ON o.user_id = u.id;

-- ❌ SELECT *
SELECT * FROM users WHERE id = 1;

-- ✅ Select only needed columns
SELECT id, name, email FROM users WHERE id = 1;

-- ❌ Functions on indexed columns
-- (YEAR() is MySQL; the PostgreSQL equivalent is EXTRACT(YEAR FROM created_at))
SELECT * FROM users WHERE YEAR(created_at) = 2024;

-- ✅ Use a range instead
SELECT * FROM users
WHERE created_at >= '2024-01-01'
  AND created_at < '2025-01-01';
```

## Index Optimization
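Before writing an index migration, it is worth watching the plan actually change. A small experiment using Python's stdlib sqlite3, whose `EXPLAIN QUERY PLAN` is a rough analog of PostgreSQL's `EXPLAIN` (table and index names are invented for the demo):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, email TEXT, status TEXT)")

def plan(sql):
    # EXPLAIN QUERY PLAN rows describe each step in their last column
    return " ".join(row[-1] for row in conn.execute("EXPLAIN QUERY PLAN " + sql))

query = "SELECT id FROM users WHERE email = 'a@b.com'"
before = plan(query)   # expect a full-table SCAN step

conn.execute("CREATE INDEX idx_users_email ON users(email)")
after = plan(query)    # expect a SEARCH step using the new index

assert "SCAN" in before and "SEARCH" in after
```

The same workflow applies in PostgreSQL: run `EXPLAIN` on the query, create the index, and confirm the plan switches from a sequential scan to an index scan.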
```sql
-- Create indexes for common queries
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_created_at ON orders(created_at);

-- Composite indexes for multi-column queries
-- Order matters! Put columns used with equality filters first;
-- a query filtering on status alone cannot use this index.
CREATE INDEX idx_orders_user_status ON orders(user_id, status);

-- Partial indexes for filtered queries
CREATE INDEX idx_active_users ON users(email)
WHERE status = 'active';

-- INCLUDE columns to enable index-only scans (PostgreSQL 11+)
CREATE INDEX idx_orders_user_cover ON orders(user_id)
INCLUDE (total, status);
```

## Query Refactoring
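Whenever you refactor a query, verify the rewrite returns the same rows as the original before comparing plans. A sketch checking the correlated-subquery-to-CTE rewrite shown below, using stdlib sqlite3 with invented demo data:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE products (id INTEGER PRIMARY KEY, category_id INTEGER, price REAL);
    INSERT INTO products VALUES
        (1, 1, 10), (2, 1, 30),   -- category 1 average price = 20
        (3, 2, 5),  (4, 2, 15);   -- category 2 average price = 10
""")

correlated = """
    SELECT id FROM products p
    WHERE price > (SELECT AVG(price) FROM products WHERE category_id = p.category_id)
"""
cte = """
    WITH category_avg AS (
        SELECT category_id, AVG(price) AS avg_price FROM products GROUP BY category_id
    )
    SELECT p.id FROM products p
    JOIN category_avg ca ON ca.category_id = p.category_id
    WHERE p.price > ca.avg_price
"""

# Both forms must return the same products: 2 and 4 exceed their category average
assert sorted(conn.execute(correlated)) == sorted(conn.execute(cte)) == [(2,), (4,)]
```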
```sql
-- ❌ Correlated subquery (re-evaluated for every outer row)
SELECT *
FROM products p
WHERE price > (
    SELECT AVG(price)
    FROM products
    WHERE category_id = p.category_id
);

-- ✅ JOIN against a CTE computed once
WITH category_avg AS (
    SELECT category_id, AVG(price) AS avg_price
    FROM products
    GROUP BY category_id
)
SELECT p.*
FROM products p
JOIN category_avg ca ON ca.category_id = p.category_id
WHERE p.price > ca.avg_price;

-- ❌ OR across different columns (often prevents index usage)
SELECT * FROM users
WHERE email = 'a@b.com' OR phone = '123';

-- ✅ UNION lets each branch use its own index
SELECT * FROM users WHERE email = 'a@b.com'
UNION
SELECT * FROM users WHERE phone = '123';

-- ❌ LIKE with a leading wildcard (cannot use a B-tree index)
SELECT * FROM products WHERE name LIKE '%widget%';

-- ✅ Use full-text search
CREATE INDEX idx_products_name_fts ON products
USING gin(to_tsvector('english', name));

SELECT * FROM products
WHERE to_tsvector('english', name) @@ to_tsquery('widget');
```

## Pagination Optimization
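The keyset (cursor-based) technique shown in SQL below is easy to exercise end to end. A sketch with stdlib sqlite3, which supports the same row-value comparison `(created_at, id) < (?, ?)` as PostgreSQL (the 30-row dataset is invented for the demo):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE products (id INTEGER PRIMARY KEY, created_at TEXT)")
conn.executemany(
    "INSERT INTO products VALUES (?, ?)",
    [(i, f"2024-01-{i:02d}") for i in range(1, 31)],  # 30 rows, one per day
)

def fetch_page(cursor_key=None, page_size=10):
    """Keyset pagination: the (created_at, id) of the last row seen is the cursor."""
    if cursor_key is None:
        sql = "SELECT id, created_at FROM products ORDER BY created_at DESC, id DESC LIMIT ?"
        return conn.execute(sql, (page_size,)).fetchall()
    sql = """
        SELECT id, created_at FROM products
        WHERE (created_at, id) < (?, ?)
        ORDER BY created_at DESC, id DESC LIMIT ?
    """
    return conn.execute(sql, (*cursor_key, page_size)).fetchall()

page1 = fetch_page()
page2 = fetch_page(cursor_key=(page1[-1][1], page1[-1][0]))

assert [r[0] for r in page1] == list(range(30, 20, -1))  # newest 10 rows
assert [r[0] for r in page2] == list(range(20, 10, -1))  # the next 10, no overlap
```

Each page is fetched with an index-friendly range predicate; cost stays flat no matter how deep the reader paginates, unlike OFFSET.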
```sql
-- ❌ Slow for large offsets
SELECT * FROM products
ORDER BY created_at DESC
LIMIT 20 OFFSET 10000;
-- Has to scan 10,020 rows

-- ✅ Keyset pagination (cursor-based)
SELECT * FROM products
WHERE created_at < '2024-01-15T10:30:00Z'
ORDER BY created_at DESC
LIMIT 20;

-- ✅ With a tie-breaker for non-unique columns
SELECT * FROM products
WHERE (created_at, id) < ('2024-01-15T10:30:00Z', 'last-id')
ORDER BY created_at DESC, id DESC
LIMIT 20;
```

## Aggregation Optimization
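The FILTER clause below is standard PostgreSQL; the CASE variant runs on virtually any engine. A quick check of the single-scan CASE approach using stdlib sqlite3 (the orders data is invented for the demo):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, status TEXT)")
conn.executemany(
    "INSERT INTO orders (status) VALUES (?)",
    [("pending",)] * 3 + [("completed",)] * 5,
)

# One pass over the table instead of one subquery (and one scan) per status
row = conn.execute("""
    SELECT
        SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END),
        SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END)
    FROM orders
""").fetchone()

assert row == (3, 5)
```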
```sql
-- ❌ One subquery (and one scan) per count
SELECT
    (SELECT COUNT(*) FROM orders WHERE status = 'pending') AS pending,
    (SELECT COUNT(*) FROM orders WHERE status = 'completed') AS completed;

-- ✅ Single scan with FILTER
SELECT
    COUNT(*) FILTER (WHERE status = 'pending') AS pending,
    COUNT(*) FILTER (WHERE status = 'completed') AS completed
FROM orders;

-- ✅ Or use CASE
SELECT
    SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) AS pending,
    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed
FROM orders;
```

## Join Optimization
```sql
-- Prefer explicit JOINs over subqueries
-- ❌ Subquery
SELECT * FROM orders
WHERE user_id IN (
    SELECT id FROM users WHERE country = 'US'
);

-- ✅ JOIN
SELECT o.*
FROM orders o
JOIN users u ON u.id = o.user_id
WHERE u.country = 'US';

-- Use EXISTS for existence checks
-- ❌ IN with large results
SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders);

-- ✅ EXISTS (can stop at the first matching row)
SELECT * FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o WHERE o.user_id = u.id
);
```

## Batch Operations
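Batching applies at the driver level too. In Python's stdlib sqlite3, `executemany` submits the whole batch in one call; against a networked database the savings are far larger, since each individual statement would cost a round trip (table and data are invented for the demo):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE logs (id INTEGER PRIMARY KEY, message TEXT)")

messages = [("msg1",), ("msg2",), ("msg3",)]

# ❌ One statement per row:
# for m in messages:
#     conn.execute("INSERT INTO logs (message) VALUES (?)", m)

# ✅ One batched call; the statement is prepared once and reused per row
conn.executemany("INSERT INTO logs (message) VALUES (?)", messages)
conn.commit()

(count,) = conn.execute("SELECT COUNT(*) FROM logs").fetchone()
assert count == 3
```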
```sql
-- ❌ Multiple individual inserts
INSERT INTO logs (message) VALUES ('msg1');
INSERT INTO logs (message) VALUES ('msg2');
INSERT INTO logs (message) VALUES ('msg3');

-- ✅ Batch insert
INSERT INTO logs (message) VALUES
    ('msg1'),
    ('msg2'),
    ('msg3');

-- ❌ Update one by one
UPDATE products SET price = 100 WHERE id = 1;
UPDATE products SET price = 200 WHERE id = 2;

-- ✅ Batch update
UPDATE products
SET price = CASE id
    WHEN 1 THEN 100
    WHEN 2 THEN 200
END
WHERE id IN (1, 2);

-- ✅ Or use VALUES (PostgreSQL)
UPDATE products p
SET price = v.new_price
FROM (VALUES (1, 100), (2, 200)) AS v(id, new_price)
WHERE p.id = v.id;
```

## Monitoring Queries
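The views below aggregate timings on the server. You can complement them with application-side instrumentation; here is a hedged sketch (the `timed_execute` helper and `SLOW_QUERY_MS` threshold are hypothetical, not a standard API, and the threshold is set to 0 so the demo records something):

```python
import sqlite3
import time

SLOW_QUERY_MS = 0.0  # hypothetical threshold; in practice use e.g. 100 ms

slow_log = []

def timed_execute(conn, sql, params=()):
    """Run a query and record it if it exceeds the slow-query threshold."""
    start = time.perf_counter()
    rows = conn.execute(sql, params).fetchall()
    elapsed_ms = (time.perf_counter() - start) * 1000
    if elapsed_ms >= SLOW_QUERY_MS:
        slow_log.append((sql, elapsed_ms))
    return rows

conn = sqlite3.connect(":memory:")
timed_execute(conn, "SELECT 1")

assert slow_log and slow_log[0][0] == "SELECT 1"
```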
```sql
-- PostgreSQL: find slow queries (requires the pg_stat_statements extension)
SELECT
    query,
    calls,
    mean_exec_time,
    total_exec_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

-- Find tables that may be missing indexes
SELECT
    schemaname,
    relname,
    seq_scan,
    seq_tup_read,
    idx_scan
FROM pg_stat_user_tables
WHERE seq_scan > idx_scan
ORDER BY seq_tup_read DESC;

-- Find unused indexes
SELECT
    indexrelname,
    idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0;
```

## Best Practices
**DO:**

- ✓ Always check EXPLAIN before shipping a query to production
- ✓ Index columns used in WHERE, JOIN, and ORDER BY
- ✓ Use covering indexes for frequent queries
- ✓ Batch operations when possible
- ✓ Use connection pooling
- ✓ Monitor slow query logs

**DON'T:**

- ✗ Select more columns than needed
- ✗ Use functions on indexed columns in WHERE clauses
- ✗ Ignore query plans
- ✗ Create indexes on low-cardinality columns
- ✗ Use OFFSET for deep pagination
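Two of the DO items, batching and connection pooling, live in application code rather than SQL. A minimal pooling sketch (the `ConnectionPool` class is hypothetical, for illustration only; in production use your driver's or framework's pool, such as psycopg_pool or SQLAlchemy's built-in pooling):

```python
import queue
import sqlite3
from contextlib import contextmanager

class ConnectionPool:
    """Hypothetical minimal pool: hand out pre-opened connections, return on release."""
    def __init__(self, factory, size=5):
        self._pool = queue.Queue(maxsize=size)
        for _ in range(size):
            self._pool.put(factory())  # pay the connection cost once, up front

    @contextmanager
    def connection(self):
        conn = self._pool.get()        # blocks if every connection is checked out
        try:
            yield conn
        finally:
            self._pool.put(conn)       # return to the pool instead of closing

pool = ConnectionPool(lambda: sqlite3.connect(":memory:"), size=2)
with pool.connection() as conn:
    assert conn.execute("SELECT 1").fetchone() == (1,)
```

The point of pooling is amortizing connection setup (TCP handshake, auth, backend process spawn in PostgreSQL) across many queries instead of paying it per request.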
## Conclusion
Query optimization is iterative. Use EXPLAIN to understand execution plans, add appropriate indexes, and refactor queries that cause full table scans.
Monitor your production queries regularly—performance degrades as data grows.