Database Indexing Strategies for Better Performance

Indexes are among the most important tools for query optimization. A well-designed index can cut a query's run time from seconds to milliseconds. Here's how to use them effectively.

How Indexes Work#

Without index (table scan):
Query: SELECT * FROM users WHERE email = 'john@example.com'

Table: 1,000,000 rows
Operation: Scan every row → 1,000,000 comparisons
Time: O(n)

With index (index lookup):
B-tree index on email column
Operation: Tree traversal → ~20 comparisons
Time: O(log n)

Index Types#

B-Tree (Default)#

-- Most common, works for equality and range queries
CREATE INDEX idx_users_email ON users(email);

-- Good for:
SELECT * FROM users WHERE email = 'john@example.com';
SELECT * FROM users WHERE created_at > '2024-01-01';
-- In PostgreSQL a plain B-tree serves LIKE 'prefix%' only under the C
-- collation; otherwise create the index with text_pattern_ops.
SELECT * FROM users WHERE name LIKE 'John%';  -- prefix search
SELECT * FROM users ORDER BY created_at DESC;

Hash Index#

-- Only for equality comparisons. Can beat a B-tree for exact matches, but the
-- gain is often small, so benchmark before choosing it.
-- PostgreSQL: hash indexes are crash-safe and replicated only from version 10 on.
CREATE INDEX idx_users_email_hash ON users USING HASH (email);

-- Good for:
SELECT * FROM users WHERE email = 'john@example.com';

-- NOT good for:
SELECT * FROM users WHERE email LIKE 'john%';  -- Can't use hash
SELECT * FROM users ORDER BY email;  -- Can't use for sorting

GIN (Generalized Inverted Index)#

-- For array, JSONB, and full-text search
CREATE INDEX idx_users_tags ON users USING GIN (tags);
CREATE INDEX idx_products_data ON products USING GIN (metadata);

-- Good for:
SELECT * FROM users WHERE tags @> ARRAY['developer'];
SELECT * FROM products WHERE metadata @> '{"featured": true}';

GiST (Generalized Search Tree)#

-- For geometric and full-text data
CREATE INDEX idx_locations_point ON locations USING GIST (coordinates);
CREATE INDEX idx_documents_search ON documents USING GIST (search_vector);

-- Good for:
SELECT * FROM locations
WHERE coordinates <@ box '((0,0),(100,100))';

Composite Indexes#

Column Order Matters#

-- Index on (a, b, c)
CREATE INDEX idx_composite ON orders(customer_id, status, created_at);

-- Can use index:
WHERE customer_id = 123
WHERE customer_id = 123 AND status = 'pending'
WHERE customer_id = 123 AND status = 'pending' AND created_at > '2024-01-01'

-- Cannot use index efficiently:
WHERE status = 'pending'  -- Skips first column
-- Skips middle column: the index typically narrows the scan by customer_id
-- only; created_at is then checked against every entry for that customer.
WHERE customer_id = 123 AND created_at > '2024-01-01'

Design Principles#

-- Put equality conditions first, range conditions last.
-- Equality columns let the B-tree descend to one contiguous run of entries;
-- a trailing range column then bounds the scan within that run.
-- Good
CREATE INDEX idx_orders_status_customer_created ON orders(status, customer_id, created_at);
WHERE status = 'pending' AND customer_id = 123 AND created_at > '2024-01-01'

-- Put high-selectivity columns first
-- If status has 5 values but customer_id has 100,000 values:
-- (Index names are unique per schema, so this one needs a name distinct
-- from the index above.)
CREATE INDEX idx_orders_customer_status ON orders(customer_id, status);

Covering Indexes (Index-Only Scans)#

-- Include all columns needed by query
CREATE INDEX idx_users_email_name ON users(email) INCLUDE (name, created_at);

-- Query satisfied entirely from index, no table access
SELECT name, created_at FROM users WHERE email = 'john@example.com';

-- Check if using index-only scan
EXPLAIN SELECT name, created_at FROM users WHERE email = 'john@example.com';
-- Look for "Index Only Scan"

Partial Indexes#

-- Index only subset of rows
CREATE INDEX idx_active_users ON users(email) WHERE status = 'active';
CREATE INDEX idx_recent_orders ON orders(customer_id) WHERE created_at > '2024-01-01';

-- Much smaller than full index
-- Only used when query matches WHERE clause

-- Good for:
SELECT * FROM users WHERE email = 'john@example.com' AND status = 'active';

Expression Indexes#

-- Index on computed values
-- A bare function call may be written directly inside the column list.
CREATE INDEX idx_users_lower_email ON users(LOWER(email));
-- NOTE(review): index expressions must be immutable. This assumes created_at
-- is timestamp without time zone (or date); on timestamptz the result depends
-- on the session time zone and PostgreSQL rejects the index. Confirm the type.
CREATE INDEX idx_orders_year ON orders(EXTRACT(YEAR FROM created_at));
-- Any expression that is not a plain function call (here: arithmetic plus a
-- cast) needs its own parentheses in addition to the column-list parentheses.
CREATE INDEX idx_products_price_cents ON products(((price * 100)::integer));

-- Query must match expression exactly
SELECT * FROM users WHERE LOWER(email) = 'john@example.com';  -- Uses index
SELECT * FROM users WHERE email = 'john@example.com';  -- Doesn't use index

Analyzing Index Usage#

EXPLAIN ANALYZE#

EXPLAIN ANALYZE SELECT * FROM users WHERE email = 'john@example.com';

-- Output:
Index Scan using idx_users_email on users  (cost=0.42..8.44 rows=1 width=128) (actual time=0.028..0.029 rows=1 loops=1)
  Index Cond: (email = 'john@example.com'::text)
Planning Time: 0.081 ms
Execution Time: 0.048 ms

-- Key things to look for:
-- - "Index Scan" or "Index Only Scan" = using index
-- - "Seq Scan" = table scan (might need index)
-- - "Bitmap Index Scan" = using index, then fetching rows
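-- - estimated rows vs actual rows (a large gap suggests stale statistics; run ANALYZE)
-- - actual time is in milliseconds; cost is in arbitrary planner units, so the two are not directly comparable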

Finding Unused Indexes#

-- PostgreSQL: Check index usage stats
-- Lists indexes that have never been scanned, largest first.
-- Unique and primary-key indexes are excluded: they enforce constraints even
-- when no query reads them, so a zero scan count does not make them droppable.
-- idx_scan is cumulative since the last statistics reset on this server, so
-- judge it over a representative period of traffic.
SELECT
  s.schemaname || '.' || s.relname AS table,
  s.indexrelname AS index,
  s.idx_scan AS times_used,
  pg_size_pretty(pg_relation_size(s.indexrelid)) AS size
FROM pg_stat_user_indexes AS s
INNER JOIN pg_index AS i
  ON i.indexrelid = s.indexrelid
WHERE s.idx_scan = 0
  AND NOT i.indisunique
ORDER BY pg_relation_size(s.indexrelid) DESC;

Finding Missing Indexes#

-- Check for sequential scans on large tables
SELECT
  schemaname || '.' || relname AS table,
  seq_scan,
  seq_tup_read,
  idx_scan,
  n_live_tup AS row_count
FROM pg_stat_user_tables
WHERE seq_scan > 0
  AND n_live_tup > 10000
ORDER BY seq_tup_read DESC;

Common Patterns#

Foreign Key Indexes#

-- Always index foreign keys (not automatic in PostgreSQL)
CREATE TABLE orders (
  id SERIAL PRIMARY KEY,
  customer_id INTEGER REFERENCES customers(id),
  product_id INTEGER REFERENCES products(id)
);

CREATE INDEX idx_orders_customer ON orders(customer_id);
CREATE INDEX idx_orders_product ON orders(product_id);

Pagination#

-- Keyset pagination is faster than OFFSET
CREATE INDEX idx_posts_created ON posts(created_at DESC, id DESC);

-- Instead of:
SELECT * FROM posts ORDER BY created_at DESC LIMIT 20 OFFSET 1000;

-- Use:
SELECT * FROM posts
WHERE (created_at, id) < ('2024-01-15', 12345)
ORDER BY created_at DESC, id DESC
LIMIT 20;

Soft Deletes#

-- Partial index for non-deleted rows
CREATE INDEX idx_users_active_email ON users(email) WHERE deleted_at IS NULL;

-- Most queries only need active records
SELECT * FROM users WHERE email = 'john@example.com' AND deleted_at IS NULL;

Anti-Patterns#

Over-Indexing#

-- Don't index every column
-- Each index:
-- - Uses disk space
-- - Slows down writes (INSERT, UPDATE, DELETE)
-- - Requires maintenance

-- Only index columns that are:
-- - Used in WHERE clauses
-- - Used in JOIN conditions
-- - Used in ORDER BY

Low Selectivity Indexes#

-- Boolean columns often not worth indexing
CREATE INDEX idx_users_active ON users(is_active);  -- Usually not helpful

-- Better: partial index
CREATE INDEX idx_users_inactive ON users(email) WHERE is_active = false;

Functions Preventing Index Use#

-- Index on email won't help:
SELECT * FROM users WHERE LOWER(email) = 'john@example.com';
-- NOTE(review): this holds for casts that change the column's type; a no-op
-- cast on a text/varchar column can still match the index in PostgreSQL.
-- Verify with EXPLAIN.
SELECT * FROM users WHERE email::text = 'john@example.com';
SELECT * FROM users WHERE COALESCE(email, '') = 'john@example.com';

-- Either create expression index or rewrite query

Maintenance#

Monitoring Index Bloat#

-- List indexes by size alongside their scan counts (PostgreSQL).
-- This reports size and usage only, not bloat itself: large, rarely scanned
-- indexes are the ones worth a closer look. To measure bloat directly,
-- consider pgstatindex() from the pgstattuple extension (reports leaf density).
SELECT
  schemaname || '.' || relname AS table,
  indexrelname AS index,
  pg_size_pretty(pg_relation_size(indexrelid)) AS size,
  idx_scan AS scans
FROM pg_stat_user_indexes
ORDER BY pg_relation_size(indexrelid) DESC;

Rebuilding Indexes#

-- Rebuild without blocking writes (PostgreSQL 12+)
REINDEX INDEX CONCURRENTLY idx_users_email;

-- Or create new and swap (also works before PostgreSQL 12).
-- The CONCURRENTLY forms cannot run inside a transaction block, so issue
-- each statement on its own.
CREATE INDEX CONCURRENTLY idx_users_email_new ON users(email);
-- A plain DROP INDEX takes an ACCESS EXCLUSIVE lock on the table and would
-- block reads and writes; CONCURRENTLY avoids that.
DROP INDEX CONCURRENTLY idx_users_email;
ALTER INDEX idx_users_email_new RENAME TO idx_users_email;

Indexes are essential for query performance but come with trade-offs. Start with indexes on foreign keys and frequently-queried columns. Use EXPLAIN ANALYZE to verify index usage. Monitor for unused indexes and remove them.

Remember: the best index is one that helps your actual queries. Profile first, then optimize.