Advanced Indexing Strategies
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
B-Tree Index Fundamentals
-- Standard B-tree index
CREATE INDEX idx_users_email ON users (email);
-- Composite B-tree index
CREATE INDEX idx_orders_customer_date ON orders (customer_id, order_date DESC);
-- Unique constraint (creates unique B-tree index)
ALTER TABLE users ADD CONSTRAINT uk_users_email UNIQUE (email);
βΉοΈ
Key Insight: B-tree indexes support equality and range queries. The column order in composite indexes matters: put equality columns first, then range columns, then sorting columns.
Partial Indexes
-- Index only active users
CREATE INDEX idx_active_users ON users (email)
WHERE status = 'active';
-- Index recent orders
CREATE INDEX idx_recent_orders ON orders (customer_id, order_date)
WHERE order_date > CURRENT_DATE - INTERVAL '90' DAY;
-- Index pending tasks
CREATE INDEX idx_pending_tasks ON tasks (assigned_to, priority)
WHERE status = 'pending';
GIN Index for Arrays and JSON
-- GIN index for array containment
CREATE INDEX idx_user_tags ON users USING GIN (tags);
-- GIN index for JSONB
CREATE INDEX idx_user_data ON users USING GIN (data);
-- GIN index with specific operator class
CREATE INDEX idx_user_data_path ON users USING GIN (data jsonb_path_ops);
GiST Index for Geospatial
-- GiST index for geometric data
CREATE INDEX idx_locations ON stores USING GiST (
ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)
);
-- GiST index for range types
CREATE INDEX idx_booking_dates ON bookings USING GiST (
tsrange(check_in, check_out)
);
Expression Indexes
-- Index on function result
CREATE INDEX idx_users_lower_email ON users (LOWER(email));
-- Index on computed value
CREATE INDEX idx_orders_total ON orders (
(quantity * unit_price * (1 - discount))
);
-- Index on date extraction
CREATE INDEX idx_orders_month ON orders (EXTRACT(MONTH FROM order_date));
β οΈ
Performance Tip: Expression indexes are only used when the query uses the exact same expression. Make sure your query matches the index definition precisely.
Covering Indexes
-- Covering index with INCLUDE
CREATE INDEX idx_orders_covering ON orders (customer_id, order_date)
INCLUDE (total_amount, status);
-- Verify index-only scan
EXPLAIN (ANALYZE)
SELECT order_date, total_amount, status
FROM orders
WHERE customer_id = 123;
BRIN Index for Large Tables
-- Block Range Index for time-series data
CREATE INDEX idx_events_timestamp ON events USING BRIN (event_time);
-- BRIN with custom page range
CREATE INDEX idx_logs_time ON logs USING BRIN (created_at)
WITH (pages_per_range = 32);
Multi-Column Index Strategies
-- Optimal column order
-- 1. Equality predicates
-- 2. Range predicates
-- 3. Sort columns
-- 4. SELECT list columns (INCLUDE)
CREATE INDEX idx_optimal ON orders (
status, -- Equality
order_date, -- Range
customer_id -- Sort
) INCLUDE (total_amount);
Index Maintenance
-- Check index size
SELECT
indexname,
pg_size_pretty(pg_relation_size(indexname::regclass)) AS size
FROM pg_indexes
WHERE tablename = 'employees';
-- Rebuild bloated indexes
REINDEX INDEX idx_users_email;
-- Check index usage statistics
SELECT
indexrelname,
idx_scan,
idx_tup_read,
idx_tup_fetch,
pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE relname = 'employees'
ORDER BY idx_scan DESC;
Concurrent Index Creation
-- Create index without locking table
CREATE INDEX CONCURRENTLY idx_large_table ON large_table (column_name);
-- Check for invalid indexes
SELECT
indexrelname,
indisvalid,
indisready
FROM pg_stat_user_indexes
INNER JOIN pg_index ON indexrelid = pg_stat_user_indexes.indexrelid
WHERE NOT indisvalid;
Index-Only Scan Optimization
-- Ensure visibility map is up to date
VACUUM employees;
-- Check for index-only scans
EXPLAIN (ANALYZE, BUFFERS)
SELECT department_id, COUNT(*)
FROM employees
GROUP BY department_id;
Specialized Index Types
-- Hash index for equality-only queries
CREATE INDEX idx_users_email_hash ON users USING HASH (email);
-- SP-GiST index for partitioned data
CREATE INDEX idx_ip_addresses ON ip_addresses USING spgist (ip_addr inet_ops);
-- Bloom filter index for multiple columns
CREATE INDEX idx_users_bloom ON users USING bloom (status, country, age);
Index Selection Guidelines
-- Analyze query patterns
SELECT
query,
calls,
mean_time,
rows
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;
-- Identify missing indexes
SELECT
relname,
seq_scan,
idx_scan,
CASE
WHEN seq_scan > 100 AND idx_scan = 0 THEN 'MISSING INDEX'
WHEN seq_scan > idx_scan THEN 'NEEDS INDEX'
ELSE 'OK'
END AS status
FROM pg_stat_user_tables
ORDER BY seq_scan DESC;
βΉοΈ
Rule of Thumb: If a table scan returns more than 5-10% of rows, the optimizer may choose a sequential scan over an index scan. Partial indexes can help by reducing the index size.
Follow-Up Questions
- When would you choose a GIN index over a B-tree index?
- How do partial indexes improve query performance?
- Explain the difference between GiST and SP-GiST indexes.
- How do you determine the optimal column order for composite indexes?
- What's the impact of index maintenance on write performance?
- How do you handle indexes on partitioned tables?