Advanced Indexing Strategies

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

B-Tree Index Fundamentals

-- Standard B-tree index
CREATE INDEX idx_users_email ON users (email);

-- Composite B-tree index
CREATE INDEX idx_orders_customer_date ON orders (customer_id, order_date DESC);

-- Unique constraint (creates unique B-tree index)
ALTER TABLE users ADD CONSTRAINT uk_users_email UNIQUE (email);

ℹ️

Key Insight: B-tree indexes support equality and range queries. The column order in composite indexes matters: put equality columns first, then range columns, then sorting columns.

Partial Indexes

-- Index only active users
CREATE INDEX idx_active_users ON users (email)
WHERE status = 'active';

-- Index recent orders
CREATE INDEX idx_recent_orders ON orders (customer_id, order_date)
WHERE order_date > CURRENT_DATE - INTERVAL '90' DAY;

-- Index pending tasks
CREATE INDEX idx_pending_tasks ON tasks (assigned_to, priority)
WHERE status = 'pending';

GIN Index for Arrays and JSON

-- GIN index for array containment
CREATE INDEX idx_user_tags ON users USING GIN (tags);

-- GIN index for JSONB
CREATE INDEX idx_user_data ON users USING GIN (data);

-- GIN index with specific operator class
CREATE INDEX idx_user_data_path ON users USING GIN (data jsonb_path_ops);

GiST Index for Geospatial

-- GiST index for geometric data
CREATE INDEX idx_locations ON stores USING GiST (
  ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)
);

-- GiST index for range types
CREATE INDEX idx_booking_dates ON bookings USING GiST (
  tsrange(check_in, check_out)
);

Expression Indexes

-- Index on function result
CREATE INDEX idx_users_lower_email ON users (LOWER(email));

-- Index on computed value
CREATE INDEX idx_orders_total ON orders (
  (quantity * unit_price * (1 - discount))
);

-- Index on date extraction
CREATE INDEX idx_orders_month ON orders (EXTRACT(MONTH FROM order_date));

⚠️

Performance Tip: Expression indexes are only used when the query uses the exact same expression. Make sure your query matches the index definition precisely.

Covering Indexes

-- Covering index with INCLUDE
CREATE INDEX idx_orders_covering ON orders (customer_id, order_date)
INCLUDE (total_amount, status);

-- Verify index-only scan
EXPLAIN (ANALYZE)
SELECT order_date, total_amount, status
FROM orders
WHERE customer_id = 123;

BRIN Index for Large Tables

-- Block Range Index for time-series data
CREATE INDEX idx_events_timestamp ON events USING BRIN (event_time);

-- BRIN with custom page range
CREATE INDEX idx_logs_time ON logs USING BRIN (created_at)
WITH (pages_per_range = 32);

Multi-Column Index Strategies

-- Optimal column order
-- 1. Equality predicates
-- 2. Range predicates
-- 3. Sort columns
-- 4. SELECT list columns (INCLUDE)

CREATE INDEX idx_optimal ON orders (
  status,           -- Equality
  order_date,       -- Range
  customer_id       -- Sort
) INCLUDE (total_amount);

Index Maintenance

-- Check index size
SELECT
  indexname,
  pg_size_pretty(pg_relation_size(indexname::regclass)) AS size
FROM pg_indexes
WHERE tablename = 'employees';

-- Rebuild bloated indexes
REINDEX INDEX idx_users_email;

-- Check index usage statistics
SELECT
  indexrelname,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch,
  pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE relname = 'employees'
ORDER BY idx_scan DESC;

Concurrent Index Creation

-- Create index without locking table
CREATE INDEX CONCURRENTLY idx_large_table ON large_table (column_name);

-- Check for invalid indexes
SELECT
  indexrelname,
  indisvalid,
  indisready
FROM pg_stat_user_indexes
INNER JOIN pg_index ON indexrelid = pg_stat_user_indexes.indexrelid
WHERE NOT indisvalid;

Index-Only Scan Optimization

-- Ensure visibility map is up to date
VACUUM employees;

-- Check for index-only scans
EXPLAIN (ANALYZE, BUFFERS)
SELECT department_id, COUNT(*)
FROM employees
GROUP BY department_id;

Specialized Index Types

-- Hash index for equality-only queries
CREATE INDEX idx_users_email_hash ON users USING HASH (email);

-- SP-GiST index for partitioned data
CREATE INDEX idx_ip_addresses ON ip_addresses USING spgist (ip_addr inet_ops);

-- Bloom filter index for multiple columns
CREATE INDEX idx_users_bloom ON users USING bloom (status, country, age);

Index Selection Guidelines

-- Analyze query patterns
SELECT
  query,
  calls,
  mean_time,
  rows
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;

-- Identify missing indexes
SELECT
  relname,
  seq_scan,
  idx_scan,
  CASE
    WHEN seq_scan > 100 AND idx_scan = 0 THEN 'MISSING INDEX'
    WHEN seq_scan > idx_scan THEN 'NEEDS INDEX'
    ELSE 'OK'
  END AS status
FROM pg_stat_user_tables
ORDER BY seq_scan DESC;

ℹ️

Rule of Thumb: If a table scan returns more than 5-10% of rows, the optimizer may choose a sequential scan over an index scan. Partial indexes can help by reducing the index size.

Follow-Up Questions

When would you choose a GIN index over a B-tree index?
How do partial indexes improve query performance?
Explain the difference between GiST and SP-GiST indexes.
How do you determine the optimal column order for composite indexes?
What's the impact of index maintenance on write performance?
How do you handle indexes on partitioned tables?