Query Optimization Deep Dive
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
EXPLAIN ANALYZE
-- PostgreSQL execution plan analysis
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT
d.department_name,
COUNT(e.employee_id) AS emp_count,
AVG(e.salary) AS avg_salary
FROM departments d
INNER JOIN employees e ON d.department_id = e.department_id
WHERE e.status = 'active'
GROUP BY d.department_name
ORDER BY avg_salary DESC;
βΉοΈ
Key Insight: Always use EXPLAIN ANALYZE (not just EXPLAIN) to see actual execution times. The BUFFERS option shows I/O usage. Look for sequential scans on large tables as optimization targets.
Index-Only Scans
-- Create covering index for index-only scan
CREATE INDEX idx_emp_dept_salary ON employees (
department_id, salary
) INCLUDE (employee_id, status);
-- Verify index usage
EXPLAIN (ANALYZE)
SELECT employee_id, salary
FROM employees
WHERE department_id = 5 AND status = 'active';
Join Optimization
-- Optimize join order
-- Bad: Large table driving small table
SELECT *
FROM transactions t
INNER JOIN users u ON t.user_id = u.user_id
WHERE t.amount > 100;
-- Good: Use CTE to filter first
WITH filtered_transactions AS (
SELECT user_id, amount
FROM transactions
WHERE amount > 100
)
SELECT *
FROM filtered_transactions ft
INNER JOIN users u ON ft.user_id = u.user_id;
Subquery vs JOIN Performance
-- Slow: Correlated subquery
SELECT
department_id,
(SELECT AVG(salary) FROM employees e2
WHERE e2.department_id = e1.department_id) AS avg_salary
FROM employees e1;
-- Fast: Window function
SELECT DISTINCT
department_id,
AVG(salary) OVER (PARTITION BY department_id) AS avg_salary
FROM employees;
-- Fast: JOIN with aggregation
SELECT
d.department_id,
AVG(e.salary) AS avg_salary
FROM departments d
INNER JOIN employees e ON d.department_id = e.department_id
GROUP BY d.department_id;
Predicate Pushdown
-- Push filters as close to source as possible
-- Bad: Filter after aggregation
SELECT *
FROM (
SELECT department_id, SUM(salary) AS total
FROM employees
GROUP BY department_id
) sub
WHERE total > 1000000;
-- Good: Filter in subquery
SELECT department_id, SUM(salary) AS total
FROM employees
GROUP BY department_id
HAVING SUM(salary) > 1000000;
β οΈ
Optimization Tip: Use CTEs to pre-filter data before joining. This reduces the number of rows participating in expensive operations like joins and sorts.
Materialized CTEs
-- Force CTE materialization in PostgreSQL 12+
WITH active_users AS MATERIALIZED (
SELECT user_id, username, email
FROM users
WHERE status = 'active'
AND last_login > CURRENT_DATE - INTERVAL '30' DAY
)
SELECT
au.user_id,
COUNT(o.order_id) AS order_count
FROM active_users au
LEFT JOIN orders o ON au.user_id = o.user_id
GROUP BY au.user_id;
Window Function Optimization
-- Optimize window functions by limiting frame size
-- Bad: Full partition scan
SELECT
employee_id,
SUM(salary) OVER (
PARTITION BY department_id
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS running_total
FROM employees;
-- Better: Fixed window when appropriate
SELECT
employee_id,
SUM(salary) OVER (
PARTITION BY department_id
ORDER BY hire_date
ROWS BETWEEN 11 PRECEDING AND CURRENT ROW
) AS rolling_12_month
FROM employees;
Index Strategy for Queries
-- Composite index for multi-predicate queries
CREATE INDEX idx_orders_status_date_amount ON orders (
status, order_date, amount
);
-- Partial index for selective queries
CREATE INDEX idx_active_users ON users (email)
WHERE status = 'active';
-- Expression index for function-based queries
CREATE INDEX idx_users_lower_email ON users (LOWER(email));
Avoiding SELECT *
-- Bad: Fetches all columns
SELECT * FROM orders WHERE status = 'completed';
-- Good: Only needed columns
SELECT order_id, order_date, total_amount
FROM orders
WHERE status = 'completed';
UNION vs UNION ALL
-- Bad: Unnecessary DISTINCT operation
SELECT user_id FROM orders_2023
UNION
SELECT user_id FROM orders_2024;
-- Good: If duplicates are acceptable or impossible
SELECT user_id FROM orders_2023
UNION ALL
SELECT user_id FROM orders_2024;
Query Rewriting Examples
-- IN vs EXISTS
-- Often slower
SELECT * FROM users
WHERE user_id IN (SELECT user_id FROM orders);
-- Often faster
SELECT * FROM users u
WHERE EXISTS (
SELECT 1 FROM orders o WHERE o.user_id = u.user_id
);
-- NOT IN vs NOT EXISTS
-- Handle NULLs better
SELECT * FROM users u
WHERE NOT EXISTS (
SELECT 1 FROM orders o WHERE o.user_id = u.user_id
);
Batch Processing
-- Process large updates in batches
WITH batch AS (
SELECT order_id
FROM orders
WHERE status = 'pending'
LIMIT 1000
FOR UPDATE SKIP LOCKED
)
UPDATE orders
SET status = 'processing'
WHERE order_id IN (SELECT order_id FROM batch);
Statistics and Planning
-- Update table statistics
ANALYZE employees;
-- Check table bloat
SELECT
pg_size_pretty(pg_total_relation_size('employees')) AS total_size,
pg_size_pretty(pg_relation_size('employees')) AS table_size,
pg_size_pretty(pg_indexes_size('employees')) AS index_size;
-- Check index usage
SELECT
indexrelname,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
WHERE relname = 'employees'
ORDER BY idx_scan DESC;
Follow-Up Questions
- How do you identify slow queries in a production environment?
- What's the difference between a table scan and an index scan?
- How do you optimize queries that use multiple OR conditions?
- Explain the concept of query plan caching.
- How do you handle query optimization for partitioned tables?
- What's the impact of table statistics on query planning?