Query Optimization Deep Dive

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

EXPLAIN ANALYZE

-- PostgreSQL execution plan analysis
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT
  d.department_name,
  COUNT(e.employee_id) AS emp_count,
  AVG(e.salary) AS avg_salary
FROM departments d
INNER JOIN employees e ON d.department_id = e.department_id
WHERE e.status = 'active'
GROUP BY d.department_name
ORDER BY avg_salary DESC;

ℹ️

Key Insight: Always use EXPLAIN ANALYZE (not just EXPLAIN) to see actual execution times. The BUFFERS option shows I/O usage. Look for sequential scans on large tables as optimization targets.

Index-Only Scans

-- Create covering index for index-only scan
CREATE INDEX idx_emp_dept_salary ON employees (
  department_id, salary
) INCLUDE (employee_id, status);

-- Verify index usage
EXPLAIN (ANALYZE)
SELECT employee_id, salary
FROM employees
WHERE department_id = 5 AND status = 'active';

Join Optimization

-- Optimize join order
-- Bad: Large table driving small table
SELECT *
FROM transactions t
INNER JOIN users u ON t.user_id = u.user_id
WHERE t.amount > 100;

-- Good: Use CTE to filter first
WITH filtered_transactions AS (
  SELECT user_id, amount
  FROM transactions
  WHERE amount > 100
)
SELECT *
FROM filtered_transactions ft
INNER JOIN users u ON ft.user_id = u.user_id;

Subquery vs JOIN Performance

-- Slow: Correlated subquery
SELECT
  department_id,
  (SELECT AVG(salary) FROM employees e2
   WHERE e2.department_id = e1.department_id) AS avg_salary
FROM employees e1;

-- Fast: Window function
SELECT DISTINCT
  department_id,
  AVG(salary) OVER (PARTITION BY department_id) AS avg_salary
FROM employees;

-- Fast: JOIN with aggregation
SELECT
  d.department_id,
  AVG(e.salary) AS avg_salary
FROM departments d
INNER JOIN employees e ON d.department_id = e.department_id
GROUP BY d.department_id;

Predicate Pushdown

-- Push filters as close to source as possible
-- Bad: Filter after aggregation
SELECT *
FROM (
  SELECT department_id, SUM(salary) AS total
  FROM employees
  GROUP BY department_id
) sub
WHERE total > 1000000;

-- Good: Filter in subquery
SELECT department_id, SUM(salary) AS total
FROM employees
GROUP BY department_id
HAVING SUM(salary) > 1000000;

⚠️

Optimization Tip: Use CTEs to pre-filter data before joining. This reduces the number of rows participating in expensive operations like joins and sorts.

Materialized CTEs

-- Force CTE materialization in PostgreSQL 12+
WITH active_users AS MATERIALIZED (
  SELECT user_id, username, email
  FROM users
  WHERE status = 'active'
    AND last_login > CURRENT_DATE - INTERVAL '30' DAY
)
SELECT
  au.user_id,
  COUNT(o.order_id) AS order_count
FROM active_users au
LEFT JOIN orders o ON au.user_id = o.user_id
GROUP BY au.user_id;

Window Function Optimization

-- Optimize window functions by limiting frame size
-- Bad: Full partition scan
SELECT
  employee_id,
  SUM(salary) OVER (
    PARTITION BY department_id
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS running_total
FROM employees;

-- Better: Fixed window when appropriate
SELECT
  employee_id,
  SUM(salary) OVER (
    PARTITION BY department_id
    ORDER BY hire_date
    ROWS BETWEEN 11 PRECEDING AND CURRENT ROW
  ) AS rolling_12_month
FROM employees;

Index Strategy for Queries

-- Composite index for multi-predicate queries
CREATE INDEX idx_orders_status_date_amount ON orders (
  status, order_date, amount
);

-- Partial index for selective queries
CREATE INDEX idx_active_users ON users (email)
WHERE status = 'active';

-- Expression index for function-based queries
CREATE INDEX idx_users_lower_email ON users (LOWER(email));

Avoiding SELECT *

-- Bad: Fetches all columns
SELECT * FROM orders WHERE status = 'completed';

-- Good: Only needed columns
SELECT order_id, order_date, total_amount
FROM orders
WHERE status = 'completed';

UNION vs UNION ALL

-- Bad: Unnecessary DISTINCT operation
SELECT user_id FROM orders_2023
UNION
SELECT user_id FROM orders_2024;

-- Good: If duplicates are acceptable or impossible
SELECT user_id FROM orders_2023
UNION ALL
SELECT user_id FROM orders_2024;

Query Rewriting Examples

-- IN vs EXISTS
-- Often slower
SELECT * FROM users
WHERE user_id IN (SELECT user_id FROM orders);

-- Often faster
SELECT * FROM users u
WHERE EXISTS (
  SELECT 1 FROM orders o WHERE o.user_id = u.user_id
);

-- NOT IN vs NOT EXISTS
-- Handle NULLs better
SELECT * FROM users u
WHERE NOT EXISTS (
  SELECT 1 FROM orders o WHERE o.user_id = u.user_id
);

Batch Processing

-- Process large updates in batches
WITH batch AS (
  SELECT order_id
  FROM orders
  WHERE status = 'pending'
  LIMIT 1000
  FOR UPDATE SKIP LOCKED
)
UPDATE orders
SET status = 'processing'
WHERE order_id IN (SELECT order_id FROM batch);

Statistics and Planning

-- Update table statistics
ANALYZE employees;

-- Check table bloat
SELECT
  pg_size_pretty(pg_total_relation_size('employees')) AS total_size,
  pg_size_pretty(pg_relation_size('employees')) AS table_size,
  pg_size_pretty(pg_indexes_size('employees')) AS index_size;

-- Check index usage
SELECT
  indexrelname,
  idx_scan,
  idx_tup_read,
  idx_tup_fetch
FROM pg_stat_user_indexes
WHERE relname = 'employees'
ORDER BY idx_scan DESC;

Follow-Up Questions

How do you identify slow queries in a production environment?
What's the difference between a table scan and an index scan?
How do you optimize queries that use multiple OR conditions?
Explain the concept of query plan caching.
How do you handle query optimization for partitioned tables?
What's the impact of table statistics on query planning?