Query Statistics & Execution Plans
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
EXPLAIN ANALYZE Deep Dive
-- Full execution plan analysis
EXPLAIN (
ANALYZE,
BUFFERS,
COSTS,
TIMING,
VERBOSE,
FORMAT JSON
)
SELECT
d.department_name,
COUNT(e.employee_id) AS emp_count
FROM departments d
INNER JOIN employees e ON d.department_id = e.department_id
GROUP BY d.department_name;
βΉοΈ
Key Insight: The Buffers option shows shared hits (from cache), shared reads (from disk), and temp usage. High shared reads indicates the query is I/O bound.
Reading Execution Plans
-- Sequential Scan (full table scan)
EXPLAIN SELECT * FROM large_table WHERE column = 'value';
-- Index Scan
EXPLAIN SELECT * FROM users WHERE user_id = 123;
-- Bitmap Index Scan
EXPLAIN SELECT * FROM orders WHERE customer_id = 123;
-- Nested Loop Join
EXPLAIN SELECT * FROM a INNER JOIN b ON a.id = b.a_id;
-- Hash Join
EXPLAIN SELECT * FROM large_a INNER JOIN large_b ON a.id = b.a_id;
-- Merge Join
EXPLAIN SELECT * FROM sorted_a INNER JOIN sorted_b ON a.id = b.a_id;
pg_stat_statements
-- Enable pg_stat_statements
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- Top queries by total time
SELECT
query,
calls,
total_time,
mean_time,
rows
FROM pg_stat_statements
ORDER BY total_time DESC
LIMIT 10;
-- Queries with highest I/O
SELECT
query,
shared_blks_read,
shared_blks_hit,
ROUND(
shared_blks_hit * 100.0 /
NULLIF(shared_blks_hit + shared_blks_read, 0),
2
) AS cache_hit_ratio
FROM pg_stat_statements
ORDER BY shared_blks_read DESC
LIMIT 10;
Index Usage Statistics
-- Check index usage
SELECT
schemaname,
tablename,
indexname,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY idx_scan DESC;
-- Identify unused indexes
SELECT
indexrelname,
idx_scan,
pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE '%pkey%'
ORDER BY pg_relation_size(indexrelid) DESC;
Table Statistics
-- Table bloat analysis
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
pg_size_pretty(pg_indexes_size(schemaname||'.'||tablename)) AS index_size,
n_live_tup,
n_dead_tup,
ROUND(n_dead_tup * 100.0 / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct
FROM pg_stat_user_tables
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
β οΈ
Warning: High dead tuple percentage (>10%) indicates vacuum is falling behind. Consider running VACUUM VERBOSE tablename; manually.
Query Plan Costs
-- Understanding cost estimates
EXPLAIN (COSTS, VERBOSE)
SELECT * FROM orders
WHERE customer_id = 123
AND order_date > '2023-01-01';
-- Compare estimated vs actual rows
EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM orders
WHERE status = 'completed'
LIMIT 100;
BigQuery Execution Details
-- BigQuery job statistics
SELECT
job_id,
creation_time,
end_time,
TIMESTAMP_DIFF(end_time, creation_time, SECOND) AS duration_seconds,
total_bytes_processed,
total_bytes_billed,
ROUND(total_bytes_processed / POWER(1024, 3), 2) AS processed_gb,
cache_hit
FROM `project.dataset.INFORMATION_SCHEMA.JOBS`
WHERE query LIKE '%SELECT%'
ORDER BY creation_time DESC
LIMIT 10;
Execution Plan Operators
-- Seq Scan vs Index Scan
EXPLAIN SELECT * FROM employees WHERE department_id = 5;
-- Sort operations
EXPLAIN SELECT * FROM employees ORDER BY salary DESC LIMIT 10;
-- Hash Aggregate
EXPLAIN SELECT department_id, AVG(salary) FROM employees GROUP BY department_id;
-- Window functions
EXPLAIN SELECT
employee_id,
ROW_NUMBER() OVER (ORDER BY salary DESC) AS rank
FROM employees;
Performance Baseline
-- Create performance baseline
CREATE TABLE query_baselines (
query_id SERIAL PRIMARY KEY,
query_text TEXT,
avg_duration_ms FLOAT,
avg_rows_returned INT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Record baseline
INSERT INTO query_baselines (query_text, avg_duration_ms, avg_rows_returned)
SELECT
'SELECT * FROM orders WHERE status = $1',
AVG(mean_time),
AVG(rows)
FROM pg_stat_statements
WHERE query LIKE '%orders%status%';
Lock Monitoring
-- Check current locks
SELECT
l.pid,
l.mode,
l.granted,
a.query,
a.query_start,
NOW() - a.query_start AS duration
FROM pg_locks l
INNER JOIN pg_stat_activity a ON l.pid = a.pid
WHERE NOT l.granted
ORDER BY a.query_start;
-- Check for long-running queries
SELECT
pid,
query,
state,
query_start,
NOW() - query_start AS duration
FROM pg_stat_activity
WHERE state = 'active'
AND query_start < NOW() - INTERVAL '5 minutes'
ORDER BY query_start;
Connection Statistics
-- Connection pool statistics
SELECT
datname,
usename,
client_addr,
state,
COUNT(*) AS count
FROM pg_stat_activity
GROUP BY datname, usename, client_addr, state
ORDER BY count DESC;
-- Check for connection leaks
SELECT
datname,
state,
COUNT(*) AS count
FROM pg_stat_activity
GROUP BY datname, state
ORDER BY count DESC;
Vacuum and Analyze
-- Manual vacuum with statistics update
VACUUM (ANALYZE, VERBOSE) employees;
-- Check vacuum progress
SELECT
relname,
phase,
heap_blks_total,
heap_blks_scanned,
heap_blks_vacuumed
FROM pg_stat_progress_vacuum;
-- Update table statistics
ANALYZE employees;
-- Check statistics target
ALTER TABLE employees ALTER COLUMN salary SET STATISTICS 1000;
Follow-Up Questions
- How do you interpret a query plan's cost estimates?
- What's the difference between a bitmap scan and an index scan?
- How do you identify queries that need optimization?
- Explain the impact of table statistics on query planning.
- How do you monitor query performance in real-time?
- What's the best approach for setting performance baselines?