π¦ Table Partitioning
Uber & Netflix Interview Deep Dive
π Interview Question
βΉοΈπ΄ Uber/Netflix Interview Question
"Design a partitioning strategy for a ride-sharing events table with 10 billion rows. Consider: 1) Range vs hash partitioning trade-offs, 2) Partition pruning optimization, 3) Partition maintenance and archival, 4) How to handle hot partitions during peak hours."
Companies: Uber, Netflix | Difficulty: Hard | Time: 40 minutes
π Setup: Large Events Table
-- Create a large events table (simulating ride-sharing data)
CREATE TABLE ride_events (
event_id BIGSERIAL,
ride_id UUID,
driver_id INT,
rider_id INT,
event_type VARCHAR(50), -- 'request', 'accept', 'pickup', 'dropoff', 'cancel'
event_timestamp TIMESTAMP,
pickup_lat DECIMAL(10, 7),
pickup_lon DECIMAL(10, 7),
dropoff_lat DECIMAL(10, 7),
dropoff_lon DECIMAL(10, 7),
fare_amount DECIMAL(10, 2),
distance_miles DECIMAL(8, 2),
duration_minutes INT,
city VARCHAR(100),
state VARCHAR(50),
PRIMARY KEY (event_id, event_timestamp)
) PARTITION BY RANGE (event_timestamp);
π Part 1: Range Partitioning
βΉοΈπ Range Partitioning
Range partitioning divides data by a range of values (e.g., dates). Ideal for:
- Time-series data
- Data with natural ordering
- Queries that filter by time ranges
Monthly Partitions
-- Create monthly partitions
CREATE TABLE ride_events_2023_01 PARTITION OF ride_events
FOR VALUES FROM ('2023-01-01') TO ('2023-02-01');
CREATE TABLE ride_events_2023_02 PARTITION OF ride_events
FOR VALUES FROM ('2023-02-01') TO ('2023-03-01');
CREATE TABLE ride_events_2023_03 PARTITION OF ride_events
FOR VALUES FROM ('2023-03-01') TO ('2023-04-01');
-- Create partitions for entire year
DO $$
DECLARE
month_date DATE;
next_month DATE;
partition_name TEXT;
BEGIN
FOR i IN 0..11 LOOP
month_date := DATE '2023-01-01' + (i * INTERVAL '1 month');
next_month := month_date + INTERVAL '1 month';
partition_name := 'ride_events_' || TO_CHAR(month_date, 'YYYY_MM');
EXECUTE FORMAT(
'CREATE TABLE IF NOT EXISTS %I PARTITION OF ride_events
FOR VALUES FROM (%L) TO (%L)',
partition_name,
month_date,
next_month
);
END LOOP;
END $$;
-- Create default partition for unpartitioned data
CREATE TABLE ride_events_default PARTITION OF ride_events DEFAULT;
Partition Pruning
-- Query that benefits from partition pruning
EXPLAIN ANALYZE
SELECT *
FROM ride_events
WHERE event_timestamp >= '2023-06-01'
AND event_timestamp < '2023-07-01';
-- Only scans ride_events_2023_06 partition
-- Output shows: Partition for ride_events_2023_06
-- Query without partition pruning (BAD)
EXPLAIN ANALYZE
SELECT *
FROM ride_events
WHERE EXTRACT(MONTH FROM event_timestamp) = 6;
-- Scans all partitions!
π’ Part 2: List Partitioning
-- List partitioning by city/region
CREATE TABLE ride_events_by_city (
event_id BIGSERIAL,
ride_id UUID,
event_timestamp TIMESTAMP,
city VARCHAR(100),
event_type VARCHAR(50),
fare_amount DECIMAL(10, 2),
PRIMARY KEY (event_id, city)
) PARTITION BY LIST (city);
-- Create partitions for major cities
CREATE TABLE ride_events_nyc PARTITION OF ride_events_by_city
FOR VALUES IN ('New York', 'Brooklyn', 'Queens', 'Bronx');
CREATE TABLE ride_events_sf PARTITION OF ride_events_by_city
FOR VALUES IN ('San Francisco', 'Oakland', 'San Jose');
CREATE TABLE ride_events_chicago PARTITION OF ride_events_by_city
FOR VALUES IN ('Chicago', 'Evanston', 'Naperville');
CREATE TABLE ride_events_la PARTITION OF ride_events_by_city
FOR VALUES IN ('Los Angeles', 'Santa Monica', 'Pasadena');
CREATE TABLE ride_events_seattle PARTITION OF ride_events_by_city
FOR VALUES IN ('Seattle', 'Bellevue', 'Redmond');
-- Default partition for other cities
CREATE TABLE ride_events_other_cities PARTITION OF ride_events_by_city DEFAULT;
π Part 3: Hash Partitioning
-- Hash partitioning for even distribution
CREATE TABLE ride_events_hash (
event_id BIGSERIAL,
ride_id UUID,
driver_id INT,
rider_id INT,
event_timestamp TIMESTAMP,
event_type VARCHAR(50),
fare_amount DECIMAL(10, 2),
PRIMARY KEY (event_id, driver_id)
) PARTITION BY HASH (driver_id);
-- Create 8 hash partitions
CREATE TABLE ride_events_hash_0 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 0);
CREATE TABLE ride_events_hash_1 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 1);
CREATE TABLE ride_events_hash_2 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 2);
CREATE TABLE ride_events_hash_3 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 3);
CREATE TABLE ride_events_hash_4 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 4);
CREATE TABLE ride_events_hash_5 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 5);
CREATE TABLE ride_events_hash_6 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 6);
CREATE TABLE ride_events_hash_7 PARTITION OF ride_events_hash
FOR VALUES WITH (MODULUS 8, REMAINDER 7);
π Part 4: Composite Partitioning
-- Composite: Range + List
CREATE TABLE ride_events_composite (
event_id BIGSERIAL,
ride_id UUID,
event_timestamp TIMESTAMP,
city VARCHAR(100),
event_type VARCHAR(50),
fare_amount DECIMAL(10, 2),
PRIMARY KEY (event_id, event_timestamp, city)
) PARTITION BY RANGE (event_timestamp);
-- Sub-partitions by city for each month
CREATE TABLE ride_events_2023_01_nyc PARTITION OF ride_events_composite
FOR VALUES FROM ('2023-01-01') TO ('2023-02-01')
PARTITION BY LIST (city);
-- This is complex; typically use partition inheritance instead
π§ Part 5: Partition Maintenance
Automatic Partition Creation
-- Procedure to create future partitions
CREATE OR REPLACE PROCEDURE create_monthly_partitions(
p_start_date DATE,
p_months_ahead INT DEFAULT 3
)
LANGUAGE plpgsql
AS $$
DECLARE
month_date DATE;
next_month DATE;
partition_name TEXT;
BEGIN
FOR i IN 0..(p_months_ahead - 1) LOOP
month_date := p_start_date + (i * INTERVAL '1 month');
next_month := month_date + INTERVAL '1 month';
partition_name := 'ride_events_' || TO_CHAR(month_date, 'YYYY_MM');
-- Check if partition exists
IF NOT EXISTS (
SELECT 1 FROM pg_class
WHERE relname = partition_name
) THEN
EXECUTE FORMAT(
'CREATE TABLE %I PARTITION OF ride_events
FOR VALUES FROM (%L) TO (%L)',
partition_name,
month_date,
next_month
);
RAISE NOTICE 'Created partition: %', partition_name;
END IF;
END LOOP;
END;
$$;
-- Create partitions for next 3 months
CALL create_monthly_partitions(CURRENT_DATE, 3);
Partition Archival
-- Archive old partitions
CREATE OR REPLACE PROCEDURE archive_old_partitions(
p_retention_months INT DEFAULT 12
)
LANGUAGE plpgsql
AS $$
DECLARE
partition RECORD;
cutoff_date DATE;
BEGIN
cutoff_date := DATE_TRUNC('month', NOW()) - (p_retention_months || ' months')::INTERVAL;
FOR partition IN
SELECT tablename
FROM pg_tables
WHERE tablename LIKE 'ride_events_%'
AND tablename != 'ride_events_default'
AND tablename < 'ride_events_' || TO_CHAR(cutoff_date, 'YYYY_MM')
LOOP
-- Rename to archive
EXECUTE FORMAT(
'ALTER TABLE IF EXISTS %I RENAME TO %I',
partition.tablename,
partition.tablename || '_archived'
);
RAISE NOTICE 'Archived partition: %', partition.tablename;
END LOOP;
END;
$$;
Partition Statistics
-- View partition information
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
pg_size_pretty(pg_indexes_size(schemaname||'.'||tablename::regclass)) AS index_size
FROM pg_tables
WHERE tablename LIKE 'ride_events_%'
ORDER BY tablename;
-- Check partition bounds
SELECT
nmsp_parent.nspname AS parent_schema,
parent.relname AS parent_table,
nmsp_child.nspname AS child_schema,
child.relname AS child_table,
pg_get_expr(child.relpartbound, child.oid) AS partition_bounds
FROM pg_inherits
JOIN pg_class parent ON pg_inherits.inhparent = parent.oid
JOIN pg_class child ON pg_inherits.inhrelid = child.oid
JOIN pg_namespace nmsp_parent ON parent.relnamespace = nmsp_parent.oid
JOIN pg_namespace nmsp_child ON child.relnamespace = nmsp_child.oid
WHERE parent.relname = 'ride_events';
π Part 6: Performance Analysis
Partition Pruning Verification
-- Verify partition pruning with EXPLAIN
EXPLAIN (ANALYZE, BUFFERS)
SELECT *
FROM ride_events
WHERE event_timestamp >= '2023-06-01'
AND event_timestamp < '2023-07-01'
AND city = 'New York';
-- Should show only relevant partitions being scanned
-- Look for: "Append" with specific partition names
Hot Partition Handling
-- Identify hot partitions (frequent writes)
SELECT
tablename,
n_tup_ins AS inserts,
n_tup_upd AS updates,
n_tup_del AS deletes,
n_live_tup AS live_rows,
n_dead_tup AS dead_rows,
last_vacuum,
last_autovacuum
FROM pg_stat_user_tables
WHERE tablename LIKE 'ride_events_%'
ORDER BY n_tup_ins DESC;
-- Check for lock contention
SELECT
l.locktype,
l.relation::regclass,
l.mode,
l.granted,
a.pid,
a.query
FROM pg_locks l
JOIN pg_stat_activity a ON l.pid = a.pid
WHERE l.relation::regclass::text LIKE 'ride_events_%'
AND NOT l.granted;
π― Quiz Section
π Best Practices for Interviews
π‘β Partitioning Best Practices
1. Choose Partition Key Carefully:
-- Good: Queries always filter by date
PARTITION BY RANGE (event_timestamp)
-- Good: Queries always filter by region
PARTITION BY LIST (region)
-- Avoid: Column rarely used in WHERE
PARTITION BY RANGE (random_column)
2. Ensure Partition Pruning Works:
-- BAD: Prevents pruning
WHERE EXTRACT(MONTH FROM event_timestamp) = 6
-- GOOD: Enables pruning
WHERE event_timestamp >= '2023-06-01'
AND event_timestamp < '2023-07-01'
3. Plan for Growth:
-- Create partitions ahead of time
-- Use procedures to automate creation
-- Monitor partition sizes
4. Archive Old Data:
-- Drop old partitions instead of DELETE
-- Much faster and reclaim space immediately
DROP TABLE ride_events_2022_01;
5. Index Each Partition:
-- Create indexes on the parent table (inherited by partitions)
CREATE INDEX idx_ride_events_timestamp
ON ride_events (event_timestamp);
-- Or create partition-specific indexes
CREATE INDEX idx_ride_events_2023_06_timestamp
ON ride_events_2023_06 (event_timestamp);
6. Monitor Partition Health:
-- Check for skewed partitions
SELECT
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename))
FROM pg_tables
WHERE tablename LIKE 'ride_events_%'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
β οΈβ οΈ Partitioning Pitfalls
- Over-partitioning: Too many small partitions hurt performance
- Under-partitioning: Too few large partitions don't provide benefits
- Wrong partition key: Column not used in queries won't enable pruning
- Missing default partition: Inserts fail for non-matching values
- Forgetting maintenance: Old partitions accumulate and waste space