Gap & Island Problems
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
Classic Gap Problem
-- Find missing dates in a date series
WITH date_range AS (
SELECT generate_series(
MIN(sale_date),
MAX(sale_date),
'1 day'::INTERVAL
)::DATE AS expected_date
FROM sales
)
SELECT
dr.expected_date AS missing_date
FROM date_range dr
LEFT JOIN sales s ON dr.expected_date = s.sale_date
WHERE s.sale_date IS NULL
ORDER BY dr.expected_date;
βΉοΈ
Key Insight: The gap-island pattern uses ROW_NUMBER() to identify consecutive sequences. When you subtract the row number from the value, consecutive values produce the same result, creating "islands."
Island Detection with ROW_NUMBER
-- Find consecutive days of user activity
WITH numbered AS (
SELECT
user_id,
activity_date,
activity_date - ROW_NUMBER() OVER (
PARTITION BY user_id ORDER BY activity_date
)::INT AS island_id
FROM user_activity
)
SELECT
user_id,
MIN(activity_date) AS streak_start,
MAX(activity_date) AS streak_end,
COUNT(*) AS streak_length,
MAX(activity_date) - MIN(activity_date) + 1 AS days_span
FROM numbered
GROUP BY user_id, island_id
HAVING COUNT(*) >= 3 -- At least 3 consecutive days
ORDER BY user_id, streak_start;
Gap Detection in Sequential Data
-- Find gaps in order sequences
WITH numbered_orders AS (
SELECT
order_id,
order_date,
LAG(order_date) OVER (ORDER BY order_date) AS prev_date
FROM orders
)
SELECT
prev_date AS gap_end,
order_date AS gap_start,
order_date - prev_date - 1 AS gap_days
FROM numbered_orders
WHERE order_date - prev_date > 1
ORDER BY prev_date;
Consecutive Value Islands
-- Find consecutive identical values
WITH numbered AS (
SELECT
product_id,
sale_date,
revenue,
sale_date - ROW_NUMBER() OVER (
PARTITION BY product_id ORDER BY sale_date
)::INT AS island_id
FROM daily_sales
)
SELECT
product_id,
MIN(sale_date) AS period_start,
MAX(sale_date) AS period_end,
revenue AS daily_revenue,
COUNT(*) AS consecutive_days,
SUM(revenue) AS total_revenue
FROM numbered
GROUP BY product_id, island_id, revenue
HAVING COUNT(*) >= 5
ORDER BY product_id, period_start;
β οΈ
Common Mistake: When working with dates, remember that subtracting ROW_NUMBER from a DATE produces an INTEGER, not a DATE. Cast appropriately when using this technique.
Multi-Column Gap Detection
-- Find gaps across multiple dimensions
WITH numbered AS (
SELECT
department_id,
employee_id,
hire_date,
hire_date - ROW_NUMBER() OVER (
PARTITION BY department_id, employee_id
ORDER BY hire_date
)::INT AS island_id
FROM employee_history
)
SELECT
department_id,
employee_id,
MIN(hire_date) AS period_start,
MAX(hire_date) AS period_end,
COUNT(*) AS consecutive_periods
FROM numbered
GROUP BY department_id, employee_id, island_id
HAVING COUNT(*) >= 2;
Range-Based Islands
-- Group overlapping or adjacent ranges
WITH ordered_ranges AS (
SELECT
room_id,
start_time,
end_time,
ROW_NUMBER() OVER (ORDER BY start_time) AS rn
FROM bookings
),
merged AS (
SELECT
room_id,
start_time,
end_time,
start_time AS group_start,
end_time AS group_end
FROM ordered_ranges
WHERE rn = 1
UNION ALL
SELECT
m.room_id,
o.start_time,
o.end_time,
CASE
WHEN o.start_time <= m.group_end + INTERVAL '30 minutes'
THEN m.group_start
ELSE o.start_time
END,
CASE
WHEN o.end_time > m.group_end
THEN o.end_time
ELSE m.group_end
END
FROM merged m
INNER JOIN ordered_ranges o
ON o.room_id = m.room_id
AND o.rn = (
SELECT MIN(rn) FROM ordered_ranges
WHERE room_id = m.room_id AND rn > (
SELECT MAX(rn) FROM ordered_ranges
WHERE room_id = m.room_id AND start_time <= m.group_end
)
)
)
SELECT
room_id,
MIN(group_start) AS block_start,
MAX(group_end) AS block_end,
MAX(group_end) - MIN(group_start) AS total_duration
FROM merged
GROUP BY room_id;
Missing Consecutive Numbers
-- Find missing consecutive IDs (like missing seats)
WITH numbered AS (
SELECT
seat_number,
seat_number - ROW_NUMBER() OVER (ORDER BY seat_number) AS gap_id
FROM occupied_seats
)
SELECT
MIN(seat_number) + 1 AS gap_start,
MAX(seat_number) - 1 AS gap_end,
MAX(seat_number) - MIN(seat_number) - 1 AS gap_size
FROM numbered
GROUP BY gap_id
HAVING MAX(seat_number) - MIN(seat_number) > 1;
Temporal Gap Detection
-- Find gaps in time series data
WITH time_gaps AS (
SELECT
sensor_id,
reading_time,
LAG(reading_time) OVER (
PARTITION BY sensor_id ORDER BY reading_time
) AS prev_reading,
reading_time - LAG(reading_time) OVER (
PARTITION BY sensor_id ORDER BY reading_time
) AS time_diff
FROM sensor_readings
)
SELECT
sensor_id,
prev_reading AS gap_start,
reading_time AS gap_end,
time_diff AS gap_duration
FROM time_gaps
WHERE time_diff > INTERVAL '5 minutes'
ORDER BY sensor_id, gap_start;
Complex Gap Filling
-- Fill gaps with interpolated values
WITH RECURSIVE date_series AS (
SELECT
MIN(sale_date) AS date,
MAX(sale_date) AS end_date
FROM sales
WHERE product_id = 1
UNION ALL
SELECT
date + 1,
end_date
FROM date_series
WHERE date < end_date
)
SELECT
ds.date,
COALESCE(s.revenue, 0) AS actual_revenue,
-- Linear interpolation
CASE
WHEN s.revenue IS NULL THEN
(SELECT AVG(revenue)
FROM sales
WHERE product_id = 1
AND sale_date BETWEEN ds.date - 7 AND ds.date + 7)
ELSE s.revenue
END AS interpolated_revenue
FROM date_series ds
LEFT JOIN sales s ON ds.date = s.sale_date AND s.product_id = 1
ORDER BY ds.date;
Gap Analysis with Statistics
-- Analyze gap patterns
WITH gaps AS (
SELECT
user_id,
activity_date,
LAG(activity_date) OVER (
PARTITION BY user_id ORDER BY activity_date
) AS prev_activity
FROM user_activity
)
SELECT
user_id,
COUNT(*) AS total_gaps,
AVG(activity_date - prev_activity) AS avg_gap_days,
MAX(activity_date - prev_activity) AS max_gap_days,
MIN(activity_date - prev_activity) AS min_gap_days,
STDDEV(activity_date - prev_activity) AS stddev_gap_days
FROM gaps
WHERE prev_activity IS NOT NULL
AND activity_date - prev_activity > 1
GROUP BY user_id
HAVING COUNT(*) > 5
ORDER BY avg_gap_days DESC;
βΉοΈ
Optimization Tip: For large datasets, create an index on the date column and consider using materialized views for frequently accessed gap analysis queries.
Follow-Up Questions
- How would you find gaps longer than a specific duration?
- What's the most efficient way to fill gaps in a time series?
- How do you handle gaps across multiple columns simultaneously?
- Explain how to detect overlapping ranges and merge them.
- How would you implement a sliding window gap detection?
- What's the best approach for gap analysis in partitioned tables?