Interview Question: "Explain the difference between LATERAL JOIN and CROSS APPLY. When would you use GENERATE_SERIES vs recursive CTE? How does UNNEST work with arrays?" — Asked at Amazon, Google, Netflix for Senior Data Engineer roles
ℹ️
Difficulty: Advanced | Companies: Amazon, Google, Netflix, Uber, Stripe | Time: 45-60 minutes
LATERAL Join Fundamentals
LATERAL allows subqueries to reference preceding FROM items:
-- Create sample data
CREATE TABLE departments (
dept_id INT PRIMARY KEY,
dept_name VARCHAR(100)
);
CREATE TABLE employees (
emp_id INT PRIMARY KEY,
name VARCHAR(100),
dept_id INT,
salary DECIMAL(12,2)
);
INSERT INTO departments VALUES
(1, 'Engineering'),
(2, 'Marketing'),
(3, 'Sales');
INSERT INTO employees VALUES
(1, 'Alice', 1, 120000),
(2, 'Bob', 1, 110000),
(3, 'Charlie', 1, 100000),
(4, 'Diana', 2, 95000),
(5, 'Eve', 2, 90000),
(6, 'Frank', 3, 85000),
(7, 'Grace', 3, 80000),
(8, 'Heidi', 3, 75000);
-- LATERAL: Get top 2 employees per department
SELECT
d.dept_name,
e.name,
e.salary
FROM departments d
CROSS JOIN LATERAL (
SELECT name, salary
FROM employees
WHERE dept_id = d.dept_id
ORDER BY salary DESC
LIMIT 2
) e
ORDER BY d.dept_name, e.salary DESC;
-- LATERAL with array
SELECT
d.dept_name,
e.top_employees
FROM departments d,
LATERAL (
SELECT ARRAY_AGG(name ORDER BY salary DESC) AS top_employees
FROM employees
WHERE dept_id = d.dept_id
) e;
Output:
| dept_name | name | salary |
|---|---|---|
| Engineering | Alice | 120000 |
| Engineering | Bob | 110000 |
| Marketing | Diana | 95000 |
| Marketing | Eve | 90000 |
| Sales | Frank | 85000 |
| Sales | Grace | 80000 |
CROSS APPLY (SQL Server)
-- SQL Server syntax (PostgreSQL uses LATERAL)
SELECT
d.dept_name,
e.name,
e.salary
FROM departments d
CROSS APPLY (
SELECT TOP 2 name, salary
FROM employees
WHERE dept_id = d.dept_id
ORDER BY salary DESC
) e;
-- OUTER APPLY (like LEFT JOIN LATERAL)
SELECT
d.dept_name,
e.name,
e.salary
FROM departments d
OUTER APPLY (
SELECT TOP 1 name, salary
FROM employees
WHERE dept_id = d.dept_id
AND salary > 100000
) e;
UNNEST with Arrays
-- Create table with array column
CREATE TABLE user_preferences (
user_id INT PRIMARY KEY,
name VARCHAR(100),
interests TEXT[],
scores INT[]
);
INSERT INTO user_preferences VALUES
(1, 'Alice', ARRAY['hiking', 'reading', 'coding'], ARRAY[90, 85, 95]),
(2, 'Bob', ARRAY['gaming', 'cooking'], ARRAY[80, 70]),
(3, 'Charlie', ARRAY['travel', 'photography', 'music', 'sports'], ARRAY[88, 75, 92, 85]);
-- UNNEST array to rows
SELECT
name,
UNNEST(interests) AS interest
FROM user_preferences
ORDER BY name, interest;
-- UNNEST with ordinality
SELECT
name,
interest,
position
FROM user_preferences,
LATERAL UNNEST(interests) WITH ORDINALITY AS t(interest, position)
ORDER BY name, position;
-- UNNEST multiple arrays simultaneously
SELECT
name,
interest,
score
FROM user_preferences,
LATERAL UNNEST(interests, scores) AS t(interest, score)
ORDER BY name, score DESC;
Output:
| name | interest | score |
|---|---|---|
| Alice | coding | 95 |
| Alice | hiking | 90 |
| Alice | reading | 85 |
| Charlie | music | 92 |
| Charlie | travel | 88 |
| Charlie | sports | 85 |
| Charlie | photography | 75 |
| Bob | gaming | 80 |
| Bob | cooking | 70 |
GENERATE_SERIES Patterns
-- Generate date series
SELECT
generate_series(
'2024-01-01'::date,
'2024-12-31'::date,
'1 day'::interval
)::date AS date_series;
-- Generate number series with step
SELECT
generate_series(1, 100, 5) AS number_series;
-- Fill gaps in time series
WITH date_series AS (
SELECT generate_series(
'2024-01-01'::date,
'2024-01-10'::date,
'1 day'::interval
)::date AS sale_date
)
SELECT
ds.sale_date,
COALESCE(s.amount, 0) AS amount
FROM date_series ds
LEFT JOIN sales s ON ds.sale_date = s.sale_date
ORDER BY ds.sale_date;
-- Generate hourly intervals
SELECT
generate_series(
'2024-01-01 00:00:00'::timestamp,
'2024-01-01 23:00:00'::timestamp,
'1 hour'::interval
) AS hour_series;
Advanced CTE Patterns
-- Recursive CTE with LATERAL
WITH RECURSIVE org_chart AS (
SELECT
emp_id,
name,
1 AS level,
ARRAY[name] AS path
FROM employees
WHERE emp_id = 1
UNION ALL
SELECT
e.emp_id,
e.name,
oc.level + 1,
oc.path || e.name
FROM org_chart oc
CROSS JOIN LATERAL (
SELECT emp_id, name
FROM employees
WHERE manager_id = oc.emp_id
) e
WHERE oc.level < 5
)
SELECT * FROM org_chart;
-- CTE with window functions
WITH ranked_employees AS (
SELECT
dept_id,
name,
salary,
RANK() OVER (PARTITION BY dept_id ORDER BY salary DESC) AS rank
FROM employees
)
SELECT
d.dept_name,
r.name,
r.salary,
r.rank
FROM ranked_employees r
JOIN departments d ON r.dept_id = d.dept_id
WHERE r.rank <= 2
ORDER BY d.dept_name, r.rank;
JSONB with LATERAL
-- Create table with JSONB
CREATE TABLE orders (
order_id SERIAL PRIMARY KEY,
customer_id INT,
items JSONB
);
INSERT INTO orders (customer_id, items) VALUES
(1, '[{"product": "A", "qty": 2, "price": 10}, {"product": "B", "qty": 1, "price": 20}]'),
(2, '[{"product": "C", "qty": 3, "price": 15}]'),
(3, '[{"product": "A", "qty": 1, "price": 10}, {"product": "C", "qty": 2, "price": 15}]');
-- Unnest JSON array with LATERAL
SELECT
o.order_id,
item->>'product' AS product,
(item->>'qty')::int AS quantity,
(item->>'price')::decimal AS price
FROM orders o,
LATERAL jsonb_array_elements(o.items) AS item
ORDER BY o.order_id;
-- Aggregate JSON items
SELECT
o.order_id,
SUM((item->>'qty')::int * (item->>'price')::decimal) AS total
FROM orders o,
LATERAL jsonb_array_elements(o.items) AS item
GROUP BY o.order_id;
Mathematical Properties
For a LATERAL join with outer rows and inner rows:
For GENERATE_SERIES with range and step :
ℹ️
Performance Tip: LATERAL joins are powerful but can be expensive. Ensure the inner query is indexed and returns few rows per outer row.
Time Series Gap Filling
-- Create time series with gaps
CREATE TABLE metrics (
metric_time TIMESTAMP,
value DECIMAL(10,2)
);
INSERT INTO metrics VALUES
('2024-01-01 00:00:00', 100),
('2024-01-01 01:00:00', 110),
('2024-01-01 03:00:00', 95), -- Gap at hour 2
('2024-01-01 05:00:00', 105); -- Gap at hour 4
-- Fill gaps with interpolation
WITH time_series AS (
SELECT generate_series(
'2024-01-01 00:00:00'::timestamp,
'2024-01-01 05:00:00'::timestamp,
'1 hour'::interval
) AS ts
)
SELECT
ts.ts AS metric_time,
COALESCE(m.value,
LAG(m.value) OVER (ORDER BY ts.ts) +
(ts.ts - LAG(ts.ts) OVER (ORDER BY ts.ts))::interval /
(LEAD(ts.ts) OVER (ORDER BY ts.ts) - LAG(ts.ts) OVER (ORDER BY ts.ts))::interval *
(LEAD(m.value) OVER (ORDER BY ts.ts) - LAG(m.value) OVER (ORDER BY ts.ts))
) AS interpolated_value
FROM time_series ts
LEFT JOIN metrics m ON ts.ts = m.metric_time
ORDER BY ts.ts;
⚠️
Common Pitfall: LATERAL with correlated subqueries can cause N+1 query problems. Always check execution plans.