Advanced SQL for Data Engineers

Beyond the Basics

Advanced SQL separates competent data engineers from exceptional ones. These techniques enable you to solve complex data problems, optimize performance, and build sophisticated transformations.

Complex Window Functions

Advanced Window Frames

-- Cumulative distribution
SELECT 
    employee_name,
    salary,
    department,
    CUME_DIST() OVER (
        PARTITION BY department 
        ORDER BY salary DESC
    ) AS percentile,
    PERCENT_RANK() OVER (
        PARTITION BY department 
        ORDER BY salary DESC
    ) AS percentile_rank
FROM employees;

-- First and last values in window
SELECT 
    order_date,
    daily_revenue,
    FIRST_VALUE(daily_revenue) OVER (
        ORDER BY order_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS first_day_revenue,
    LAST_VALUE(daily_revenue) OVER (
        ORDER BY order_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_day_revenue
FROM daily_sales;

-- NTH value
SELECT 
    product_name,
    category,
    total_sales,
    NTH_VALUE(product_name, 3) OVER (
        PARTITION BY category 
        ORDER BY total_sales DESC
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS third_best_seller
FROM product_sales;

Window Function Statistical Analysis

Window Functions for Data Quality

-- Detect consecutive days with zero sales
WITH daily_sales AS (
    SELECT generate_series('2024-01-01'::date, '2024-12-31'::date, '1 day') AS sale_date
),
actual_sales AS (
    SELECT DISTINCT order_date::date AS sale_date
    FROM orders
),
missing_days AS (
    SELECT d.sale_date
    FROM daily_sales d
    LEFT JOIN actual_sales a ON d.sale_date = a.sale_date
    WHERE a.sale_date IS NULL
),
consecutive_groups AS (
    SELECT 
        sale_date,
        sale_date - ROW_NUMBER() OVER (ORDER BY sale_date)::int AS grp
    FROM missing_days
)
SELECT 
    MIN(sale_date) AS gap_start,
    MAX(sale_date) AS gap_end,
    COUNT(*) AS consecutive_days
FROM consecutive_groups
GROUP BY grp
HAVING COUNT(*) >= 3
ORDER BY gap_start;

-- Find gaps in sequential IDs
WITH numbered AS (
    SELECT 
        order_id,
        order_id - ROW_NUMBER() OVER (ORDER BY order_id) AS gap_group
    FROM orders
)
SELECT 
    MIN(order_id) AS gap_start,
    MAX(order_id) AS gap_end,
    MAX(order_id) - MIN(order_id) - COUNT(*) + 1 AS gap_size
FROM numbered
GROUP BY gap_group
HAVING MAX(order_id) - MIN(order_id) > COUNT(*) - 1
ORDER BY gap_start;

Window Function Performance Considerations

Function	Performance	Memory Usage	Optimization
ROW_NUMBER()	Fast	Low	Use for deduplication
RANK()	Fast	Low	Use for leaderboards
LAG/LEAD	Fast	Low	Use for time series
SUM/AVG OVER	Medium	Medium	Partition appropriately
NTILE()	Fast	Low	Use for bucketing
FIRST_VALUE/LAST_VALUE	Fast	Low	Use for snapshots

Recursive CTEs

Recursive CTEs enable hierarchical and graph-like queries — essential for org charts, bill of materials, and pathfinding.

Syntax

WITH RECURSIVE cte_name AS (
    -- Base case (anchor member)
    SELECT ... FROM table WHERE condition
    
    UNION ALL
    
    -- Recursive member
    SELECT ... FROM table 
    INNER JOIN cte_name ON ...
)
SELECT * FROM cte_name;

Practical Examples

-- Organizational hierarchy
WITH RECURSIVE org_chart AS (
    -- Base case: CEO (no manager)
    SELECT 
        employee_id,
        name,
        manager_id,
        1 AS level,
        name::text AS path
    FROM employees
    WHERE manager_id IS NULL
    
    UNION ALL
    
    -- Recursive: employees with managers
    SELECT 
        e.employee_id,
        e.name,
        e.manager_id,
        oc.level + 1,
        oc.path || ' -> ' || e.name
    FROM employees e
    INNER JOIN org_chart oc ON e.manager_id = oc.employee_id
)
SELECT 
    employee_id,
    name,
    level,
    path
FROM org_chart
ORDER BY path;

-- Bill of materials (product components)
WITH RECURSIVE bom AS (
    -- Base case: top-level product
    SELECT 
        product_id,
        component_id,
        quantity,
        1 AS depth,
        product_id::text AS path
    FROM bill_of_materials
    WHERE product_id = 1001
    
    UNION ALL
    
    -- Recursive: sub-components
    SELECT 
        b.product_id,
        b.component_id,
        b.quantity * bom.quantity AS quantity,
        bom.depth + 1,
        bom.path || '.' || b.component_id::text
    FROM bill_of_materials b
    INNER JOIN bom ON b.product_id = bom.component_id
)
SELECT 
    component_id,
    SUM(quantity) AS total_quantity,
    MAX(depth) AS max_depth
FROM bom
GROUP BY component_id
ORDER BY component_id;

-- Find all ancestors of a node
WITH RECURSIVE ancestors AS (
    SELECT 
        employee_id,
        name,
        manager_id,
        ARRAY[employee_id] AS visited
    FROM employees
    WHERE employee_id = 42  -- Target employee
    
    UNION ALL
    
    SELECT 
        e.employee_id,
        e.name,
        e.manager_id,
        a.visited || e.employee_id
    FROM employees e
    INNER JOIN ancestors a ON e.employee_id = a.manager_id
    WHERE e.employee_id != ALL(a.visited)  -- Prevent cycles
)
SELECT employee_id, name FROM ancestors;

Recursive CTE Performance Optimization

# Example: Recursive CTE performance analysis
from dataclasses import dataclass
from typing import List, Dict
import time

@dataclass
class RecursiveCTEConfig:
    """Configuration for recursive CTE performance."""
    
    max_recursion_depth: int = 100      # Maximum recursion depth
    use_cycle_detection: bool = True    # Detect cycles in data
    materialize_intermediate: bool = True  # Materialize intermediate results
    
    def estimate_performance(
        self,
        table_size: int,               # Number of rows in table
        average_children: float,        # Average children per node
        depth: int                      # Expected recursion depth
    ) -> Dict:
        """Estimate recursive CTE performance."""
        
        # Estimate total rows processed
        total_rows = 0
        current_level_rows = 1  # Start with root
        
        for d in range(depth):
            total_rows += current_level_rows
            current_level_rows *= average_children
            
            # Stop if we exceed max recursion
            if d >= self.max_recursion_depth:
                break
        
        # Estimate time based on rows processed
        # Assuming ~1000 rows/second processing speed
        estimated_time_seconds = total_rows / 1000
        
        return {
            'total_rows_processed': total_rows,
            'estimated_time_seconds': estimated_time_seconds,
            'recommended_materialization': total_rows > 100000,
            'optimization_suggestions': self._get_optimizations(total_rows)
        }
    
    def _get_optimizations(self, total_rows: int) -> List[str]:
        """Get optimization suggestions based on row count."""
        suggestions = []
        
        if total_rows > 1000000:
            suggestions.append("Consider using MATERIALIZED VIEW for intermediate results")
        
        if not self.use_cycle_detection:
            suggestions.append("Enable cycle detection to prevent infinite loops")
        
        if self.max_recursion_depth > 50:
            suggestions.append("Reduce max recursion depth if possible")
        
        return suggestions

# Example usage
config = RecursiveCTEConfig(
    max_recursion_depth=100,
    use_cycle_detection=True,
    materialize_intermediate=True
)

performance = config.estimate_performance(
    table_size=1000000,
    average_children=2.5,
    depth=10
)

print(f"Total rows to process: {performance['total_rows_processed']:,}")
print(f"Estimated time: {performance['estimated_time_seconds']:.2f} seconds")
print(f"Optimization suggestions: {performance['optimization_suggestions']}")

Pivot and Unpivot

Manual Pivot (Works in all databases)

-- Pivot: rows to columns
SELECT 
    product_name,
    SUM(CASE WHEN EXTRACT(quarter FROM order_date) = 1 THEN amount ELSE 0 END) AS q1_sales,
    SUM(CASE WHEN EXTRACT(quarter FROM order_date) = 2 THEN amount ELSE 0 END) AS q2_sales,
    SUM(CASE WHEN EXTRACT(quarter FROM order_date) = 3 THEN amount ELSE 0 END) AS q3_sales,
    SUM(CASE WHEN EXTRACT(quarter FROM order_date) = 4 THEN amount ELSE 0 END) AS q4_sales
FROM orders
GROUP BY product_name;

-- Pivot with dynamic columns (PostgreSQL)
DO $$
DECLARE
    rec RECORD;
    sql TEXT := 'SELECT product_name';
BEGIN
    FOR rec IN 
        SELECT DISTINCT EXTRACT(quarter FROM order_date) AS q
        FROM orders
        ORDER BY q
    LOOP
        sql := sql || ', SUM(CASE WHEN EXTRACT(quarter FROM order_date) = ' 
             || rec.q || ' THEN amount ELSE 0 END) AS q' || rec.q || '_sales';
    END LOOP;
    
    sql := sql || ' FROM orders GROUP BY product_name';
    RAISE NOTICE '%', sql;
END $$;

Native Pivot (Database-Specific)

-- PostgreSQL: Using crosstab
CREATE EXTENSION IF NOT EXISTS tablefunc;

SELECT *
FROM crosstab(
    'SELECT product_name, 
            EXTRACT(quarter FROM order_date)::int AS quarter, 
            SUM(amount)::numeric AS sales
     FROM orders
     GROUP BY product_name, EXTRACT(quarter FROM order_date)
     ORDER BY 1, 2',
    'SELECT DISTINCT EXTRACT(quarter FROM order_date)::int 
     FROM orders ORDER BY 1'
) AS ct(product_name text, q1 numeric, q2 numeric, q3 numeric, q4 numeric);

-- SQL Server: PIVOT operator
SELECT *
FROM (
    SELECT 
        product_name,
        'Q' + CAST(EXTRACT(quarter FROM order_date) AS VARCHAR) AS quarter,
        amount
    FROM orders
) src
PIVOT (
    SUM(amount)
    FOR quarter IN ([Q1], [Q2], [Q3], [Q4])
) pvt;

Pivot Performance Analysis

LATERAL Joins

LATERAL joins allow subqueries to reference columns from preceding tables — essential for "top-N per group" and correlated calculations.

-- Top 3 orders per customer (LATERAL approach)
SELECT 
    c.customer_id,
    c.first_name,
    top_orders.*
FROM customers c
CROSS JOIN LATERAL (
    SELECT 
        order_id,
        order_date,
        amount
    FROM orders o
    WHERE o.customer_id = c.customer_id
    ORDER BY amount DESC
    LIMIT 3
) top_orders;

-- Get most recent order per customer with full details
SELECT 
    c.customer_id,
    c.first_name,
    c.last_name,
    recent.order_id,
    recent.order_date,
    recent.total_amount
FROM customers c
CROSS JOIN LATERAL (
    SELECT *
    FROM orders o
    WHERE o.customer_id = c.customer_id
    ORDER BY o.order_date DESC
    LIMIT 1
) recent;

-- Running totals per customer
SELECT 
    c.customer_id,
    o.order_date,
    o.amount,
    running.total AS running_total
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
CROSS JOIN LATERAL (
    SELECT SUM(o2.amount) AS total
    FROM orders o2
    WHERE o2.customer_id = c.customer_id
      AND o2.order_date <= o.order_date
) running;

LATERAL vs Window Functions

Aspect	LATERAL Join	Window Function
Flexibility	Can access other tables	Only current row
Performance	Better for top-N	Better for rankings
Readability	More complex	Cleaner syntax
Use Case	Correlated subqueries	Row-level calculations
Indexing	Can use indexes on joined tables	Limited index usage

Query Execution Plans

Reading EXPLAIN Output

-- Basic EXPLAIN
EXPLAIN SELECT * FROM orders WHERE customer_id = 123;

-- With actual execution statistics
EXPLAIN ANALYZE
SELECT 
    c.first_name,
    COUNT(o.order_id)
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
WHERE c.country = 'USA'
GROUP BY c.first_name;

-- Output example:
/*
HashAggregate  (cost=12345.67..12345.89 rows=22 width=20)
  Group Key: c.first_name
  ->  Hash Join  (cost=100.00..12345.00 rows=50000 width=16)
        Hash Cond: (o.customer_id = c.customer_id)
        ->  Seq Scan on orders o  (cost=0.00..10000.00 rows=500000 width=12)
        ->  Hash  (cost=100.00..100.00 rows=5000 width=16)
              ->  Seq Scan on customers c  (cost=0.00..100.00 rows=5000 width=16)
                    Filter: (country = 'USA')
Planning Time: 0.123 ms
Execution Time: 123.456 ms
*/

Common Execution Plan Operations

Operation	Description	Performance Impact	When to Use
Seq Scan	Reads entire table	Slow on large tables	Small tables, full scans
Index Scan	Uses index to find rows	Fast	Filtered queries
Index Only Scan	Data from index only	Very fast	Covering indexes
Hash Join	Builds hash table for join	Good for large joins	Equi-joins
Nested Loop	Iterates and looks up	Good for small datasets	Small result sets
Sort	Orders result set	Can be expensive	ORDER BY operations
Aggregate	Groups and aggregates	May require sorting	GROUP BY operations

EXPLAIN Output Metrics

# Example: Parse EXPLAIN ANALYZE output
import re
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class ExplainPlan:
    """Parsed EXPLAIN ANALYZE output."""
    
    planning_time: float          # Planning time in ms
    execution_time: float         # Execution time in ms
    total_cost: float             # Estimated total cost
    actual_rows: int              # Actual rows returned
    estimated_rows: int           # Estimated rows
    operations: List[Dict]        # List of operations
    
    def calculate_accuracy(self) -> float:
        """Calculate estimation accuracy."""
        if self.estimated_rows == 0:
            return 0.0
        return min(self.actual_rows / self.estimated_rows, 
                   self.estimated_rows / self.actual_rows) * 100
    
    def identify_bottlenecks(self) -> List[str]:
        """Identify performance bottlenecks."""
        bottlenecks = []
        
        for op in self.operations:
            if op.get('type') == 'Seq Scan' and op.get('rows_removed', 0) > 10000:
                bottlenecks.append(f"Sequential scan on {op.get('table')} removed {op.get('rows_removed')} rows")
            
            if op.get('actual_time', 0) > 1000:
                bottlenecks.append(f"Slow operation: {op.get('type')} took {op.get('actual_time')}ms")
        
        return bottlenecks

def parse_explain_output(explain_text: str) -> ExplainPlan:
    """Parse EXPLAIN ANALYZE output text."""
    # This is a simplified parser - real implementation would be more complex
    
    planning_time = 0.0
    execution_time = 0.0
    total_cost = 0.0
    operations = []
    
    # Extract planning time
    planning_match = re.search(r'Planning Time: ([\d.]+) ms', explain_text)
    if planning_match:
        planning_time = float(planning_match.group(1))
    
    # Extract execution time
    execution_match = re.search(r'Execution Time: ([\d.]+) ms', explain_text)
    if execution_match:
        execution_time = float(execution_match.group(1))
    
    # Extract cost information
    cost_match = re.search(r'cost=([\d.]+)\.\.([\d.]+)', explain_text)
    if cost_match:
        total_cost = float(cost_match.group(2))
    
    return ExplainPlan(
        planning_time=planning_time,
        execution_time=execution_time,
        total_cost=total_cost,
        actual_rows=0,
        estimated_rows=0,
        operations=operations
    )

Partitioning Strategies

Range Partitioning

-- PostgreSQL: Create partitioned table
CREATE TABLE orders (
    order_id BIGINT,
    customer_id INTEGER,
    order_date DATE,
    amount DECIMAL(10,2)
) PARTITION BY RANGE (order_date);

-- Create partitions
CREATE TABLE orders_2023 PARTITION OF orders
    FOR VALUES FROM ('2023-01-01') TO ('2024-01-01');

CREATE TABLE orders_2024 PARTITION OF orders
    FOR VALUES FROM ('2024-01-01') TO ('2025-01-01');

CREATE TABLE orders_2025 PARTITION OF orders
    FOR VALUES FROM ('2025-01-01') TO ('2026-01-01');

-- Query automatically uses partition pruning
SELECT * FROM orders 
WHERE order_date >= '2024-06-01' 
  AND order_date < '2024-07-01';
-- Only scans orders_2024 partition

Hash Partitioning

-- Distribute data evenly across partitions
CREATE TABLE events (
    event_id BIGINT,
    user_id INTEGER,
    event_type VARCHAR(50),
    event_time TIMESTAMP
) PARTITION BY HASH (user_id);

-- Create partitions
CREATE TABLE events_p0 PARTITION OF events
    FOR VALUES WITH (MODULUS 4, REMAINDER 0);
CREATE TABLE events_p1 PARTITION OF events
    FOR VALUES WITH (MODULUS 4, REMAINDER 1);
CREATE TABLE events_p2 PARTITION OF events
    FOR VALUES WITH (MODULUS 4, REMAINDER 2);
CREATE TABLE events_p3 PARTITION OF events
    FOR VALUES WITH (MODULUS 4, REMAINDER 3);

Partitioning Strategy Selection

Partition Performance Comparison

Strategy	Query Performance	Load Balance	Maintenance	Best For
Range	Excellent for range queries	Can be uneven	Easy (add/drop partitions)	Time-series data
Hash	Good for point queries	Excellent	Moderate	Even distribution
List	Excellent for categorical	Can be uneven	Easy	Regional data
Composite	Best overall	Good	Complex	Multi-dimensional queries

Advanced Aggregations

-- GROUPING SETS: Multiple aggregation levels
SELECT 
    COALESCE(country, '(All Countries)') AS country,
    COALESCE(city, '(All Cities)') AS city,
    SUM(amount) AS total_revenue
FROM orders
GROUP BY GROUPING SETS (
    (country, city),  -- Country + City level
    (country),        -- Country level
    ()                -- Grand total
);

-- ROLLUP: Hierarchical aggregation
SELECT 
    country,
    city,
    SUM(amount) AS total_revenue
FROM orders
GROUP BY ROLLUP (country, city);
-- Produces: country+city, country, grand total

-- CUBE: All combinations
SELECT 
    country,
    city,
    category,
    SUM(amount) AS total_revenue
FROM orders
GROUP BY CUBE (country, city, category);
-- Produces: all possible combinations of the 3 columns

-- FILTER clause (PostgreSQL)
SELECT 
    order_date,
    COUNT(*) AS total_orders,
    COUNT(*) FILTER (WHERE status = 'completed') AS completed_orders,
    COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled_orders,
    SUM(amount) FILTER (WHERE amount > 100) AS high_value_revenue
FROM orders
GROUP BY order_date;

Aggregation Performance Analysis

# Example: Aggregation performance comparison
from dataclasses import dataclass
from typing import Dict, List
import time

@dataclass
class AggregationPerformance:
    """Compare performance of different aggregation methods."""
    
    method_name: str                 # Aggregation method
    data_size: int                   # Number of rows
    execution_time: float            # Execution time in seconds
    memory_usage_mb: float           # Memory usage in MB
    
    def calculate_throughput(self) -> float:
        """Calculate rows per second."""
        if self.execution_time == 0:
            return float('inf')
        return self.data_size / self.execution_time
    
    def calculate_efficiency(self) -> float:
        """Calculate efficiency score (rows per MB)."""
        if self.memory_usage_mb == 0:
            return float('inf')
        return self.data_size / self.memory_usage_mb

def compare_aggregation_methods(
    data_sizes: List[int]             # Different data sizes to test
) -> Dict[str, List[AggregationPerformance]]:
    """Compare aggregation methods across data sizes."""
    
    results = {
        'GROUP BY': [],
        'GROUPING SETS': [],
        'ROLLUP': [],
        'CUBE': []
    }
    
    for size in data_sizes:
        # Simulate performance results
        # In reality, these would be actual measurements
        
        group_by_time = size / 1000000  # Simulated
        grouping_sets_time = size / 800000  # Slightly slower
        rollup_time = size / 900000
        cube_time = size / 700000  # Slowest due to all combinations
        
        results['GROUP BY'].append(AggregationPerformance(
            method_name='GROUP BY',
            data_size=size,
            execution_time=group_by_time,
            memory_usage_mb=size * 0.001
        ))
        
        results['GROUPING SETS'].append(AggregationPerformance(
            method_name='GROUPING SETS',
            data_size=size,
            execution_time=grouping_sets_time,
            memory_usage_mb=size * 0.0015
        ))
        
        results['ROLLUP'].append(AggregationPerformance(
            method_name='ROLLUP',
            data_size=size,
            execution_time=rollup_time,
            memory_usage_mb=size * 0.0012
        ))
        
        results['CUBE'].append(AggregationPerformance(
            method_name='CUBE',
            data_size=size,
            execution_time=cube_time,
            memory_usage_mb=size * 0.002
        ))
    
    return results

# Example usage
sizes = [100000, 1000000, 10000000]
comparison = compare_aggregation_methods(sizes)

for method, performances in comparison.items():
    print(f"\n{method}:")
    for perf in performances:
        print(f"  Size: {perf.data_size:,}, Time: {perf.execution_time:.2f}s, "
              f"Throughput: {perf.calculate_throughput():,.0f} rows/sec")

Performance Tuning Checklist

Performance Optimization Examples

-- Before: Suboptimal query
EXPLAIN ANALYZE
SELECT *
FROM orders
WHERE EXTRACT(year FROM order_date) = 2024;

-- Problem: Function on column prevents index use
-- Seq Scan on orders (cost=0.00..15000.00 rows=100000)

-- After: Optimized query
EXPLAIN ANALYZE
SELECT *
FROM orders
WHERE order_date >= '2024-01-01'
  AND order_date < '2025-01-01';

-- Improvement: Range scan can use index
-- Index Scan using idx_orders_date on orders (cost=0.43..5000.00 rows=100000)

-- Before: Suboptimal JOIN
EXPLAIN ANALYZE
SELECT c.name, COUNT(o.id)
FROM customers c
LEFT JOIN orders o ON c.id = o.customer_id
WHERE c.country = 'USA'
GROUP BY c.name;

-- After: Optimized with filter first
EXPLAIN ANALYZE
WITH usa_customers AS (
    SELECT id, name FROM customers WHERE country = 'USA'
)
SELECT uc.name, COUNT(o.id)
FROM usa_customers uc
LEFT JOIN orders o ON uc.id = o.customer_id
GROUP BY uc.name;

Key Takeaways

Window functions are essential — master frame specifications, ranking, and analytical functions
Recursive CTEs enable hierarchical queries — org charts, bill of materials, pathfinding
LATERAL joins solve top-N per group — more flexible than window functions in some cases
Understand execution plans — EXPLAIN ANALYZE is your best debugging tool
Partitioning improves query performance — use range partitioning for time-series data
GROUPING SETS, ROLLUP, CUBE — provide flexible multi-level aggregations
Always optimize before scaling — a well-tuned query can outperform a faster database

Practice Exercises

Window functions: Write a query that calculates the 7-day moving average of daily revenue and identifies days where actual revenue is more than 20% above or below the moving average.
Recursive CTE: Write a query to find the shortest path between two nodes in a graph represented by an adjacency list table.
Pivot: Transform a table of monthly sales into a quarterly summary with columns for Q1-Q4 and year-over-year comparison.
LATERAL: For each customer, find their most recent order and the time since their previous order.
Performance: Take a complex query from your work, create proper indexes, and demonstrate at least 10x improvement using EXPLAIN ANALYZE.