SQL for Data Scientists
SQL is the fundamental language for data extraction and manipulation. This lesson covers essential SQL techniques for data science, from basic queries to advanced window functions.
SQL Query Structure
SELECT column1, column2, ...
FROM table_name
WHERE condition
GROUP BY column1, HAVING condition
ORDER BY column1 [ASC|DESC]
LIMIT n OFFSET m;
JOINs
<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
<!-- Background -->
<rect width="600" height="400" fill="#f8f9fa" rx="10"/>
<!-- Title -->
<text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">SQL JOIN Types</text>
<!-- INNER JOIN -->
<circle cx="200" cy="120" r="60" fill="#3498db" opacity="0.7"/>
<circle cx="280" cy="120" r="60" fill="#2ecc71" opacity="0.7"/>
<text x="240" y="125" text-anchor="middle" font-size="12" fill="white">INNER</text>
<text x="240" y="155" text-anchor="middle" font-size="10" fill="white">WHERE A.key = B.key</text>
<!-- LEFT JOIN -->
<circle cx="200" cy="240" r="60" fill="#3498db" opacity="0.7"/>
<circle cx="280" cy="240" r="60" fill="#2ecc71" opacity="0.3"/>
<text x="240" y="245" text-anchor="middle" font-size="12" fill="white">LEFT</text>
<text x="240" y="275" text-anchor="middle" font-size="10" fill="white">FROM A LEFT JOIN B</text>
<!-- Labels -->
<text x="140" y="120" text-anchor="middle" font-size="12" fill="#2c3e50">Table A</text>
<text x="340" y="120" text-anchor="middle" font-size="12" fill="#2c3e50">Table B</text>
<!-- Arrow -->
<line x1="200" y1="180" x2="200" y2="200" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<defs>
<marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto">
<path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
</marker>
</defs>
</svg>
Common JOIN Examples
-- INNER JOIN: Returns matching rows
SELECT e.name, d.department_name
FROM employees e
INNER JOIN departments d ON e.dept_id = d.id;
-- LEFT JOIN: All from left, matching from right
SELECT e.name, d.department_name
FROM employees e
LEFT JOIN departments d ON e.dept_id = d.id;
-- FULL OUTER JOIN: All from both sides
SELECT e.name, d.department_name
FROM employees e
FULL OUTER JOIN departments d ON e.dept_id = d.id;
-- Self JOIN: Join table to itself
SELECT e.name AS employee, m.name AS manager
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.id;
Subqueries and CTEs
-- Subquery in WHERE
SELECT * FROM orders
WHERE customer_id IN (
SELECT id FROM customers WHERE region = 'West'
);
-- CTE (Common Table Expression)
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', order_date) AS month,
SUM(amount) AS total_sales
FROM orders
GROUP BY 1
)
SELECT * FROM monthly_sales
WHERE total_sales > 10000;
-- Recursive CTE for hierarchical data
WITH RECURSIVE category_tree AS (
SELECT id, name, parent_id, 1 AS level
FROM categories WHERE parent_id IS NULL
UNION ALL
SELECT c.id, c.name, c.parent_id, ct.level + 1
FROM categories c
JOIN category_tree ct ON c.parent_id = ct.id
)
SELECT * FROM category_tree;
Window Functions
-- Row number, rank, dense rank
SELECT
name,
department,
salary,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank_num,
DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank_num
FROM employees;
-- Running total and moving average
SELECT
order_date,
amount,
SUM(amount) OVER (ORDER BY order_date) AS running_total,
AVG(amount) OVER (ORDER BY order_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg_7d
FROM orders;
-- Lag and lead for time comparisons
SELECT
month,
revenue,
LAG(revenue, 1) OVER (ORDER BY month) AS prev_month,
LEAD(revenue, 1) OVER (ORDER BY month) AS next_month,
revenue - LAG(revenue, 1) OVER (ORDER BY month) AS month_over_month
FROM monthly_revenue;
Aggregation and Grouping
-- Basic aggregation
SELECT
department,
COUNT(*) AS employee_count,
AVG(salary) AS avg_salary,
MAX(salary) AS max_salary
FROM employees
GROUP BY department
HAVING COUNT(*) > 5;
-- GROUPING SETS for multiple levels
SELECT
COALESCE(department, 'All Depts') AS department,
COALESCE(region, 'All Regions') AS region,
SUM(salary) AS total_salary
FROM employees
GROUP BY GROUPING SETS (
(department, region),
(department),
(region),
()
);
Optimization Tips
-- Use EXPLAIN to analyze queries
EXPLAIN ANALYZE
SELECT * FROM orders WHERE customer_id = 123;
-- Create indexes for frequently queried columns
CREATE INDEX idx_orders_customer_id ON orders(customer_id);
CREATE INDEX idx_orders_date ON orders(order_date);
-- Use EXPLAIN ANALYZE to see actual execution plan
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT c.name, COUNT(o.id)
FROM customers c
JOIN orders o ON c.id = o.customer_id
GROUP BY c.name;
-- Avoid SELECT * in production
-- Bad: SELECT * FROM large_table;
-- Good: SELECT col1, col2, col3 FROM large_table;
-- Use appropriate JOIN order
-- Start with smallest filtered result set
Key Takeaways
- Master JOINs for combining datasets
- Use CTEs for readable, modular queries
- Window functions enable powerful analytics without self-joins
- Always use EXPLAIN ANALYZE to optimize performance
- Index strategically based on query patterns