CW

SQL for Data Scientists

Module 5: Data Collection & SQLFree Lesson

Advertisement

SQL for Data Scientists

SQL is the fundamental language for data extraction and manipulation. This lesson covers essential SQL techniques for data science, from basic queries to advanced window functions.

SQL Query Structure

SELECT column1, column2, ...
FROM table_name
WHERE condition
GROUP BY column1, HAVING condition
ORDER BY column1 [ASC|DESC]
LIMIT n OFFSET m;

JOINs

<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
  <!-- Background -->
  <rect width="600" height="400" fill="#f8f9fa" rx="10"/>
  
  <!-- Title -->
  <text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">SQL JOIN Types</text>
  
  <!-- INNER JOIN -->
  <circle cx="200" cy="120" r="60" fill="#3498db" opacity="0.7"/>
  <circle cx="280" cy="120" r="60" fill="#2ecc71" opacity="0.7"/>
  <text x="240" y="125" text-anchor="middle" font-size="12" fill="white">INNER</text>
  <text x="240" y="155" text-anchor="middle" font-size="10" fill="white">WHERE A.key = B.key</text>
  
  <!-- LEFT JOIN -->
  <circle cx="200" cy="240" r="60" fill="#3498db" opacity="0.7"/>
  <circle cx="280" cy="240" r="60" fill="#2ecc71" opacity="0.3"/>
  <text x="240" y="245" text-anchor="middle" font-size="12" fill="white">LEFT</text>
  <text x="240" y="275" text-anchor="middle" font-size="10" fill="white">FROM A LEFT JOIN B</text>
  
  <!-- Labels -->
  <text x="140" y="120" text-anchor="middle" font-size="12" fill="#2c3e50">Table A</text>
  <text x="340" y="120" text-anchor="middle" font-size="12" fill="#2c3e50">Table B</text>
  
  <!-- Arrow -->
  <line x1="200" y1="180" x2="200" y2="200" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <defs>
    <marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto">
      <path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
    </marker>
  </defs>
</svg>

Common JOIN Examples

-- INNER JOIN: Returns matching rows
SELECT e.name, d.department_name
FROM employees e
INNER JOIN departments d ON e.dept_id = d.id;

-- LEFT JOIN: All from left, matching from right
SELECT e.name, d.department_name
FROM employees e
LEFT JOIN departments d ON e.dept_id = d.id;

-- FULL OUTER JOIN: All from both sides
SELECT e.name, d.department_name
FROM employees e
FULL OUTER JOIN departments d ON e.dept_id = d.id;

-- Self JOIN: Join table to itself
SELECT e.name AS employee, m.name AS manager
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.id;

Subqueries and CTEs

-- Subquery in WHERE
SELECT * FROM orders
WHERE customer_id IN (
    SELECT id FROM customers WHERE region = 'West'
);

-- CTE (Common Table Expression)
WITH monthly_sales AS (
    SELECT 
        DATE_TRUNC('month', order_date) AS month,
        SUM(amount) AS total_sales
    FROM orders
    GROUP BY 1
)
SELECT * FROM monthly_sales
WHERE total_sales > 10000;

-- Recursive CTE for hierarchical data
WITH RECURSIVE category_tree AS (
    SELECT id, name, parent_id, 1 AS level
    FROM categories WHERE parent_id IS NULL
    UNION ALL
    SELECT c.id, c.name, c.parent_id, ct.level + 1
    FROM categories c
    JOIN category_tree ct ON c.parent_id = ct.id
)
SELECT * FROM category_tree;

Window Functions

-- Row number, rank, dense rank
SELECT 
    name,
    department,
    salary,
    ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num,
    RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank_num,
    DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank_num
FROM employees;

-- Running total and moving average
SELECT 
    order_date,
    amount,
    SUM(amount) OVER (ORDER BY order_date) AS running_total,
    AVG(amount) OVER (ORDER BY order_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg_7d
FROM orders;

-- Lag and lead for time comparisons
SELECT 
    month,
    revenue,
    LAG(revenue, 1) OVER (ORDER BY month) AS prev_month,
    LEAD(revenue, 1) OVER (ORDER BY month) AS next_month,
    revenue - LAG(revenue, 1) OVER (ORDER BY month) AS month_over_month
FROM monthly_revenue;

Aggregation and Grouping

-- Basic aggregation
SELECT 
    department,
    COUNT(*) AS employee_count,
    AVG(salary) AS avg_salary,
    MAX(salary) AS max_salary
FROM employees
GROUP BY department
HAVING COUNT(*) > 5;

-- GROUPING SETS for multiple levels
SELECT 
    COALESCE(department, 'All Depts') AS department,
    COALESCE(region, 'All Regions') AS region,
    SUM(salary) AS total_salary
FROM employees
GROUP BY GROUPING SETS (
    (department, region),
    (department),
    (region),
    ()
);

Optimization Tips

-- Use EXPLAIN to analyze queries
EXPLAIN ANALYZE
SELECT * FROM orders WHERE customer_id = 123;

-- Create indexes for frequently queried columns
CREATE INDEX idx_orders_customer_id ON orders(customer_id);
CREATE INDEX idx_orders_date ON orders(order_date);

-- Use EXPLAIN ANALYZE to see actual execution plan
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT c.name, COUNT(o.id) 
FROM customers c
JOIN orders o ON c.id = o.customer_id
GROUP BY c.name;

-- Avoid SELECT * in production
-- Bad: SELECT * FROM large_table;
-- Good: SELECT col1, col2, col3 FROM large_table;

-- Use appropriate JOIN order
-- Start with smallest filtered result set

Key Takeaways

  1. Master JOINs for combining datasets
  2. Use CTEs for readable, modular queries
  3. Window functions enable powerful analytics without self-joins
  4. Always use EXPLAIN ANALYZE to optimize performance
  5. Index strategically based on query patterns

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement