Advanced Analytic Functions

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

Statistical Aggregate Functions

-- Advanced statistical calculations
SELECT
  department_id,
  COUNT(*) AS n,
  AVG(salary) AS mean,
  STDDEV_POP(salary) AS std_dev_population,
  STDDEV_SAMP(salary) AS std_dev_sample,
  VAR_POP(salary) AS variance_population,
  VAR_SAMP(salary) AS variance_sample,
  CORR(salary, years_experience) AS correlation,
  REGR_SLOPE(salary, years_experience) AS regression_slope,
  REGR_INTERCEPT(salary, years_experience) AS regression_intercept,
  REGR_R2(salary, years_experience) AS r_squared
FROM employees
GROUP BY department_id;

ℹ️

Key Insight: Use _POP for population statistics and _SAMP for sample statistics. CORR() returns the Pearson correlation coefficient (-1 to 1). REGR_SLOPE and REGR_INTERCEPT compute linear regression coefficients.

Linear Regression Analysis

-- Perform linear regression per department
WITH regression_stats AS (
  SELECT
    department_id,
    REGR_SLOPE(salary, years_experience) AS slope,
    REGR_INTERCEPT(salary, years_experience) AS intercept,
    REGR_R2(salary, years_experience) AS r_squared
  FROM employees
  GROUP BY department_id
)
SELECT
  e.employee_id,
  e.department_id,
  e.salary,
  e.years_experience,
  r.slope * e.years_experience + r.intercept AS predicted_salary,
  e.salary - (r.slope * e.years_experience + r.intercept) AS residual,
  r.r_squared
FROM employees e
INNER JOIN regression_stats r ON e.department_id = r.department_id;

Percentile and Median Calculations

-- Compute percentiles with different methods
SELECT
  department_id,
  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS median_cont,
  PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) AS median_disc,
  PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) AS q1,
  PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) AS q3,
  PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary) AS p95,
  PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99
FROM employees
GROUP BY department_id;

Moving Aggregations

-- Complex moving window calculations
SELECT
  sale_date,
  revenue,
  -- 7-day moving average
  AVG(revenue) OVER w7 AS ma_7day,
  -- 30-day moving sum
  SUM(revenue) OVER w30 AS sum_30day,
  -- Moving standard deviation
  STDDEV(revenue) OVER w7 AS stddev_7day,
  -- Moving correlation
  CORR(revenue, units_sold) OVER w7 AS corr_7day,
  -- Moving linear regression
  REGR_SLOPE(revenue, units_sold) OVER w7 AS slope_7day
FROM daily_sales
WINDOW
  w7 AS (ORDER BY sale_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW),
  w30 AS (ORDER BY sale_date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW);

⚠️

Performance Note: Named window clauses (WINDOW clause) improve readability and ensure consistency. Define them once and reference throughout the query.

Ratio Calculations

-- Compute various ratios
SELECT
  department_id,
  employee_id,
  salary,
  -- Ratio to department average
  salary / AVG(salary) OVER (PARTITION BY department_id) AS ratio_to_avg,
  -- Ratio to max
  salary / MAX(salary) OVER (PARTITION BY department_id) AS ratio_to_max,
  -- Percentage of department total
  salary / SUM(salary) OVER (PARTITION BY department_id) * 100 AS pct_of_dept_total,
  -- Running ratio
  salary / SUM(salary) OVER (
    PARTITION BY department_id
    ORDER BY salary DESC
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) * 100 AS cumulative_pct
FROM employees;

Statistical Outlier Detection

-- Identify outliers using Z-score
WITH stats AS (
  SELECT
    department_id,
    AVG(salary) AS mean_salary,
    STDDEV(salary) AS stddev_salary
  FROM employees
  GROUP BY department_id
)
SELECT
  e.employee_id,
  e.department_id,
  e.salary,
  (e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0) AS z_score,
  CASE
    WHEN ABS((e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0)) > 3
      THEN 'OUTLIER'
    WHEN ABS((e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0)) > 2
      THEN 'FAR'
    ELSE 'NORMAL'
  END AS salary_classification
FROM employees e
INNER JOIN stats s ON e.department_id = s.department_id;

BigQuery Analytic Functions

-- BigQuery-specific analytic functions
SELECT
  user_id,
  event_date,
  -- Approximate count distinct
  APPROX_COUNT_DISTINCT(session_id) OVER w AS approx_sessions,
  -- Top values
  TOP_COUNT(event_type, 5) OVER w AS top_events,
  -- Quantiles
  QUANTILES(revenue, 10) OVER w AS deciles,
  -- Correlation
  CORR(revenue, session_duration) OVER w AS corr_rev_duration
FROM events
WINDOW w AS (PARTITION BY user_id ORDER BY event_date);

Frequency Distribution

-- Create frequency distribution
SELECT
  WIDTH_BUCKET(salary, 30000, 200000, 10) AS salary_bucket,
  COUNT(*) AS frequency,
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) AS pct,
  SUM(COUNT(*)) OVER (ORDER BY WIDTH_BUCKET(salary, 30000, 200000, 10)
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_freq
FROM employees
GROUP BY WIDTH_BUCKET(salary, 30000, 200000, 10)
ORDER BY salary_bucket;

Window Function with FILTER

-- Conditional analytic calculations
SELECT
  department_id,
  employee_id,
  salary,
  -- Average salary (active employees only)
  AVG(salary) FILTER (WHERE status = 'active')
    OVER (PARTITION BY department_id) AS avg_active_salary,
  -- Percent rank among senior employees
  PERCENT_RANK() OVER (
    PARTITION BY department_id
    ORDER BY salary
    FILTER (WHERE years_experience > 5)
  ) AS senior_percent_rank,
  -- Count of high performers
  COUNT(*) FILTER (WHERE performance_rating >= 4)
    OVER (PARTITION BY department_id) AS high_performer_count
FROM employees;

Covariance and Correlation Matrix

-- Compute correlation matrix for multiple variables
SELECT
  'salary' AS var1,
  'years_experience' AS var2,
  CORR(salary, years_experience) AS correlation
FROM employees
UNION ALL
SELECT
  'salary',
  'performance_rating',
  CORR(salary, performance_rating)
FROM employees
UNION ALL
SELECT
  'years_experience',
  'performance_rating',
  CORR(years_experience, performance_rating)
FROM employees;

Follow-Up Questions

What's the difference between PERCENTILE_CONT and PERCENTILE_DISC?
How would you compute a weighted moving average using window functions?
Explain the assumptions behind REGR_SLOPE and when they might be violated.
How do you handle NULL values in statistical aggregate functions?
What's the best approach for computing rolling percentiles?
How would you implement a Z-score normalization across partitioned data?