Advanced Analytic Functions
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
Statistical Aggregate Functions
-- Advanced statistical calculations
SELECT
department_id,
COUNT(*) AS n,
AVG(salary) AS mean,
STDDEV_POP(salary) AS std_dev_population,
STDDEV_SAMP(salary) AS std_dev_sample,
VAR_POP(salary) AS variance_population,
VAR_SAMP(salary) AS variance_sample,
CORR(salary, years_experience) AS correlation,
REGR_SLOPE(salary, years_experience) AS regression_slope,
REGR_INTERCEPT(salary, years_experience) AS regression_intercept,
REGR_R2(salary, years_experience) AS r_squared
FROM employees
GROUP BY department_id;
βΉοΈ
Key Insight: Use _POP for population statistics and _SAMP for sample statistics. CORR() returns the Pearson correlation coefficient (-1 to 1). REGR_SLOPE and REGR_INTERCEPT compute linear regression coefficients.
Linear Regression Analysis
-- Perform linear regression per department
WITH regression_stats AS (
SELECT
department_id,
REGR_SLOPE(salary, years_experience) AS slope,
REGR_INTERCEPT(salary, years_experience) AS intercept,
REGR_R2(salary, years_experience) AS r_squared
FROM employees
GROUP BY department_id
)
SELECT
e.employee_id,
e.department_id,
e.salary,
e.years_experience,
r.slope * e.years_experience + r.intercept AS predicted_salary,
e.salary - (r.slope * e.years_experience + r.intercept) AS residual,
r.r_squared
FROM employees e
INNER JOIN regression_stats r ON e.department_id = r.department_id;
Percentile and Median Calculations
-- Compute percentiles with different methods
SELECT
department_id,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS median_cont,
PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) AS median_disc,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) AS q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) AS q3,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary) AS p95,
PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99
FROM employees
GROUP BY department_id;
Moving Aggregations
-- Complex moving window calculations
SELECT
sale_date,
revenue,
-- 7-day moving average
AVG(revenue) OVER w7 AS ma_7day,
-- 30-day moving sum
SUM(revenue) OVER w30 AS sum_30day,
-- Moving standard deviation
STDDEV(revenue) OVER w7 AS stddev_7day,
-- Moving correlation
CORR(revenue, units_sold) OVER w7 AS corr_7day,
-- Moving linear regression
REGR_SLOPE(revenue, units_sold) OVER w7 AS slope_7day
FROM daily_sales
WINDOW
w7 AS (ORDER BY sale_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW),
w30 AS (ORDER BY sale_date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW);
β οΈ
Performance Note: Named window clauses (WINDOW clause) improve readability and ensure consistency. Define them once and reference throughout the query.
Ratio Calculations
-- Compute various ratios
SELECT
department_id,
employee_id,
salary,
-- Ratio to department average
salary / AVG(salary) OVER (PARTITION BY department_id) AS ratio_to_avg,
-- Ratio to max
salary / MAX(salary) OVER (PARTITION BY department_id) AS ratio_to_max,
-- Percentage of department total
salary / SUM(salary) OVER (PARTITION BY department_id) * 100 AS pct_of_dept_total,
-- Running ratio
salary / SUM(salary) OVER (
PARTITION BY department_id
ORDER BY salary DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) * 100 AS cumulative_pct
FROM employees;
Statistical Outlier Detection
-- Identify outliers using Z-score
WITH stats AS (
SELECT
department_id,
AVG(salary) AS mean_salary,
STDDEV(salary) AS stddev_salary
FROM employees
GROUP BY department_id
)
SELECT
e.employee_id,
e.department_id,
e.salary,
(e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0) AS z_score,
CASE
WHEN ABS((e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0)) > 3
THEN 'OUTLIER'
WHEN ABS((e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0)) > 2
THEN 'FAR'
ELSE 'NORMAL'
END AS salary_classification
FROM employees e
INNER JOIN stats s ON e.department_id = s.department_id;
BigQuery Analytic Functions
-- BigQuery-specific analytic functions
SELECT
user_id,
event_date,
-- Approximate count distinct
APPROX_COUNT_DISTINCT(session_id) OVER w AS approx_sessions,
-- Top values
TOP_COUNT(event_type, 5) OVER w AS top_events,
-- Quantiles
QUANTILES(revenue, 10) OVER w AS deciles,
-- Correlation
CORR(revenue, session_duration) OVER w AS corr_rev_duration
FROM events
WINDOW w AS (PARTITION BY user_id ORDER BY event_date);
Frequency Distribution
-- Create frequency distribution
SELECT
WIDTH_BUCKET(salary, 30000, 200000, 10) AS salary_bucket,
COUNT(*) AS frequency,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) AS pct,
SUM(COUNT(*)) OVER (ORDER BY WIDTH_BUCKET(salary, 30000, 200000, 10)
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_freq
FROM employees
GROUP BY WIDTH_BUCKET(salary, 30000, 200000, 10)
ORDER BY salary_bucket;
Window Function with FILTER
-- Conditional analytic calculations
SELECT
department_id,
employee_id,
salary,
-- Average salary (active employees only)
AVG(salary) FILTER (WHERE status = 'active')
OVER (PARTITION BY department_id) AS avg_active_salary,
-- Percent rank among senior employees
PERCENT_RANK() OVER (
PARTITION BY department_id
ORDER BY salary
FILTER (WHERE years_experience > 5)
) AS senior_percent_rank,
-- Count of high performers
COUNT(*) FILTER (WHERE performance_rating >= 4)
OVER (PARTITION BY department_id) AS high_performer_count
FROM employees;
Covariance and Correlation Matrix
-- Compute correlation matrix for multiple variables
SELECT
'salary' AS var1,
'years_experience' AS var2,
CORR(salary, years_experience) AS correlation
FROM employees
UNION ALL
SELECT
'salary',
'performance_rating',
CORR(salary, performance_rating)
FROM employees
UNION ALL
SELECT
'years_experience',
'performance_rating',
CORR(years_experience, performance_rating)
FROM employees;
Follow-Up Questions
- What's the difference between
PERCENTILE_CONTandPERCENTILE_DISC? - How would you compute a weighted moving average using window functions?
- Explain the assumptions behind
REGR_SLOPEand when they might be violated. - How do you handle NULL values in statistical aggregate functions?
- What's the best approach for computing rolling percentiles?
- How would you implement a Z-score normalization across partitioned data?