Bayesian Statistics
Bayesian methods update beliefs with evidence. Learn to build probabilistic models that quantify uncertainty and make principled decisions.
Bayesian Inference Fundamentals
Bayesian Inference with PyMC
import pymc as pm
import numpy as np
import arviz as az
# Simple Bayesian linear regression
np.random.seed(42)
n = 100
X = np.random.uniform(0, 10, n)
true_slope = 2.5
true_intercept = 1.0
y = true_slope * X + true_intercept + np.random.normal(0, 1, n)
with pm.Model() as linear_model:
# Priors
slope = pm.Normal("slope", mu=0, sigma=10)
intercept = pm.Normal("intercept", mu=0, sigma=10)
sigma = pm.HalfNormal("sigma", sigma=5)
# Likelihood
mu = slope * X + intercept
y_obs = pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y)
# Inference
trace = pm.sample(2000, tune=1000, cores=4, return_inferencedata=True)
# Analyze results
print(az.summary(trace, var_names=["slope", "intercept", "sigma"]))
# Posterior predictive checks
with linear_model:
ppc = pm.sample_posterior_predictive(trace)
az.plot_ppc(az.from_pymc(posterior_predictive=ppc, observed={"y_obs": y}))
Bayesian A/B Testing
import pymc as pm
import numpy as np
import arviz as az
def bayesian_ab_test(control_conversions, control_total,
treatment_conversions, treatment_total):
with pm.Model() as ab_model:
# Priors
p_control = pm.Beta("p_control", alpha=1, beta=1)
p_treatment = pm.Beta("p_treatment", alpha=1, beta=1)
# Likelihood
obs_control = pm.Binomial("obs_control", n=control_total, p=p_control,
observed=control_conversions)
obs_treatment = pm.Binomial("obs_treatment", n=treatment_total, p=p_treatment,
observed=treatment_conversions)
# Derived quantities
lift = pm.Deterministic("lift", (p_treatment - p_control) / p_control)
is_better = pm.Deterministic("is_better", pm.math.gt(p_treatment, p_control))
# Sample
trace = pm.sample(5000, tune=2000, cores=4, return_inferencedata=True)
# Results
summary = az.summary(trace, var_names=["p_control", "p_treatment", "lift", "is_better"])
prob_better = trace.posterior["is_better"].mean().item()
return {
"control_rate": control_conversions / control_total,
"treatment_rate": treatment_conversions / treatment_total,
"lift_mean": trace.posterior["lift"].mean().item(),
"lift_hdi": az.hdi(trace, var_names=["lift"])["lift"].values,
"prob_treatment_better": prob_better,
"summary": summary
}
# Example
results = bayesian_ab_test(
control_conversions=120, control_total=1000,
treatment_conversions=150, treatment_total=1000
)
print(f"Control rate: {results['control_rate']:.3f}")
print(f"Treatment rate: {results['treatment_rate']:.3f}")
print(f"Prob treatment better: {results['prob_treatment_better']:.3f}")
Hierarchical Models
import pymc as pm
import numpy as np
def hierarchical_model():
# Simulate grouped data (e.g., sales across stores)
n_groups = 20
n_per_group = 50
group_effects = np.random.normal(0, 2, n_groups)
group_ids = np.repeat(range(n_groups), n_per_group)
X = np.random.normal(0, 1, n_groups * n_per_group)
y = 3 * X + group_effects[group_ids] + np.random.normal(0, 1, n_groups * n_per_group)
with pm.Model() as hierarchical:
# Hyperpriors (global parameters)
mu_global = pm.Normal("mu_global", mu=0, sigma=5)
sigma_global = pm.HalfNormal("sigma_global", sigma=5)
# Group-level parameters (partially pooled)
group_offset = pm.Normal("group_offset", mu=0, sigma=1, shape=n_groups)
group_intercept = pm.Deterministic("group_intercept",
mu_global + sigma_global * group_offset)
# Shared slope
slope = pm.Normal("slope", mu=0, sigma=10)
# Observation noise
sigma = pm.HalfNormal("sigma", sigma=5)
# Expected value
mu = slope * X + group_intercept[group_ids]
# Likelihood
y_obs = pm.Normal("y_obs", mu=mu, sigma=sigma, observed=y)
trace = pm.sample(2000, tune=1000, cores=4)
return trace
trace = hierarchical_model()
# Compare group-level intercepts
group_means = trace.posterior["group_intercept"].mean(dim=["chain", "draw"]).values
print("Group intercepts:", group_means)
MCMC Diagnostics
import arviz as az
import matplotlib.pyplot as plt
def diagnose_mcmc(trace):
"""Comprehensive MCMC diagnostics"""
# 1. Trace plots
az.plot_trace(trace, var_names=["slope", "intercept"])
plt.tight_layout()
# 2. Autocorrelation
az.plot_autocorr(trace, var_names=["slope", "intercept"])
plt.tight_layout()
# 3. Rank plots (better than trace plots)
az.plot_rank(trace, var_names=["slope"])
# 4. R-hat (should be close to 1.0)
rhat = az.rhat(trace)
print("R-hat values:")
print(rhat)
# 5. Effective sample size
ess = az.ess(trace)
print("\nEffective Sample Size:")
print(ess)
# 6. Divergences
if hasattr(trace, 'sample_stats'):
divergences = trace.sample_stats.diverging.sum().item()
print(f"\nDivergences: {divergences}")
# 7. WAIC/LOO for model comparison
# waic = az.waic(trace)
# loo = az.loo(trace)
return {
'rhat': rhat,
'ess': ess,
'converged': all(r.values < 1.05 for r in rhat.values())
}
diagnostics = diagnose_mcmc(trace)
print(f"\nConverged: {diagnostics['converged']}")
Bayesian Time Series
import pymc as pm
import numpy as np
def bayesian_structural_time_series(y):
"""Bayesian structural time series model"""
n = len(y)
t = np.arange(n)
with pm.Model() as sts_model:
# Local linear trend
level_sigma = pm.HalfNormal("level_sigma", sigma=0.1)
slope_sigma = pm.HalfNormal("slope_sigma", sigma=0.01)
# Random walk priors
level_innovations = pm.Normal("level_innovations", mu=0, sigma=1, shape=n)
slope_innovations = pm.Normal("slope_innovations", mu=0, sigma=1, shape=n)
level = pm.math.cumsum(level_sigma * level_innovations)
slope = pm.math.cumsum(slope_sigma * slope_innovations)
# Observation noise
obs_sigma = pm.HalfNormal("obs_sigma", sigma=1)
# Expected value
mu = level + slope * t
# Likelihood
y_obs = pm.Normal("y_obs", mu=mu, sigma=obs_sigma, observed=y)
trace = pm.sample(2000, tune=1000, cores=4)
return trace
# Example
t = np.linspace(0, 10, 200)
y = 2 * t + 5 * np.sin(t) + np.random.normal(0, 0.5, 200)
trace = bayesian_structural_time_series(y)
Model Comparison
import pymc as pm
import arviz as az
def compare_models(data):
"""Compare multiple models using WAIC"""
with pm.Model() as linear:
slope = pm.Normal("slope", mu=0, sigma=10)
intercept = pm.Normal("intercept", mu=0, sigma=10)
sigma = pm.HalfNormal("sigma", sigma=5)
mu = slope * data['x'] + intercept
y_obs = pm.Normal("y_obs", mu=mu, sigma=sigma, observed=data['y'])
trace_linear = pm.sample(1000, return_inferencedata=True)
with pm.Model() as quadratic:
b1 = pm.Normal("b1", mu=0, sigma=10)
b2 = pm.Normal("b2", mu=0, sigma=10)
intercept = pm.Normal("intercept", mu=0, sigma=10)
sigma = pm.HalfNormal("sigma", sigma=5)
mu = b1 * data['x'] + b2 * data['x']**2 + intercept
y_obs = pm.Normal("y_obs", mu=mu, sigma=sigma, observed=data['y'])
trace_quad = pm.sample(1000, return_inferencedata=True)
# Compare
comp = az.compare({
"linear": trace_linear,
"quadratic": trace_quad
}, ic="waic")
print(comp)
az.plot_compare(comp)
return comp
Best Practices
- Start with informative priors when domain knowledge exists
- Check MCMC diagnostics – R-hat, ESS, divergences
- Use hierarchical models for grouped data (partial pooling)
- Compare models with WAIC/LOO, not just posterior means
- Report full posterior distributions, not just point estimates