Survey Sampling and Weights

Real-world data rarely comes from simple random samples. Surveys use stratification, clustering, and unequal selection probabilities. Ignoring survey design produces biased estimates and wrong standard errors. Survey methods correct for this.

Why Survey Methods Matter

Census data, health surveys, election polls – they all use complex sampling designs. A simple average of survey respondents is biased if some groups are oversampled. Survey weights restore representativeness.

import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.weightstats import DescrStatsW
import warnings
warnings.filterwarnings('ignore')

Survey Design Concepts

np.random.seed(42)
n = 2000

# Simulate a stratified, clustered survey
strata = np.random.choice(['urban', 'suburban', 'rural'], n, p=[0.4, 0.35, 0.25])
clusters = np.array([f'{s}_{i}' for s, i in zip(strata, np.random.randint(0, 20, n))])

# True population (unequal selection probabilities)
true_income = np.random.lognormal(10.5, 0.8, n)
true_age = np.random.normal(45, 15, n).clip(18, 90)
true_education = np.random.choice(['HS', 'College', 'Graduate'], n, p=[0.3, 0.5, 0.2])

# Sampling probability varies by stratum (oversample rural)
sampling_prob = np.where(strata == 'rural', 0.3, np.where(strata == 'urban', 0.1, 0.15))

# Survey response (not everyone responds)
response_prob = 0.5 + 0.2 * (true_income > np.median(true_income)) + np.random.normal(0, 0.1, n)
responded = np.random.binomial(1, np.clip(response_prob, 0.1, 0.9))

survey_df = pd.DataFrame({
    'stratum': strata,
    'cluster': clusters,
    'income': true_income + np.random.normal(0, 5000, n),
    'age': true_age + np.random.normal(0, 2, n),
    'education': true_education,
    'sampling_prob': sampling_prob,
    'responded': responded
})

# Observed sample (non-response)
sample = survey_df[survey_df['responded'] == 1].copy()
print(f"Population: {len(survey_df)}, Sample: {len(sample)}")
print(f"Response rate: {sample.shape[0] / survey_df.shape[0]:.1%}")

Sampling Weights

The sampling weight is the inverse of the selection probability, adjusted for non-response.

# Base sampling weight
sample['base_weight'] = 1 / sample['sampling_prob']

# Adjust for non-response (post-stratification)
# Assume known population totals by stratum
pop_totals = {'urban': 400, 'suburban': 350, 'rural': 250}
sample_totals = sample.groupby('stratum').size()

# Raking / iterative proportional fitting
def raking_adjustment(sample_df, pop_totals, max_iter=50, tol=1e-6):
    """Iterative proportional fitting (raking)."""
    weights = sample_df['base_weight'].copy()
    
    for iteration in range(max_iter):
        old_weights = weights.copy()
        
        for col, totals in pop_totals.items():
            # Current weighted totals
            current_totals = sample_df.groupby(col).apply(
                lambda x: weights[x.index].sum()
            )
            
            # Adjustment factors
            adjustment = pd.Series(totals) / current_totals
            weights *= sample_df[col].map(adjustment)
        
        # Check convergence
        if np.abs(weights - old_weights).max() < tol:
            print(f"Raking converged after {iteration + 1} iterations")
            break
    
    return weights

sample['survey_weight'] = raking_adjustment(sample, pop_totals)

# Normalize weights to sum to sample size
sample['survey_weight'] *= len(sample) / sample['survey_weight'].sum()

print(f"Weight range: {sample['survey_weight'].min():.2f} to {sample['survey_weight'].max():.2f}")
print(f"Weight mean: {sample['survey_weight'].mean():.2f}")

Weighted Descriptive Statistics

# Weighted mean and standard error
desc_stats = DescrStatsW(sample['income'], weights=sample['survey_weight'])
print(f"Weighted mean income: ${desc_stats.mean:,.0f}")
print(f"Weighted std error: ${desc_stats.std:,.0f}")
print(f"95% CI: ${desc_stats.conf_int()[0]:,.0f} to ${desc_stats.conf_int()[1]:,.0f}")

# Unweighted (biased) comparison
print(f"\nUnweighted mean: ${sample['income'].mean():,.0f}")
print(f"Difference: ${desc_stats.mean - sample['income'].mean():,.0f}")

Design Effects

The design effect (DEFF) measures how much less efficient a complex design is compared to simple random sampling.

def design_effect(weights):
    """Compute design effect from weights."""
    n = len(weights)
    return (weights.sum() ** 2) / (n * (weights ** 2).sum())

deff = design_effect(sample['survey_weight'])
print(f"Design effect: {deff:.2f}")
print(f"Effective sample size: {len(sample) / deff:.0f}")
print(f"Interpretation: Complex design is {deff:.1f}x less efficient than SRS")

Stratified Analysis

# Weighted estimates by stratum
stratum_stats = sample.groupby('stratum').apply(
    lambda x: pd.Series({
        'n': len(x),
        'weighted_mean': np.average(x['income'], weights=x['survey_weight']),
        'weighted_se': np.sqrt(np.average((x['income'] - np.average(x['income'], weights=x['survey_weight'])) ** 2, weights=x['survey_weight']) / x['survey_weight'].sum()),
        'deff': design_effect(x['survey_weight'])
    })
).round(2)

print("Stratum-level statistics:")
print(stratum_stats)

Weighted Regression

# Weighted OLS
X = sm.add_constant(pd.get_dummies(sample[['age', 'education']], drop_first=True))
y = sample['income']

# Survey-weighted regression
model = sm.WLS(y, X, weights=sample['survey_weight']).fit()
print("Weighted Regression Results:")
print(model.summary().tables[1])

# Compare with unweighted
model_unweighted = sm.OLS(y, X).fit()
print(f"\nWeighted R²: {model.rsquared:.4f}")
print(f"Unweighted R²: {model_unweighted.rsquared:.4f}")

Multiple Imputation for Surveys

# Rubin's combining rules for multiply imputed data
def rubin_combine(estimates, variances):
    """Combine estimates from multiple imputations."""
    m = len(estimates)
    Q_bar = np.mean(estimates)
    U_bar = np.mean(variances)
    B = np.var(estimates, ddof=1)
    
    total_var = U_bar + (1 + 1/m) * B
    total_se = np.sqrt(total_var)
    
    # Degrees of freedom (Barnard-Rubin)
    df = (m - 1) * (1 + U_bar / ((1 + 1/m) * B)) ** 2
    
    return Q_bar, total_se, df

# Simulate multiple imputations
n_imputations = 5
estimates = []
variances = []

for i in range(n_imputations):
    # Add noise to income (simulating imputation)
    sample[f'income_imputed_{i}'] = sample['income'] + np.random.normal(0, 1000, len(sample))
    
    model_imp = sm.WLS(
        sample[f'income_imputed_{i}'],
        X, weights=sample['survey_weight']
    ).fit()
    
    estimates.append(model_imp.params['age'])
    variances.append(model_imp.bse['age'] ** 2)

# Combine
combined_est, combined_se, df = rubin_combine(estimates, variances)
print(f"Combined estimate: {combined_est:.4f}")
print(f"Combined SE: {combined_se:.4f}")
print(f"Degrees of freedom: {df:.1f}")

Cluster-Robust Standard Errors

# Cluster-robust standard errors for surveys
from statsmodels.stats.sandwich_covariance import cov_cluster

model_cluster = sm.WLS(y, X, weights=sample['survey_weight']).fit()

# Get cluster IDs
cluster_ids = sample['cluster'].values
unique_clusters = np.unique(cluster_ids)

# Manual cluster-robust SE
X_arr = X.values
y_arr = y.values
w = sample['survey_weight'].values

# Weighted residuals
resid = model_cluster.resid.values
bread = np.linalg.inv(X_arr.T @ np.diag(w) @ X_arr)

# Cluster-robust meat
meat = np.zeros((X_arr.shape[1], X_arr.shape[1]))
for c in unique_clusters:
    mask = cluster_ids == c
    score = (X_arr[mask].T @ np.diag(w[mask]) @ resid[mask]).reshape(-1, 1)
    meat += score @ score.T

V_cluster = bread @ meat @ bread
se_cluster = np.sqrt(np.diag(V_cluster))

print("Comparison of standard errors:")
print(f"Model-based SE: {model_cluster.bse['age']:.4f}")
print(f"Cluster-robust SE: {se_cluster[1]:.4f}")
print(f"Ratio: {se_cluster[1] / model_cluster.bse['age']:.2f}")

Post-Stratification

# Known population totals for post-stratification
population_totals = {
    ('urban', 'College'): 150,
    ('urban', 'HS'): 100,
    ('urban', 'Graduate'): 80,
    ('suburban', 'College'): 120,
    ('suburban', 'HS'): 80,
    ('suburban', 'Graduate'): 50,
    ('rural', 'College'): 70,
    ('rural', 'HS'): 80,
    ('rural', 'Graduate'): 30
}

# Post-stratification weights
sample['cell'] = list(zip(sample['stratum'], sample['education']))
sample['cell_count'] = sample['cell'].map(population_totals)
sample['poststrat_weight'] = sample['cell_count'] / sample.groupby('cell').cumcount() - 1

# Weighted estimate
poststrat_mean = np.average(sample['income'], weights=sample['poststrat_weight'].clip(lower=1))
print(f"Post-stratified mean income: ${poststrat_mean:,.0f}")

Best Practices

Always use survey weights – unweighted estimates are biased
Account for design effects – complex designs need larger sample sizes
Use survey packages – survey in R, statsmodels for survey regression
Check non-response bias – compare respondents to known population totals
Report weighted and unweighted – transparency about design impact
Combine with multiple imputation – handle missing data properly in surveys

Summary

Survey sampling methods ensure representative inference from complex data. Sampling weights correct for unequal selection, design effects quantify efficiency loss, and weighted regression produces valid estimates. Master these tools for working with any survey or census data.