Probability, Bayes' Theorem and PDF/CDF

Why Probability Matters

Probability is the mathematics of uncertainty. In data science, we use probability to quantify uncertainty in predictions, make inferences about populations from samples, build probabilistic models, and test hypotheses.

Fundamental Concepts

Sample Space and Events

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Sample space: All possible outcomes
# Event: A subset of the sample space

# Example: Rolling two dice
sample_space = {(i, j) for i in range(1, 7) for j in range(1, 7)}
print(f"Sample space size: {len(sample_space)}")  # 36

# Event: Sum equals 7
event_sum_7 = {(i, j) for i, j in sample_space if i + j == 7}
print(f"Event sum=7: {event_sum_7}")
print(f"Probability: {len(event_sum_7)/len(sample_space):.4f}")  # 6/36 = 0.1667

Basic Probability Rules

# Verify addition rule
p_red = 26/52
p_king = 4/52
p_red_and_king = 2/52
p_red_or_king = p_red + p_king - p_red_and_king
print(f"P(Red or King): {p_red_or_king:.4f}")

Probability Distributions

Discrete Distributions

# 1. Bernoulli Distribution (single trial)
p = 0.3
bernoulli = stats.bernoulli(p)
print(f"Bernoulli(p={p}):")
print(f"  P(X=0): {bernoulli.pmf(0):.2f}")
print(f"  P(X=1): {bernoulli.pmf(1):.2f}")
print(f"  Mean: {bernoulli.mean():.2f}")
print(f"  Variance: {bernoulli.var():.2f}")

# 2. Binomial Distribution (n trials)
n, p = 10, 0.5
binomial = stats.binom(n, p)
x = np.arange(0, n+1)

plt.figure(figsize=(10, 5))
plt.bar(x, binomial.pmf(x), color='skyblue', edgecolor='black')
plt.title(f'Binomial Distribution (n={n}, p={p})')
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.show()

# 3. Poisson Distribution (events per interval)
lambda_param = 5
poisson = stats.poisson(lambda_param)
x = np.arange(0, 15)

plt.figure(figsize=(10, 5))
plt.bar(x, poisson.pmf(x), color='lightgreen', edgecolor='black')
plt.title(f'Poisson Distribution (λ={lambda_param})')
plt.xlabel('Number of Events')
plt.ylabel('Probability')
plt.show()

Continuous Distributions

# 1. Normal Distribution (Gaussian)
mu, sigma = 0, 1
normal = stats.norm(mu, sigma)
x = np.linspace(-4, 4, 1000)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].plot(x, normal.pdf(x), 'b-', linewidth=2)
axes[0].fill_between(x, normal.pdf(x), alpha=0.3)
axes[0].set_title('Normal Distribution PDF')

axes[1].plot(x, normal.cdf(x), 'r-', linewidth=2)
axes[1].set_title('Normal Distribution CDF')
plt.tight_layout()
plt.show()

# 2. Exponential Distribution
lambda_param = 1
exponential = stats.expon(scale=1/lambda_param)
x = np.linspace(0, 5, 1000)

plt.figure(figsize=(10, 5))
plt.plot(x, exponential.pdf(x), 'r-', linewidth=2)
plt.fill_between(x, exponential.pdf(x), alpha=0.3)
plt.title(f'Exponential Distribution (λ={lambda_param})')
plt.show()

# 3. Beta Distribution
alpha, beta_param = 2, 5
beta = stats.beta(alpha, beta_param)
x = np.linspace(0, 1, 1000)

plt.figure(figsize=(10, 5))
plt.plot(x, beta.pdf(x), 'g-', linewidth=2)
plt.fill_between(x, beta.pdf(x), alpha=0.3)
plt.title(f'Beta Distribution (α={alpha}, β={beta_param})')
plt.show()

PDF vs CDF

x = np.linspace(-4, 4, 1000)
pdf = stats.norm.pdf(x)
cdf = stats.norm.cdf(x)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].plot(x, pdf, 'b-', linewidth=2)
axes[0].fill_between(x, pdf, where=(x >= -1) & (x <= 1), alpha=0.5, label='P(-1 ≤ X ≤ 1)')
axes[0].set_title('PDF: P(-1 ≤ X ≤ 1) ≈ 0.68')
axes[0].legend()

axes[1].plot(x, cdf, 'r-', linewidth=2)
axes[1].axhline(y=0.84, color='g', linestyle='--', label='P(X ≤ 1)')
axes[1].axvline(x=1, color='g', linestyle='--')
axes[1].set_title('CDF: P(X ≤ 1)')
axes[1].legend()
plt.tight_layout()
plt.show()

# Calculating probabilities
p_between = stats.norm.cdf(1) - stats.norm.cdf(-1)
print(f"P(-1 ≤ X ≤ 1): {p_between:.4f}")

p_greater = 1 - stats.norm.cdf(2)
print(f"P(X > 2): {p_greater:.4f}")

Bayes' Theorem

Intuitive Understanding

Example: Medical Testing

# Medical test example
p_disease = 0.01
p_positive_given_disease = 0.99
p_positive_given_healthy = 0.05  # False positive rate

# P(Positive) = P(Positive|Disease)×P(Disease) + P(Positive|Healthy)×P(Healthy)
p_positive = (p_positive_given_disease * p_disease + 
              p_positive_given_healthy * (1 - p_disease))

# Apply Bayes' theorem
p_disease_given_positive = (p_positive_given_disease * p_disease) / p_positive

print("Medical Test Analysis:")
print(f"P(Disease): {p_disease:.2%}")
print(f"P(Positive|Disease): {p_positive_given_disease:.2%}")
print(f"P(Positive|Healthy): {p_positive_given_healthy:.2%}")
print(f"P(Disease|Positive): {p_disease_given_positive:.2%}")

Naive Bayes Classification

# Spam classification
p_spam = 0.3
p_ham = 0.7

likelihoods = {
    'free': {'spam': 0.8, 'ham': 0.1},
    'money': {'spam': 0.7, 'ham': 0.05},
    'meeting': {'spam': 0.1, 'ham': 0.6},
    'project': {'spam': 0.2, 'ham': 0.5}
}

words = ['free', 'money', 'meeting']

p_words_given_spam = np.prod([likelihoods[w]['spam'] for w in words])
p_words_given_ham = np.prod([likelihoods[w]['ham'] for w in words])

p_words = p_words_given_spam * p_spam + p_words_given_ham * p_ham
p_spam_given_words = (p_words_given_spam * p_spam) / p_words

print(f"P(Spam|Words): {p_spam_given_words:.4f}")
print(f"Classification: {'Spam' if p_spam_given_words > 0.5 else 'Ham'}")

Central Limit Theorem (CLT)

# Demonstrate CLT with different distributions
np.random.seed(42)

distributions = {
    'Uniform': np.random.uniform(0, 10, 10000),
    'Exponential': np.random.exponential(2, 10000),
    'Chi-Square': np.random.chisquare(2, 10000),
    'Bimodal': np.concatenate([np.random.normal(-2, 1, 5000), 
                                np.random.normal(2, 1, 5000)])
}

sample_sizes = [1, 5, 10, 30]

fig, axes = plt.subplots(len(distributions), len(sample_sizes), figsize=(16, 12))

for i, (dist_name, data) in enumerate(distributions.items()):
    for j, n in enumerate(sample_sizes):
        sample_means = [np.mean(np.random.choice(data, n)) for _ in range(1000)]
        axes[i, j].hist(sample_means, bins=30, density=True, alpha=0.7, edgecolor='black')
        axes[i, j].set_title(f'{dist_name}\nn={n}')

plt.suptitle('Central Limit Theorem Demonstration', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Practical Example: A/B Testing with Bayesian Analysis

np.random.seed(42)

# A/B test data
n_control, conversions_control = 1000, 120
n_treatment, conversions_treatment = 1000, 150

# Bayesian approach with Beta prior
prior_a, prior_b = 1, 1

posterior_control = stats.beta(conversions_control + prior_a,
                               n_control - conversions_control + prior_b)
posterior_treatment = stats.beta(conversions_treatment + prior_a,
                                 n_treatment - conversions_treatment + prior_b)

n_samples = 10000
samples_control = posterior_control.rvs(n_samples)
samples_treatment = posterior_treatment.rvs(n_samples)

prob_b_better = np.mean(samples_treatment > samples_control)
lift = (samples_treatment - samples_control) / samples_control

print(f"P(B > A): {prob_b_better:.2%}")
print(f"Expected Lift: {np.mean(lift)*100:.1f}% ± {np.std(lift)*100:.1f}%")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

x = np.linspace(0.08, 0.20, 1000)
axes[0].plot(x, posterior_control.pdf(x), 'b-', label='Control', linewidth=2)
axes[0].plot(x, posterior_treatment.pdf(x), 'r-', label='Treatment', linewidth=2)
axes[0].fill_between(x, posterior_control.pdf(x), alpha=0.3)
axes[0].fill_between(x, posterior_treatment.pdf(x), alpha=0.3)
axes[0].set_title('Posterior Distributions')
axes[0].legend()

axes[1].hist(lift * 100, bins=50, density=True, alpha=0.7, edgecolor='black')
axes[1].axvline(x=0, color='r', linestyle='--', label='No Effect')
axes[1].axvline(x=np.mean(lift)*100, color='g', linestyle='-',
                label=f'Mean: {np.mean(lift)*100:.1f}%')
axes[1].set_title('Lift Distribution (B vs A)')
axes[1].legend()
plt.tight_layout()
plt.show()

Key Takeaways

Practice Exercise

Calculate the probability of getting exactly 3 heads in 5 fair coin flips using the Binomial distribution
Apply Bayes' Theorem to a drug testing scenario: 2% prevalence, 95% sensitivity, 90% specificity
Demonstrate CLT by sampling from an exponential distribution with n = 2, 10, 30, 100
Build a Bayesian A/B testing framework that computes P(B > A) for different sample sizes
For a Poisson process with λ = 3 events/hour, compute the probability of observing 0-5+ events
Compare the PDF and CDF of the normal distribution visually
Verify that P(Μ - σ ≤ X ≤ Μ + σ) ≈ 0.6827 for a standard normal