Survival Analysis in Healthcare

Healthcare generates more survival data than any other field – time to death, time to readmission, time to relapse, progression-free survival. Mastering survival analysis for healthcare applications is essential for medical data scientists.

Healthcare Survival Data

Clinical data has unique characteristics: heavy censoring, competing risks, time-varying treatments, and strict regulatory requirements.

import pandas as pd
import numpy as np
from lifelines import (
    KaplanMeierFitter, CoxPHFitter, WeibullAFTFitter,
    LogNormalAFTFitter
)
from lifelines.statistics import logrank_test
import warnings
warnings.filterwarnings('ignore')

Simulating Clinical Trial Data

np.random.seed(42)
n = 500

# Patient characteristics
age = np.random.normal(65, 10, n).clip(40, 90)
sex = np.random.binomial(1, 0.55, n)
stage = np.random.choice([1, 2, 3, 4], n, p=[0.15, 0.3, 0.35, 0.2])
treatment = np.random.binomial(1, 0.5, n)
biomarker = np.random.normal(5, 2, n)

# Survival times (Weibull distribution)
shape = 1.2
scale = np.exp(
    3.5
    - 0.02 * (age - 65)
    + 0.3 * sex
    - 0.5 * stage
    - 0.4 * treatment
    - 0.1 * biomarker
)

survival_time = np.random.weibull(shape, n) * scale

# Censoring (administrative + lost to follow-up)
censor_time = np.random.uniform(12, 60, n)
lost_to_followup = np.random.binomial(1, 0.1, n)
event_time = np.minimum(survival_time, censor_time)
event_observed = ((survival_time <= censor_time) & (lost_to_followup == 0)).astype(int)

# Competing risk: death from other causes
other_cause_time = np.random.exponential(80, n)
competing_event = (other_cause_time < survival_time) & (other_cause_time < censor_time)
event_observed = np.where(competing_event, 2, event_observed)  # 2 = competing event

clinical_df = pd.DataFrame({
    'patient_id': range(1, n + 1),
    'time': event_time,
    'event': event_observed,
    'age': age,
    'sex': sex,
    'stage': stage,
    'treatment': treatment,
    'biomarker': biomarker
})

print(f"Clinical trial: {n} patients")
print(f"Events: {(clinical_df['event'] == 1).sum()}, Competing: {(clinical_df['event'] == 2).sum()}, Censored: {(clinical_df['event'] == 0).sum()}")

Kaplan-Meier Analysis by Treatment Arm

# Treatment vs control
treatment_arm = clinical_df[clinical_df['treatment'] == 1]
control_arm = clinical_df[clinical_df['treatment'] == 0]

kmf_treat = KaplanMeierFitter()
kmf_control = KaplanMeierFitter()

kmf_treat.fit(treatment_arm['time'], treatment_arm['event'] == 1, label='Treatment')
kmf_control.fit(control_arm['time'], control_arm['event'] == 1, label='Control')

# Log-rank test
result = logrank_test(
    treatment_arm['time'], control_arm['time'],
    event_observed_A=(treatment_arm['event'] == 1).astype(int),
    event_observed_B=(control_arm['event'] == 1).astype(int)
)

print(f"Treatment median survival: {kmf_treat.median_survival_time_:.1f} months")
print(f"Control median survival: {kmf_control.median_survival_time_:.1f} months")
print(f"Log-rank p-value: {result.p_value:.4f}")
print(f"Hazard ratio estimate: {result.test_statistic:.3f}")

Cox Regression for Clinical Data

# Prepare data for Cox model (cause-specific for primary event)
clinical_cox = clinical_df.copy()
clinical_cox['event_binary'] = (clinical_cox['event'] == 1).astype(int)

cph = CoxPHFitter()
cph.fit(
    clinical_cox[['time', 'event_binary', 'age', 'sex', 'stage', 'treatment', 'biomarker']],
    duration_col='time',
    event_col='event_binary'
)
cph.print_summary()

# Hazard ratios with confidence intervals
print("\nHazard Ratios:")
print(cph.hazard_ratios_)

Cumulative Incidence Function

For competing risks, Kaplan-Meier overestimates the cause-specific hazard.

def cumulative_incidence(times, events, cause):
    """Compute cumulative incidence for a specific cause."""
    unique_times = np.sort(np.unique(times[events > 0]))
    n_at_risk = np.zeros(len(unique_times))
    cumulative = np.zeros(len(unique_times))
    
    n_total = len(times)
    
    for i, t in enumerate(unique_times):
        at_risk = (times >= t).sum()
        n_at_risk[i] = at_risk
        
        events_at_t = ((times == t) & (events == cause)).sum()
        censored_at_t = ((times == t) & (events == 0)).sum()
        
        if i == 0:
            cumulative[i] = events_at_t / n_total
        else:
            cumulative[i] = cumulative[i-1] + (1 - cumulative[i-1]) * events_at_t / at_risk
    
    return unique_times, cumulative

# Primary cause (cancer death)
times_cause1, cuminc1 = cumulative_incidence(
    clinical_df['time'].values,
    clinical_df['event'].values,
    cause=1
)

# Competing cause (other death)
times_cause2, cuminc2 = cumulative_incidence(
    clinical_df['time'].values,
    clinical_df['event'].values,
    cause=2
)

print(f"5-year cumulative incidence (cancer): {cuminc1[np.searchsorted(times_cause1, 60)]:.3f}")
print(f"5-year cumulative incidence (other): {cuminc2[np.searchsorted(times_cause2, 60)]:.3f}")

Accelerated Failure Time Models

AFT models directly model survival time rather than the hazard.

# Weibull AFT model
aft = WeibullAFTFitter()
aft.fit(
    clinical_cox[['time', 'event_binary', 'age', 'sex', 'stage', 'treatment', 'biomarker']],
    duration_col='time',
    event_col='event_binary'
)
aft.print_summary()

# Predict median survival for different patients
new_patients = pd.DataFrame({
    'age': [55, 70, 80],
    'sex': [0, 1, 1],
    'stage': [2, 3, 4],
    'treatment': [1, 0, 1],
    'biomarker': [6, 4, 3]
})

median_survival = aft.predict_median(new_patients)
print("\nPredicted median survival (months):")
print(median_survival)

Time-Varying Covariates

Treatment changes over time in real clinical settings.

# Simulate time-varying treatment
tv_rows = []
for _, row in clinical_df.iterrows():
    patient_id = row['patient_id']
    
    # Baseline
    tv_rows.append({
        'patient_id': patient_id,
        'start': 0,
        'stop': 6,
        'treatment': 0,  # starts on control
        'age': row['age'],
        'stage': row['stage']
    })
    
    # Some patients switch treatment
    if np.random.random() < 0.3:
        tv_rows.append({
            'patient_id': patient_id,
            'start': 6,
            'stop': row['time'],
            'treatment': 1,  # switches to treatment
            'age': row['age'],
            'stage': row['stage']
        })
    else:
        tv_rows[-1]['stop'] = row['time']
        tv_rows[-1]['event'] = (row['event'] == 1).astype(int)

tv_df = pd.DataFrame(tv_rows)
tv_df['event'] = tv_df.get('event', 0)

print(f"Time-varying dataset: {tv_df.shape}")
print(f"Patients with treatment switch: {tv_df['patient_id'].nunique() - len(clinical_df)}")

Model Validation: C-Index and Calibration

from lifelines.utils import concordance_index

# Concordance index
c_index = concordance_index(
    clinical_cox['time'],
    -cph.predict_partial_hazard(clinical_cox).values.flatten(),
    clinical_cox['event_binary']
)
print(f"C-index: {c_index:.4f}")

# Calibration: compare predicted vs observed
clinical_cox['predicted_median'] = cph.predict_median(clinical_cox)
clinical_cox['risk_group'] = pd.qcut(
    clinical_cox['predicted_median'], q=4, labels=['low', 'medium', 'high', 'very_high']
)

calibration = clinical_cox.groupby('risk_group').apply(
    lambda x: pd.Series({
        'n': len(x),
        'predicted_survival': x['predicted_median'].mean(),
        'observed_survival': x[x['event_binary'] == 0]['time'].mean()
    })
)
print("\nCalibration by risk group:")
print(calibration)

Clinical Trial Design

# Sample size calculation for survival trials
def sample_size_survival(hazard_ratio, alpha=0.05, power=0.80, event_rate=0.5):
    """Calculate sample size for two-arm survival trial."""
    from scipy.stats import norm
    
    z_alpha = norm.ppf(1 - alpha/2)
    z_beta = norm.ppf(power)
    
    events_needed = ((z_alpha + z_beta) ** 2 * (1 + hazard_ratio) ** 2) / \
                    ((1 - hazard_ratio) ** 2 * np.log(hazard_ratio) ** 2)
    
    n_per_arm = events_needed / event_rate
    return int(np.ceil(n_per_arm * 2))

# Calculate for different hazard ratios
for hr in [0.7, 0.75, 0.8, 0.85]:
    n = sample_size_survival(hr, event_rate=0.6)
    print(f"HR={hr}: {n} patients needed")

Best Practices

Use cause-specific hazards – for competing risks in clinical data
Report C-index and calibration – discrimination alone isn't enough
Check proportional hazards – violations require stratified models
Account for informative censoring – dropouts may differ from completers
Use AFT models – when the time scale is of direct interest
Validate externally – internal validation overfits; use independent cohorts

Summary

Healthcare survival analysis requires handling competing risks, time-varying treatments, and clinical validation. Master Cox regression, AFT models, cumulative incidence, and trial design to build models that clinicians trust and patients benefit from.