Survival Analysis in Healthcare
Healthcare generates more survival data than any other field β time to death, time to readmission, time to relapse, progression-free survival. Mastering survival analysis for healthcare applications is essential for medical data scientists.
Healthcare Survival Data
Clinical data has unique characteristics: heavy censoring, competing risks, time-varying treatments, and strict regulatory requirements.
import pandas as pd
import numpy as np
from lifelines import (
KaplanMeierFitter, CoxPHFitter, WeibullAFTFitter,
LogNormalAFTFitter
)
from lifelines.statistics import logrank_test
import warnings
warnings.filterwarnings('ignore')
Simulating Clinical Trial Data
np.random.seed(42)
n = 500
# Patient characteristics
age = np.random.normal(65, 10, n).clip(40, 90)
sex = np.random.binomial(1, 0.55, n)
stage = np.random.choice([1, 2, 3, 4], n, p=[0.15, 0.3, 0.35, 0.2])
treatment = np.random.binomial(1, 0.5, n)
biomarker = np.random.normal(5, 2, n)
# Survival times (Weibull distribution)
shape = 1.2
scale = np.exp(
3.5
- 0.02 * (age - 65)
+ 0.3 * sex
- 0.5 * stage
- 0.4 * treatment
- 0.1 * biomarker
)
survival_time = np.random.weibull(shape, n) * scale
# Censoring (administrative + lost to follow-up)
censor_time = np.random.uniform(12, 60, n)
lost_to_followup = np.random.binomial(1, 0.1, n)
event_time = np.minimum(survival_time, censor_time)
event_observed = ((survival_time <= censor_time) & (lost_to_followup == 0)).astype(int)
# Competing risk: death from other causes
other_cause_time = np.random.exponential(80, n)
competing_event = (other_cause_time < survival_time) & (other_cause_time < censor_time)
event_observed = np.where(competing_event, 2, event_observed) # 2 = competing event
clinical_df = pd.DataFrame({
'patient_id': range(1, n + 1),
'time': event_time,
'event': event_observed,
'age': age,
'sex': sex,
'stage': stage,
'treatment': treatment,
'biomarker': biomarker
})
print(f"Clinical trial: {n} patients")
print(f"Events: {(clinical_df['event'] == 1).sum()}, Competing: {(clinical_df['event'] == 2).sum()}, Censored: {(clinical_df['event'] == 0).sum()}")
Kaplan-Meier Analysis by Treatment Arm
# Treatment vs control
treatment_arm = clinical_df[clinical_df['treatment'] == 1]
control_arm = clinical_df[clinical_df['treatment'] == 0]
kmf_treat = KaplanMeierFitter()
kmf_control = KaplanMeierFitter()
kmf_treat.fit(treatment_arm['time'], treatment_arm['event'] == 1, label='Treatment')
kmf_control.fit(control_arm['time'], control_arm['event'] == 1, label='Control')
# Log-rank test
result = logrank_test(
treatment_arm['time'], control_arm['time'],
event_observed_A=(treatment_arm['event'] == 1).astype(int),
event_observed_B=(control_arm['event'] == 1).astype(int)
)
print(f"Treatment median survival: {kmf_treat.median_survival_time_:.1f} months")
print(f"Control median survival: {kmf_control.median_survival_time_:.1f} months")
print(f"Log-rank p-value: {result.p_value:.4f}")
print(f"Hazard ratio estimate: {result.test_statistic:.3f}")
Cox Regression for Clinical Data
# Prepare data for Cox model (cause-specific for primary event)
clinical_cox = clinical_df.copy()
clinical_cox['event_binary'] = (clinical_cox['event'] == 1).astype(int)
cph = CoxPHFitter()
cph.fit(
clinical_cox[['time', 'event_binary', 'age', 'sex', 'stage', 'treatment', 'biomarker']],
duration_col='time',
event_col='event_binary'
)
cph.print_summary()
# Hazard ratios with confidence intervals
print("\nHazard Ratios:")
print(cph.hazard_ratios_)
Cumulative Incidence Function
For competing risks, Kaplan-Meier overestimates the cause-specific hazard.
def cumulative_incidence(times, events, cause):
"""Compute cumulative incidence for a specific cause."""
unique_times = np.sort(np.unique(times[events > 0]))
n_at_risk = np.zeros(len(unique_times))
cumulative = np.zeros(len(unique_times))
n_total = len(times)
for i, t in enumerate(unique_times):
at_risk = (times >= t).sum()
n_at_risk[i] = at_risk
events_at_t = ((times == t) & (events == cause)).sum()
censored_at_t = ((times == t) & (events == 0)).sum()
if i == 0:
cumulative[i] = events_at_t / n_total
else:
cumulative[i] = cumulative[i-1] + (1 - cumulative[i-1]) * events_at_t / at_risk
return unique_times, cumulative
# Primary cause (cancer death)
times_cause1, cuminc1 = cumulative_incidence(
clinical_df['time'].values,
clinical_df['event'].values,
cause=1
)
# Competing cause (other death)
times_cause2, cuminc2 = cumulative_incidence(
clinical_df['time'].values,
clinical_df['event'].values,
cause=2
)
print(f"5-year cumulative incidence (cancer): {cuminc1[np.searchsorted(times_cause1, 60)]:.3f}")
print(f"5-year cumulative incidence (other): {cuminc2[np.searchsorted(times_cause2, 60)]:.3f}")
Accelerated Failure Time Models
AFT models directly model survival time rather than the hazard.
# Weibull AFT model
aft = WeibullAFTFitter()
aft.fit(
clinical_cox[['time', 'event_binary', 'age', 'sex', 'stage', 'treatment', 'biomarker']],
duration_col='time',
event_col='event_binary'
)
aft.print_summary()
# Predict median survival for different patients
new_patients = pd.DataFrame({
'age': [55, 70, 80],
'sex': [0, 1, 1],
'stage': [2, 3, 4],
'treatment': [1, 0, 1],
'biomarker': [6, 4, 3]
})
median_survival = aft.predict_median(new_patients)
print("\nPredicted median survival (months):")
print(median_survival)
Time-Varying Covariates
Treatment changes over time in real clinical settings.
# Simulate time-varying treatment
tv_rows = []
for _, row in clinical_df.iterrows():
patient_id = row['patient_id']
# Baseline
tv_rows.append({
'patient_id': patient_id,
'start': 0,
'stop': 6,
'treatment': 0, # starts on control
'age': row['age'],
'stage': row['stage']
})
# Some patients switch treatment
if np.random.random() < 0.3:
tv_rows.append({
'patient_id': patient_id,
'start': 6,
'stop': row['time'],
'treatment': 1, # switches to treatment
'age': row['age'],
'stage': row['stage']
})
else:
tv_rows[-1]['stop'] = row['time']
tv_rows[-1]['event'] = (row['event'] == 1).astype(int)
tv_df = pd.DataFrame(tv_rows)
tv_df['event'] = tv_df.get('event', 0)
print(f"Time-varying dataset: {tv_df.shape}")
print(f"Patients with treatment switch: {tv_df['patient_id'].nunique() - len(clinical_df)}")
Model Validation: C-Index and Calibration
from lifelines.utils import concordance_index
# Concordance index
c_index = concordance_index(
clinical_cox['time'],
-cph.predict_partial_hazard(clinical_cox).values.flatten(),
clinical_cox['event_binary']
)
print(f"C-index: {c_index:.4f}")
# Calibration: compare predicted vs observed
clinical_cox['predicted_median'] = cph.predict_median(clinical_cox)
clinical_cox['risk_group'] = pd.qcut(
clinical_cox['predicted_median'], q=4, labels=['low', 'medium', 'high', 'very_high']
)
calibration = clinical_cox.groupby('risk_group').apply(
lambda x: pd.Series({
'n': len(x),
'predicted_survival': x['predicted_median'].mean(),
'observed_survival': x[x['event_binary'] == 0]['time'].mean()
})
)
print("\nCalibration by risk group:")
print(calibration)
Clinical Trial Design
# Sample size calculation for survival trials
def sample_size_survival(hazard_ratio, alpha=0.05, power=0.80, event_rate=0.5):
"""Calculate sample size for two-arm survival trial."""
from scipy.stats import norm
z_alpha = norm.ppf(1 - alpha/2)
z_beta = norm.ppf(power)
events_needed = ((z_alpha + z_beta) ** 2 * (1 + hazard_ratio) ** 2) / \
((1 - hazard_ratio) ** 2 * np.log(hazard_ratio) ** 2)
n_per_arm = events_needed / event_rate
return int(np.ceil(n_per_arm * 2))
# Calculate for different hazard ratios
for hr in [0.7, 0.75, 0.8, 0.85]:
n = sample_size_survival(hr, event_rate=0.6)
print(f"HR={hr}: {n} patients needed")
Best Practices
- Use cause-specific hazards β for competing risks in clinical data
- Report C-index and calibration β discrimination alone isn't enough
- Check proportional hazards β violations require stratified models
- Account for informative censoring β dropouts may differ from completers
- Use AFT models β when the time scale is of direct interest
- Validate externally β internal validation overfits; use independent cohorts
Summary
Healthcare survival analysis requires handling competing risks, time-varying treatments, and clinical validation. Master Cox regression, AFT models, cumulative incidence, and trial design to build models that clinicians trust and patients benefit from.