Data Science Capstone Masterclass
This is the culminating project that brings together everything you've learned. Follow this complete framework to execute an end-to-end data science project that stands out.
Project Lifecycle
Phase 1: Problem Framing
# Problem framing template: business context, objective, measurable success
# criteria and scope, agreed before any modelling work starts.
problem_statement = {
    "business_context": """
E-commerce company X loses $5M annually to customer churn.
Current retention efforts are untargeted, spending equally on all customers.
""",
    "objective": """
Build a churn prediction model to identify at-risk customers
30 days before churn, enabling targeted retention campaigns.
""",
    "success_metrics": {
        # The comparison sign was mis-encoded in the original text; it is ">=".
        "primary": "AUC-ROC ≥ 0.85 on held-out test set",
        "business": "15% reduction in churn rate within 6 months",
        "constraints": [
            "Prediction latency < 100ms",
            "Must be interpretable for marketing team",
            "Cannot use PII in model features",
        ],
    },
    "scope": {
        "in": [
            "Customer behavior features",
            "Product interaction data",
            "Support ticket history",
        ],
        "out": [
            "Competitor pricing",
            "Customer survey responses",
        ],
    },
}
# Stakeholder alignment checklist: items to confirm with stakeholders before
# moving past problem framing. The check-mark prefix was mis-encoded in the
# original text and is restored here.
stakeholder_checklist = [
    "✓ Business problem clearly defined",
    "✓ Success metrics agreed upon",
    "✓ Data availability confirmed",
    "✓ Timeline and milestones set",
    "✓ Risks and dependencies identified",
    "✓ Go/No-go criteria established",
]
Phase 2: Data Collection & EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
class DataExploration:
    """Exploratory data analysis helper around one CSV-backed DataFrame.

    Attributes:
        df: The DataFrame read from ``data_path``.
        report: The most recent result of :meth:`generate_eda_report`
            (empty until that method has been called).
    """

    def __init__(self, data_path):
        """Load the dataset at *data_path* with ``pandas.read_csv``."""
        self.df = pd.read_csv(data_path)
        self.report = {}

    def data_overview(self):
        """Print and return a structural summary of the dataset.

        Returns:
            dict with keys ``shape``, ``dtypes``, ``missing``,
            ``missing_pct``, ``duplicates`` and ``memory_mb``.
        """
        missing_counts = self.df.isnull().sum()
        overview = {
            "shape": self.df.shape,
            "dtypes": self.df.dtypes.to_dict(),
            "missing": missing_counts.to_dict(),
            "missing_pct": (missing_counts / len(self.df) * 100).to_dict(),
            "duplicates": self.df.duplicated().sum(),
            "memory_mb": self.df.memory_usage(deep=True).sum() / 1024**2
        }
        print(f"Dataset: {overview['shape'][0]:,} rows × {overview['shape'][1]} columns")
        print(f"Missing values: {sum(v > 0 for v in overview['missing'].values())} columns")
        print(f"Duplicates: {overview['duplicates']:,}")
        print(f"Memory: {overview['memory_mb']:.1f} MB")
        return overview

    def univariate_analysis(self, columns=None):
        """Plot a histogram and box plot and print summary stats per column.

        Args:
            columns: Columns to analyse; defaults to every numeric column.
        """
        if columns is None:
            columns = self.df.select_dtypes(include=[np.number]).columns
        for col in columns:
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            # Distribution
            self.df[col].hist(ax=axes[0], bins=30, edgecolor='black')
            axes[0].set_title(f'{col} Distribution')
            axes[0].set_xlabel(col)
            # Box plot
            self.df.boxplot(column=col, ax=axes[1])
            axes[1].set_title(f'{col} Box Plot')
            plt.tight_layout()
            plt.show()
            print(f"\n{col}:")
            print(f" Mean: {self.df[col].mean():.2f}")
            print(f" Std: {self.df[col].std():.2f}")
            print(f" Skew: {self.df[col].skew():.2f}")

    def bivariate_analysis(self, target_col):
        """Print and plot the numeric features most correlated with the target.

        Features are ranked by signed correlation (descending); the six
        highest are drawn as scatter plots against *target_col*.

        Args:
            target_col: Name of a numeric column to correlate against.
        """
        numeric_df = self.df.select_dtypes(include=[np.number])
        correlations = numeric_df.corr()[target_col].drop(target_col)
        correlations = correlations.sort_values(ascending=False)
        print(f"\nTop correlations with {target_col}:")
        print(correlations.head(10))
        # Plot top correlations
        top_features = correlations.head(6).index
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        for idx, feat in enumerate(top_features):
            ax = axes[idx // 3, idx % 3]
            self.df.plot.scatter(x=feat, y=target_col, ax=ax, alpha=0.3)
            ax.set_title(f'{feat} vs {target_col}')
        plt.tight_layout()
        plt.show()

    def generate_eda_report(self, target_col='churned'):
        """Assemble the overview, target balance, correlations and findings.

        Args:
            target_col: Target column name (defaults to ``'churned'``).

        Returns:
            dict with keys ``data_overview``, ``target_distribution``,
            ``feature_correlations``, ``data_quality_issues`` and
            ``insights``. The same dict is stored on ``self.report``.
        """
        report = {
            "data_overview": self.data_overview(),
            "target_distribution": self.df[target_col].value_counts().to_dict(),
            "feature_correlations": self._compute_correlations(),
            "data_quality_issues": self._identify_quality_issues(),
            "insights": self._extract_insights(target_col)
        }
        self.report = report
        return report

    def _compute_correlations(self):
        """Return the pairwise correlation matrix of numeric columns as a dict."""
        numeric_df = self.df.select_dtypes(include=[np.number])
        return numeric_df.corr().to_dict()

    def _identify_quality_issues(self):
        """List columns with more than 100 unique values or |skew| above 2."""
        issues = []
        # High cardinality
        for col in self.df.columns:
            unique_count = self.df[col].nunique()
            if unique_count > 100:
                issues.append(f"High cardinality: {col} ({unique_count} unique)")
        # Skewed features
        for col in self.df.select_dtypes(include=[np.number]).columns:
            skewness = self.df[col].skew()
            if abs(skewness) > 2:
                issues.append(f"Highly skewed: {col} (skew={skewness:.2f})")
        return issues

    def _extract_insights(self, target_col='churned'):
        """Summarise features whose correlation with the target exceeds ±0.3.

        Only numeric columns are correlated: calling ``DataFrame.corr`` on a
        frame that still contains text columns raises in pandas >= 2.0.
        """
        insights = []
        # Correlation insights
        numeric_df = self.df.select_dtypes(include=[np.number])
        corr_with_target = numeric_df.corr()[target_col].drop(target_col)
        strong_pos = corr_with_target[corr_with_target > 0.3]
        strong_neg = corr_with_target[corr_with_target < -0.3]
        if len(strong_pos) > 0:
            insights.append(f"Strong positive correlations: {list(strong_pos.index)}")
        if len(strong_neg) > 0:
            insights.append(f"Strong negative correlations: {list(strong_neg.index)}")
        return insights
# Usage: walk the customer CSV through the overview, per-feature plots,
# target relationships and finally the consolidated report.
profiled_columns = ['tenure_months', 'monthly_charges', 'total_charges']
eda = DataExploration("customer_data.csv")
overview = eda.data_overview()
eda.univariate_analysis(profiled_columns)
eda.bivariate_analysis('churned')
report = eda.generate_eda_report()
Phase 3: Feature Engineering & Modeling
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb
class FeatureEngineer:
    """Derive churn-model features from raw per-customer columns.

    Attributes:
        feature_names: Names of the engineered columns produced by the most
            recent :meth:`create_features` call (empty before the first call).
    """

    # Columns added by create_features, in creation order.
    ENGINEERED_FEATURES = (
        'tenure_group',
        'usage_trend_30d',
        'support_ticket_rate',
        'payment_delay_avg',
        'has_payment_issue',
        'ltv_to_date',
        'monthly_trend',
    )

    def __init__(self):
        self.feature_names = []

    def create_features(self, df):
        """Create predictive features from raw data.

        Args:
            df: DataFrame containing ``tenure_months``, ``usage_last_7d``,
                ``usage_last_30d``, ``support_tickets_90d``,
                ``avg_days_past_due``, ``total_revenue``,
                ``revenue_last_3m`` and ``revenue_first_3m``.

        Returns:
            A copy of *df* with the engineered columns appended; the input
            frame is left untouched.

        Raises:
            KeyError: If a required raw column is missing.
        """
        features = df.copy()
        # Tenure features. include_lowest keeps brand-new customers
        # (tenure_months == 0) in the first bucket instead of turning them
        # into NaN, which the default right-closed (0, 12] interval would do.
        features['tenure_group'] = pd.cut(
            features['tenure_months'],
            bins=[0, 12, 24, 48, np.inf],
            labels=['0-1y', '1-2y', '2-4y', '4y+'],
            include_lowest=True,
        )
        # Engagement features: last week's usage relative to a quarter of the
        # 30-day usage; the epsilon guards against division by zero.
        features['usage_trend_30d'] = (
            features['usage_last_7d'] / (features['usage_last_30d'] / 4 + 1e-8)
        )
        # +1 keeps the rate finite for customers with zero tenure.
        features['support_ticket_rate'] = (
            features['support_tickets_90d'] / (features['tenure_months'] + 1)
        )
        # Payment features
        features['payment_delay_avg'] = features['avg_days_past_due']
        features['has_payment_issue'] = (features['payment_delay_avg'] > 7).astype(int)
        # Value features: revenue per month of tenure, and the change in
        # average monthly revenue between the first and the last three months.
        features['ltv_to_date'] = features['total_revenue'] / (features['tenure_months'] + 1)
        features['monthly_trend'] = (
            features['revenue_last_3m'] / 3 - features['revenue_first_3m'] / 3
        )
        self.feature_names = list(self.ENGINEERED_FEATURES)
        return features

    def transform(self, df):
        """Transformer-style alias for :meth:`create_features`.

        The serving code calls ``feature_engineer.transform(df)``; this keeps
        that call valid for instances of this class.
        """
        return self.create_features(df)

    def get_feature_names(self):
        """Return the engineered column names from the last feature build."""
        return self.feature_names
class ModelPipeline:
    """Preprocessing + LightGBM classifier wrapped in one sklearn Pipeline.

    Attributes:
        pipeline: The sklearn ``Pipeline`` (``None`` until
            :meth:`build_pipeline` is called).
        model: The fitted classifier step (``None`` until :meth:`train`).
    """

    def __init__(self):
        self.pipeline = None
        self.model = None

    def build_pipeline(self, numeric_features, categorical_features):
        """Build preprocessing + model pipeline.

        Args:
            numeric_features: Column names to standard-scale.
            categorical_features: Column names to one-hot encode.

        Returns:
            The unfitted sklearn ``Pipeline`` (also stored on ``self.pipeline``).
        """
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ]
        )
        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', lgb.LGBMClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                # LightGBM only applies row subsampling when the bagging
                # frequency is non-zero; without this, subsample=0.8 is inert.
                subsample_freq=1,
                colsample_bytree=0.8,
                random_state=42,
                is_unbalance=True
            ))
        ])
        return self.pipeline

    def train(self, X_train, y_train, X_val=None, y_val=None):
        """Fit the pipeline and report 5-fold cross-validated ROC AUC.

        No early stopping is performed. When a validation set is supplied it
        is scored with the fitted pipeline and reported as ``val_auc``.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_val: Optional validation features.
            y_val: Optional validation labels.

        Returns:
            dict with ``cv_mean`` and ``cv_std``, plus ``val_auc`` when both
            *X_val* and *y_val* are given.
        """
        self.pipeline.fit(X_train, y_train)
        self.model = self.pipeline.named_steps['classifier']
        # Cross-validation (cross_val_score clones the pipeline, so the fit
        # above is not disturbed).
        cv_scores = cross_val_score(
            self.pipeline, X_train, y_train,
            cv=5, scoring='roc_auc', n_jobs=-1
        )
        results = {
            "cv_mean": cv_scores.mean(),
            "cv_std": cv_scores.std()
        }
        if X_val is not None and y_val is not None:
            from sklearn.metrics import roc_auc_score
            results["val_auc"] = roc_auc_score(y_val, self.predict_proba(X_val))
        return results

    def predict(self, X):
        """Return hard class predictions for *X*."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Return the second ``predict_proba`` column (index 1) for *X*."""
        return self.pipeline.predict_proba(X)[:, 1]
# Training pipeline
def run_training_pipeline(data_path):
    """Train and evaluate the churn model end to end.

    Reads the CSV, engineers features, holds out a stratified 20% test set,
    trains a ``ModelPipeline`` on the rest and prints CV/test metrics.

    Args:
        data_path: Path to a CSV containing ``customer_id``, ``churned`` and
            the raw columns ``FeatureEngineer.create_features`` reads.

    Returns:
        Tuple ``(pipeline, results)``: the trained ``ModelPipeline`` and the
        dict returned by ``ModelPipeline.train``.
    """
    # These metrics are not among the file-level imports, so pull them in here.
    from sklearn.metrics import roc_auc_score, classification_report

    # Load and prepare data
    df = pd.read_csv(data_path)
    engineer = FeatureEngineer()
    features = engineer.create_features(df)
    # Split features and target
    X = features.drop(['customer_id', 'churned'], axis=1)
    y = features['churned']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # Build and train model
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    pipeline = ModelPipeline()
    pipeline.build_pipeline(numeric_features, categorical_features)
    results = pipeline.train(X_train, y_train)
    # Evaluate on the held-out split only
    y_pred_proba = pipeline.predict_proba(X_test)
    y_pred = pipeline.predict(X_test)
    test_auc = roc_auc_score(y_test, y_pred_proba)
    # The plus/minus sign was mis-encoded in the original output string.
    print(f"CV AUC: {results['cv_mean']:.4f} ± {results['cv_std']:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return pipeline, results
# Run the training flow on the sample CSV at module level and keep the
# returned ModelPipeline and cross-validation summary for later use.
pipeline, results = run_training_pipeline("customer_data.csv")
Phase 4: Evaluation & Iteration
from sklearn.metrics import (
roc_auc_score, precision_recall_curve, average_precision_score,
confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import numpy as np
class ModelEvaluator:
    """Evaluate binary-classifier scores at a fixed decision threshold.

    Attributes:
        y_true: Ground-truth labels as supplied by the caller.
        y_pred_proba: Predicted positive-class scores.
        y_pred: Hard 0/1 predictions, ``y_pred_proba >= threshold``.
        threshold: The decision threshold used for ``y_pred``.
    """

    def __init__(self, y_true, y_pred_proba, threshold=0.5):
        """Store the inputs and derive hard predictions.

        Args:
            y_true: Ground-truth labels.
            y_pred_proba: Positive-class scores. Plain lists/tuples are
                converted to a NumPy array; arrays and Series are kept as-is.
            threshold: Scores at or above this value are predicted positive.
        """
        # A bare Python list does not support the vectorised comparison below.
        if isinstance(y_pred_proba, (list, tuple)):
            y_pred_proba = np.asarray(y_pred_proba, dtype=float)
        self.y_true = y_true
        self.y_pred_proba = y_pred_proba
        self.y_pred = (y_pred_proba >= threshold).astype(int)
        self.threshold = threshold

    def full_evaluation(self):
        """Compute ranking, classification and business metrics.

        Returns:
            dict with ``auc_roc``, ``avg_precision``,
            ``classification_report`` (as a dict), ``confusion_matrix``
            (nested lists) and ``business``.
        """
        metrics = {
            "auc_roc": roc_auc_score(self.y_true, self.y_pred_proba),
            "avg_precision": average_precision_score(self.y_true, self.y_pred_proba),
            "classification_report": classification_report(
                self.y_true, self.y_pred, output_dict=True
            ),
            "confusion_matrix": confusion_matrix(self.y_true, self.y_pred).tolist()
        }
        # Business metrics
        metrics["business"] = self._compute_business_metrics()
        return metrics

    def _compute_business_metrics(self, cost_per_retention_offer=50,
                                  value_of_retained_customer=500):
        """Translate the confusion matrix into an estimated dollar impact.

        The estimate counts every true positive as a retained customer.

        Args:
            cost_per_retention_offer: Dollars spent per customer targeted.
            value_of_retained_customer: Dollars gained per retained customer.

        Returns:
            dict with ``retained_customers``, ``wasted_offers``,
            ``missed_churners``, ``expected_savings``, ``expected_costs`` and
            ``net_impact``.
        """
        # Pin the label set so the matrix is always 2x2 and the four-way
        # unpack works even when only one class is present.
        cm = confusion_matrix(self.y_true, self.y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        retained_customers = tp  # True positives we can target
        wasted_offers = fp  # False positives (wasted offers)
        missed_churners = fn  # False negatives (missed opportunities)
        expected_savings = retained_customers * value_of_retained_customer
        expected_costs = (retained_customers + wasted_offers) * cost_per_retention_offer
        net_impact = expected_savings - expected_costs
        return {
            "retained_customers": int(retained_customers),
            "wasted_offers": int(wasted_offers),
            "missed_churners": int(missed_churners),
            "expected_savings": float(expected_savings),
            "expected_costs": float(expected_costs),
            "net_impact": float(net_impact)
        }

    def plot_evaluation(self, output_path='evaluation_report.png'):
        """Draw ROC, precision-recall, confusion matrix and F1-vs-threshold.

        Args:
            output_path: Where the 2x2 figure is saved before being shown.
        """
        # Imported here because this section's file-level imports do not
        # include seaborn or these two metrics.
        import seaborn as sns
        from sklearn.metrics import f1_score, roc_curve

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        # ROC Curve
        fpr, tpr, _ = roc_curve(self.y_true, self.y_pred_proba)
        axes[0, 0].plot(fpr, tpr, label=f'AUC = {roc_auc_score(self.y_true, self.y_pred_proba):.3f}')
        axes[0, 0].plot([0, 1], [0, 1], 'k--')
        axes[0, 0].set_xlabel('False Positive Rate')
        axes[0, 0].set_ylabel('True Positive Rate')
        axes[0, 0].set_title('ROC Curve')
        axes[0, 0].legend()
        # Precision-Recall Curve
        precision, recall, _ = precision_recall_curve(self.y_true, self.y_pred_proba)
        axes[0, 1].plot(recall, precision)
        axes[0, 1].set_xlabel('Recall')
        axes[0, 1].set_ylabel('Precision')
        axes[0, 1].set_title('Precision-Recall Curve')
        # Confusion Matrix
        cm = confusion_matrix(self.y_true, self.y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1, 0])
        axes[1, 0].set_xlabel('Predicted')
        axes[1, 0].set_ylabel('Actual')
        axes[1, 0].set_title('Confusion Matrix')
        # Threshold Analysis
        thresholds = np.arange(0.1, 0.9, 0.05)
        f1_scores = [
            f1_score(self.y_true, (self.y_pred_proba >= t).astype(int))
            for t in thresholds
        ]
        axes[1, 1].plot(thresholds, f1_scores)
        axes[1, 1].axvline(x=self.threshold, color='r', linestyle='--', label=f'Current: {self.threshold}')
        axes[1, 1].set_xlabel('Threshold')
        axes[1, 1].set_ylabel('F1 Score')
        axes[1, 1].set_title('F1 Score vs Threshold')
        axes[1, 1].legend()
        plt.tight_layout()
        plt.savefig(output_path, dpi=150)
        plt.show()
# Run evaluation
# run_training_pipeline() keeps its held-out split in local variables, so
# y_test / y_pred_proba do not exist at module scope. Rebuild the same split
# (same seed, test size and stratification) and score it with the trained
# pipeline, so the evaluation uses the rows the model was not fitted on.
eval_features = FeatureEngineer().create_features(pd.read_csv("customer_data.csv"))
X_eval = eval_features.drop(['customer_id', 'churned'], axis=1)
y_eval = eval_features['churned']
_, X_test, _, y_test = train_test_split(
    X_eval, y_eval, test_size=0.2, random_state=42, stratify=y_eval
)
y_pred_proba = pipeline.predict_proba(X_test)
evaluator = ModelEvaluator(y_test, y_pred_proba)
metrics = evaluator.full_evaluation()
evaluator.plot_evaluation()
print(f"AUC-ROC: {metrics['auc_roc']:.4f}")
print(f"Expected Net Impact: ${metrics['business']['net_impact']:,.0f}")
Phase 5: Deployment
# Dockerfile
# Assembled line by line; the empty first and last entries reproduce the
# leading and trailing newline of the original block string.
dockerfile = "\n".join([
    "",
    "FROM python:3.10-slim",
    "WORKDIR /app",
    "COPY requirements.txt .",
    "RUN pip install --no-cache-dir -r requirements.txt",
    "COPY model/ ./model/",
    "COPY app.py .",
    "COPY config.yaml .",
    "EXPOSE 8000",
    'CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]',
    "",
])
# FastAPI deployment
from fastapi import FastAPI
from pydantic import BaseModel
import pandas as pd
import joblib
# Application object and model artifacts, created once at import time so
# every request reuses the same in-memory objects.
app = FastAPI(title="Churn Prediction API")
# NOTE(review): joblib.load unpickles arbitrary Python objects; these files
# must come from a trusted build step. Confirm artifact provenance.
model = joblib.load("model/churn_model.pkl")
feature_engineer = joblib.load("model/feature_engineer.pkl")
# Request body for the /predict endpoint: one customer's raw inputs, each
# declared with the type shown on its field.
# NOTE(review): FeatureEngineer.create_features in this file reads
# avg_days_past_due, revenue_last_3m and revenue_first_3m, none of which are
# declared here. Confirm the deployed feature engineer matches this schema.
class CustomerData(BaseModel):
tenure_months: float
monthly_charges: float
total_charges: float
usage_last_7d: float
usage_last_30d: float
support_tickets_90d: int
payment_delay_avg: float
total_revenue: float
@app.post("/predict")
def predict_churn(customer: CustomerData):
    """Score one customer and map the churn probability to a risk level.

    Returns a dict with ``churn_probability`` (float), ``risk_level``
    (``"high"`` above 0.7, ``"medium"`` above 0.3, otherwise ``"low"``) and
    ``recommended_action``.
    """
    # Convert to DataFrame. Pydantic v2 renamed .dict() to .model_dump() and
    # deprecated the old name; fall back to .dict() so v1 keeps working.
    if hasattr(customer, "model_dump"):
        payload = customer.model_dump()
    else:
        payload = customer.dict()
    df = pd.DataFrame([payload])
    # Engineer features
    features = feature_engineer.transform(df)
    # Predict
    # NOTE(review): the [0, 1] index assumes predict_proba returns a 2-D
    # (rows, classes) array. Confirm what object the model artifact holds.
    probability = model.predict_proba(features)[0, 1]
    # Interpret
    if probability > 0.7:
        risk_level = "high"
    elif probability > 0.3:
        risk_level = "medium"
    else:
        risk_level = "low"
    return {
        "churn_probability": float(probability),
        "risk_level": risk_level,
        "recommended_action": get_recommendation(risk_level)
    }
def get_recommendation(risk_level):
    """Return the retention action for *risk_level*, or "Monitor" if unknown."""
    actions_by_risk = {
        "high": "Immediate outreach with retention offer",
        "medium": "Engagement campaign with personalized content",
        "low": "Standard nurture sequence",
    }
    try:
        return actions_by_risk[risk_level]
    except KeyError:
        # Any level outside the three known ones falls back to monitoring.
        return "Monitor"
@app.get("/health")
def health():
    # Health endpoint: always answers with the same fixed payload.
    status_payload = dict(status="healthy")
    return status_payload
Phase 6: Presentation
# Presentation structure
def _content_slide(title, *bullets):
    """Build a slide entry holding a title and its ordered bullet points."""
    return {"title": title, "content": list(bullets)}


# Seven-slide outline; only the title slide carries subtitle/author/date.
presentation_outline = {
    "slide_1_title": {
        "title": "Customer Churn Prediction System",
        "subtitle": "Identifying at-risk customers for targeted retention",
        "author": "Your Name",
        "date": "Date",
    },
    "slide_2_problem": _content_slide(
        "Business Problem",
        "$5M annual revenue loss from churn",
        "Untargeted retention spending",
        "Need for predictive, actionable system",
    ),
    "slide_3_approach": _content_slide(
        "Our Approach",
        "End-to-end ML pipeline",
        "Feature engineering from behavioral data",
        "LightGBM model with 0.89 AUC",
        "Real-time scoring API",
    ),
    "slide_4_results": _content_slide(
        "Results",
        "AUC-ROC: 0.89 (vs 0.82 baseline)",
        "Precision @ 10%: 72%",
        "Estimated impact: $2.3M annual savings",
        "Latency: <50ms per prediction",
    ),
    "slide_5_demo": _content_slide(
        "Live Demo",
        "Walk through the prediction API",
        "Show the dashboard",
        "Demonstrate interpretability",
    ),
    "slide_6_business_impact": _content_slide(
        "Business Impact",
        "Targeted campaigns for top 10% risk",
        "30% reduction in churn for targeted segment",
        "ROI: 4.6x within 6 months",
        "Scalable to other retention use cases",
    ),
    "slide_7_next_steps": _content_slide(
        "Next Steps",
        "A/B test retention campaigns",
        "Add real-time features from streaming data",
        "Build recommendation engine for offers",
        "Expand to other customer segments",
    ),
}
# Key metrics to highlight, grouped by the audience each set speaks to.
key_metrics = dict(
    model_performance=dict(
        auc_roc=0.89,
        precision_at_10pct=0.72,
        recall_at_10pct=0.45,
    ),
    business_impact=dict(
        annual_savings=2300000,
        retention_rate_improvement=0.15,
        roi=4.6,
    ),
    technical_metrics=dict(
        inference_latency_ms=45,
        training_time_hours=2.5,
        model_size_mb=15,
    ),
)
Final Checklist
# Final sign-off checklist, grouped by project phase. The check-mark prefix
# was mis-encoded in the original text and is restored here.
capstone_checklist = {
    "problem_framing": [
        "✓ Business problem clearly defined",
        "✓ Success metrics agreed with stakeholders",
        "✓ Scope and constraints documented",
        "✓ Timeline and milestones set",
    ],
    "data": [
        "✓ Data sources identified and accessed",
        "✓ EDA completed with insights documented",
        "✓ Data quality issues addressed",
        "✓ Feature engineering rationale explained",
    ],
    "modeling": [
        "✓ Multiple approaches compared",
        "✓ Cross-validation performed",
        "✓ Final model selected with justification",
        "✓ Interpretability analysis completed",
    ],
    "evaluation": [
        "✓ Test set performance reported",
        "✓ Business metrics computed",
        "✓ Edge cases analyzed",
        "✓ Failure modes documented",
    ],
    "deployment": [
        "✓ API endpoints working",
        "✓ Docker containerization complete",
        "✓ Health checks implemented",
        "✓ Monitoring configured",
    ],
    "documentation": [
        "✓ README with clear instructions",
        "✓ Code well-documented",
        "✓ Architecture diagram included",
        "✓ Results reproducible",
    ],
    "presentation": [
        "✓ Story arc compelling",
        "✓ Slides clean and focused",
        "✓ Demo working",
        "✓ Q&A preparation complete",
    ],
}
Key Takeaways
- Spend 40% of time on problem framing — a well-defined problem is half-solved
- Document everything — your future self and team will thank you
- Start simple — baseline first, then iterate
- Quantify business impact — this is what stakeholders care about
- Practice your presentation β rehearse until it flows naturally