The Capstone Project
This is your culminating project — a complete, end-to-end data science solution that demonstrates everything you've learned. It's your portfolio centerpiece and interview talking point.
Architecture Diagram
┌──────────────────────────────────────────────────────────────────┐
│ Capstone Project Flow │
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ PICK │ │ BUILD │ │ DEPLOY │ │ WRITE │ │
│ │ Problem │─>│ Solution │─>│ & Demo │─>│ Up │ │
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
│ │ │ │ │ │
│ Business Technical Production Communication │
│ Acumen Depth Readiness Skills │
│ │
│ Timeline: 4-6 weeks recommended │
└──────────────────────────────────────────────────────────────────┘
Project Selection
Choosing the Right Problem
Definition: Capstone Project Selection Criteria
Ideal capstone projects should be high impact and feasible, with clear business metrics and available data.
# Decision matrix for picking a capstone topic: one quadrant to aim for and
# one to steer clear of, each with concrete examples.
project_selection_matrix = {
    # Target quadrant: example topics plus the criteria a topic should meet.
    "High Impact + Feasible": {
        "description": "Ideal capstone projects",
        "examples": [
            "Customer churn prediction for a SaaS company",
            "Demand forecasting for retail inventory",
            "Content recommendation system",
            "Fraud detection for transactions",
            "Customer segmentation for marketing",
        ],
        "criteria": [
            "Clear business metric to optimize",
            "Available data (public or synthetic)",
            "Can show measurable improvement over baseline",
            "Deployable within 2-4 weeks",
        ],
    },
    # Anti-patterns: this entry has examples only, no criteria list.
    "Avoid": {
        "description": "Projects that don't showcase well",
        "examples": [
            "Kaggle competition with no business context",
            "Reproducing existing papers without extension",
            "Projects using only toy datasets",
            "Anything without a clear 'so what?'",
        ],
    },
}
Problem-Solution Template
def define_capstone_problem() -> dict:
    """Return a fill-in-the-blanks framework for scoping a capstone project.

    Returns:
        A dict with three sections, in this order:

        * ``problem_statement`` - a sentence ``template`` with bracketed
          placeholders and a worked ``example``.
        * ``success_criteria`` - ``technical``, ``business`` and
          ``deployment`` targets.
        * ``data_sources`` - ``primary`` and ``secondary`` sources plus a
          target ``volume``.
    """
    statement_template = (
        "In [industry], [stakeholders] struggle with "
        "[problem], resulting in [negative outcome]. "
        "We can solve this by [approach]."
    )
    statement_example = (
        "In e-commerce, marketing teams struggle with "
        "identifying at-risk customers before they churn, "
        "resulting in $2M annual revenue loss. We can solve "
        "this by building a predictive model that flags "
        "customers 30 days before churn."
    )

    framework = {}
    framework["problem_statement"] = {
        "template": statement_template,
        "example": statement_example,
    }
    framework["success_criteria"] = {
        "technical": "Model AUC > 0.85, inference latency < 100ms",
        "business": "Identify 80% of churners with <20% false positive rate",
        "deployment": "API endpoint serving predictions in production",
    }
    framework["data_sources"] = {
        "primary": "Kaggle dataset / company data / public API",
        "secondary": "Enrichment data for features",
        "volume": "Target 50K-500K rows for meaningful analysis",
    }
    return framework
End-to-End Workflow
Week 1-2: Data + EDA + Baseline
# capstone/data_pipeline.py
"""
Capstone Project: Customer Churn Prediction
Pipeline: Data → EDA → Baseline Model
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
classification_report, roc_auc_score,
confusion_matrix, roc_curve
)
class DataPipeline:
    """End-to-end data preparation pipeline for the churn capstone.

    Intended call order: ``load_and_validate`` -> ``perform_eda`` ->
    ``prepare_features`` -> ``train_baseline``. The loaded frame must contain
    a numeric ``churned`` target column; every numeric column other than
    ``churned`` and ``customer_id`` is used as a model feature.
    """

    def __init__(self, data_path: str):
        """Remember the CSV location; nothing is read until ``load_and_validate``."""
        self.data_path = data_path
        self.df = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def load_and_validate(self) -> pd.DataFrame:
        """Load the CSV, print a quality summary and run sanity assertions.

        Returns:
            The loaded DataFrame (also stored on ``self.df``).

        Raises:
            AssertionError: if there are fewer than 10,000 rows or the churn
                rate is not strictly between 2% and 80%.
        """
        self.df = pd.read_csv(self.data_path)

        null_counts = self.df.isnull().sum()
        # A Series rejects a float format spec inside an f-string (TypeError),
        # so each class share is formatted individually before printing.
        target_share = self.df['churned'].value_counts(normalize=True)
        target_share_pct = target_share.map('{:.2%}'.format)

        print(f"Dataset shape: {self.df.shape}")
        print(f"\nColumn types:\n{self.df.dtypes.value_counts()}")
        print(f"\nMissing values:\n{null_counts[null_counts > 0]}")
        print(f"\nTarget distribution:\n{target_share_pct}")

        # Quality assertions (kept as asserts so callers see the same
        # exception type; note they are skipped under ``python -O``).
        churn_rate = self.df['churned'].mean()
        assert len(self.df) >= 10000, "Dataset too small for reliable modeling"
        assert churn_rate > 0.02, "Target class too rare"
        assert churn_rate < 0.8, "Target class too common"
        return self.df

    def perform_eda(self) -> dict:
        """Plot numeric distributions and summarise correlations with the target.

        Writes ``reports/eda_distributions.png`` (histograms of the first nine
        numeric columns), creating the ``reports`` directory if needed.

        Returns:
            A dict with ``target_correlations`` (correlation of every numeric
            column with ``churned``, sorted descending) and ``findings`` (three
            human-readable summary strings).
        """
        # Local import: the module header does not import pathlib.
        from pathlib import Path

        eda_results = {}

        # Numerical distributions
        num_cols = self.df.select_dtypes(include=[np.number]).columns
        fig, axes = plt.subplots(3, 3, figsize=(15, 12))
        for ax, col in zip(axes.flat, num_cols[:9]):
            self.df[col].hist(ax=ax, bins=30, edgecolor='black')
            ax.set_title(f'{col} Distribution')
        plt.tight_layout()
        report_path = Path('reports/eda_distributions.png')
        # savefig does not create missing directories.
        report_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(report_path, dpi=150)
        plt.close(fig)

        # Correlation with target
        correlations = self.df[num_cols].corr()['churned'].sort_values(ascending=False)
        eda_results['target_correlations'] = correlations

        # Rank features by absolute correlation and exclude the target itself,
        # so a strongly negative predictor is not overlooked and the result
        # does not depend on the target happening to sort first.
        feature_corr = correlations.drop('churned').dropna()
        if feature_corr.empty:
            top_finding = "Most correlated feature: n/a"
        else:
            top_feature = feature_corr.abs().idxmax()
            top_finding = (
                f"Most correlated feature: {top_feature} "
                f"({feature_corr[top_feature]:.3f})"
            )

        # Key findings
        eda_results['findings'] = [
            f"Churn rate: {self.df['churned'].mean():.1%}",
            top_finding,
            f"Missing data in {self.df.isnull().any().sum()} columns"
        ]
        return eda_results

    def prepare_features(self) -> tuple:
        """Engineer features, split into train/test and impute missing values.

        Missing feature values are filled with medians computed on the
        training split only, so no statistic from the test rows leaks into
        training. Rows without a ``churned`` label are dropped rather than
        imputed.

        Returns:
            ``(X_train, X_test, y_train, y_test)``; the same objects are
            stored on the instance.
        """
        # A label cannot be meaningfully imputed, so unlabeled rows are removed.
        df = self.df.dropna(subset=['churned']).copy()

        # Feature engineering (the +1 in each denominator avoids division by zero)
        if 'total_purchases' in df.columns and 'days_since_signup' in df.columns:
            df['purchase_frequency'] = df['total_purchases'] / (df['days_since_signup'] + 1)
            df['avg_days_between_purchases'] = df['days_since_signup'] / (df['total_purchases'] + 1)
        if 'total_spent' in df.columns and 'total_purchases' in df.columns:
            df['avg_order_value'] = df['total_spent'] / (df['total_purchases'] + 1)

        # Separate features and target
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        feature_cols = [c for c in numeric_cols if c not in ['churned', 'customer_id']]
        X = df[feature_cols]
        y = df['churned']

        # Split first, then impute from the training split only.
        X_train, X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        train_medians = X_train.median()
        self.X_train = X_train.fillna(train_medians)
        self.X_test = X_test.fillna(train_medians)

        print(f"\nTrain set: {self.X_train.shape[0]:,} samples")
        print(f"Test set: {self.X_test.shape[0]:,} samples")
        return self.X_train, self.X_test, self.y_train, self.y_test

    def train_baseline(self) -> dict:
        """Train and evaluate a baseline logistic regression model.

        Requires ``prepare_features`` to have populated the train/test splits.

        Returns:
            A dict with the fitted ``model``, the text ``classification_report``,
            the test ``roc_auc`` and ``feature_importance`` (model coefficients
            indexed by feature name, sorted descending).
        """
        baseline = LogisticRegression(random_state=42, max_iter=1000)
        baseline.fit(self.X_train, self.y_train)

        y_pred = baseline.predict(self.X_test)
        # Column 1 holds the probability of the positive (churned) class.
        y_proba = baseline.predict_proba(self.X_test)[:, 1]

        results = {
            'model': baseline,
            'classification_report': classification_report(self.y_test, y_pred),
            'roc_auc': roc_auc_score(self.y_test, y_proba),
            'feature_importance': pd.Series(
                baseline.coef_[0], index=self.X_train.columns
            ).sort_values(ascending=False)
        }
        print(f"\nBaseline ROC-AUC: {results['roc_auc']:.4f}")
        print(f"\n{results['classification_report']}")
        return results
if __name__ == "__main__":
    # Week 1-2 driver: load -> EDA -> feature prep -> baseline model.
    churn_pipeline = DataPipeline("data/customers.csv")
    churn_pipeline.load_and_validate()
    eda_summary = churn_pipeline.perform_eda()
    churn_pipeline.prepare_features()
    baseline_report = churn_pipeline.train_baseline()

    # Closing summary of the baseline run.
    baseline_auc = baseline_report['roc_auc']
    for summary_line in (
        "\n=== Baseline Model Complete ===",
        f"ROC-AUC: {baseline_auc:.4f}",
        "Ready for advanced modeling in Week 2",
    ):
        print(summary_line)
Week 2-3: Advanced Modeling
# capstone/modeling.py
"""Advanced modeling with hyperparameter tuning and ensemble methods."""
import numpy as np
import pandas as pd
from sklearn.ensemble import (
RandomForestClassifier, GradientBoostingClassifier,
VotingClassifier
)
from sklearn.model_selection import (
RandomizedSearchCV, cross_val_score, StratifiedKFold
)
from sklearn.metrics import (
roc_auc_score, precision_recall_curve,
average_precision_score, f1_score
)
from scipy.stats import randint, uniform
import xgboost as xgb
import lightgbm as lgb
import joblib
from pathlib import Path
class AdvancedModeler:
    """Advanced modeling pipeline with tuning and ensembling.

    Each ``train_*`` method runs a randomized hyperparameter search and stores
    its outcome in ``self.results`` under the model's name. ``build_ensemble``
    combines the stored models and stores its own outcome under ``'ensemble'``.

    NOTE(review): every metric here, including the threshold chosen by
    ``find_optimal_threshold``, is computed on the test split, so reported
    numbers are optimistic; a separate validation split would be needed for
    an unbiased estimate.
    """

    # Soft-voting weights keyed by model name; models not listed get 1.0.
    DEFAULT_ENSEMBLE_WEIGHTS = {'xgboost': 1.2}

    def __init__(self, X_train, X_test, y_train, y_test):
        """Store the train/test splits and start with an empty results registry."""
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.results = {}

    def _tune_and_score(self, name, label, estimator, param_distributions) -> dict:
        """Run the shared randomized search, score on the test split and record it.

        Args:
            name: key under which the outcome is stored in ``self.results``.
            label: display name used in the printed summary line.
            estimator: unfitted classifier to tune.
            param_distributions: search space passed to ``RandomizedSearchCV``.
        """
        search = RandomizedSearchCV(
            estimator, param_distributions,
            n_iter=50, cv=StratifiedKFold(5, shuffle=True, random_state=42),
            scoring='roc_auc', random_state=42, n_jobs=-1, verbose=0
        )
        search.fit(self.X_train, self.y_train)

        best_model = search.best_estimator_
        # Column 1 holds the probability of the positive class.
        y_proba = best_model.predict_proba(self.X_test)[:, 1]
        results = {
            'model': best_model,
            'best_params': search.best_params_,
            'cv_score': search.best_score_,
            'test_auc': roc_auc_score(self.y_test, y_proba),
            'test_ap': average_precision_score(self.y_test, y_proba)
        }
        print(f"{label} - CV AUC: {results['cv_score']:.4f}, "
              f"Test AUC: {results['test_auc']:.4f}")
        self.results[name] = results
        return results

    def train_xgboost(self) -> dict:
        """Train XGBoost with randomized hyperparameter search.

        Returns:
            Dict with ``model``, ``best_params``, ``cv_score``, ``test_auc``
            and ``test_ap``; also stored as ``self.results['xgboost']``.
        """
        param_distributions = {
            'n_estimators': randint(100, 500),
            'max_depth': randint(3, 10),
            # scipy's uniform(loc, scale) samples from [loc, loc + scale].
            'learning_rate': uniform(0.01, 0.3),
            'subsample': uniform(0.6, 0.4),
            'colsample_bytree': uniform(0.6, 0.4),
            'min_child_weight': randint(1, 10),
            'reg_alpha': uniform(0, 1),
            'reg_lambda': uniform(0, 1)
        }
        xgb_model = xgb.XGBClassifier(
            objective='binary:logistic',
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )
        return self._tune_and_score('xgboost', 'XGBoost', xgb_model, param_distributions)

    def train_lightgbm(self) -> dict:
        """Train LightGBM with randomized hyperparameter search.

        No early stopping is performed; ``n_estimators`` is tuned by the
        search instead.

        Returns:
            Dict with ``model``, ``best_params``, ``cv_score``, ``test_auc``
            and ``test_ap``; also stored as ``self.results['lightgbm']``.
        """
        param_distributions = {
            'n_estimators': randint(100, 500),
            'max_depth': randint(3, 12),
            'learning_rate': uniform(0.01, 0.3),
            'num_leaves': randint(20, 100),
            'min_child_samples': randint(5, 50),
            'feature_fraction': uniform(0.6, 0.4),
            'bagging_fraction': uniform(0.6, 0.4),
            'reg_alpha': uniform(0, 1),
            'reg_lambda': uniform(0, 1)
        }
        lgb_model = lgb.LGBMClassifier(
            objective='binary',
            random_state=42,
            # LightGBM only applies bagging_fraction when the bagging
            # frequency is non-zero; without this the searched
            # 'bagging_fraction' values would have no effect.
            subsample_freq=1,
            verbose=-1
        )
        return self._tune_and_score('lightgbm', 'LightGBM', lgb_model, param_distributions)

    def _ensemble_members(self, weights=None) -> tuple:
        """Return ``(estimators, weights)`` for every stored non-ensemble model.

        Weights are looked up by model name (default 1.0), so they stay
        correct regardless of how many models were trained or in which order.
        A previously built ensemble is never nested into a new one.
        """
        weight_by_name = self.DEFAULT_ENSEMBLE_WEIGHTS if weights is None else weights
        estimators = [
            (name, result['model'])
            for name, result in self.results.items()
            if name != 'ensemble'
        ]
        member_weights = [weight_by_name.get(name, 1.0) for name, _ in estimators]
        return estimators, member_weights

    def build_ensemble(self, weights=None) -> dict:
        """Create a soft-voting ensemble of the models trained so far.

        Args:
            weights: optional mapping of model name to voting weight. Defaults
                to ``DEFAULT_ENSEMBLE_WEIGHTS`` (XGBoost weighted slightly
                higher); unlisted models get 1.0.

        Returns:
            Dict with ``model``, ``test_auc``, ``test_ap`` and ``test_f1``;
            also stored as ``self.results['ensemble']``.

        Raises:
            ValueError: if no base model has been trained yet.
        """
        estimators, member_weights = self._ensemble_members(weights)
        if not estimators:
            raise ValueError("No trained models available to ensemble")

        ensemble = VotingClassifier(
            estimators=estimators,
            voting='soft',
            weights=member_weights
        )
        ensemble.fit(self.X_train, self.y_train)

        y_proba = ensemble.predict_proba(self.X_test)[:, 1]
        results = {
            'model': ensemble,
            'test_auc': roc_auc_score(self.y_test, y_proba),
            'test_ap': average_precision_score(self.y_test, y_proba),
            'test_f1': f1_score(self.y_test, ensemble.predict(self.X_test))
        }
        print(f"\nEnsemble - Test AUC: {results['test_auc']:.4f}")
        self.results['ensemble'] = results
        return results

    @staticmethod
    def _best_f1_point(precisions, recalls, thresholds) -> dict:
        """Pick the precision/recall point with the highest F1 score.

        Expects ``precisions`` and ``recalls`` to have one more element than
        ``thresholds`` (the layout returned by ``precision_recall_curve``);
        that trailing point has no threshold and is excluded from the search.
        """
        precisions = np.asarray(precisions)[:-1]
        recalls = np.asarray(recalls)[:-1]
        # The small epsilon guards against 0/0 when precision and recall are both 0.
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
        optimal_idx = int(np.argmax(f1_scores))
        return {
            'threshold': thresholds[optimal_idx],
            'precision': precisions[optimal_idx],
            'recall': recalls[optimal_idx],
            'f1': f1_scores[optimal_idx]
        }

    def find_optimal_threshold(self, model_name: str = 'ensemble') -> dict:
        """Find the probability threshold maximizing F1 on the test split.

        Args:
            model_name: key in ``self.results`` of the model to evaluate.

        Returns:
            Dict with ``threshold``, ``precision``, ``recall`` and ``f1``.

        Raises:
            KeyError: if ``model_name`` has not been trained/built.
        """
        model = self.results[model_name]['model']
        y_proba = model.predict_proba(self.X_test)[:, 1]
        precisions, recalls, thresholds = precision_recall_curve(self.y_test, y_proba)

        results = self._best_f1_point(precisions, recalls, thresholds)
        print(f"\nOptimal threshold: {results['threshold']:.3f}")
        print(f"Precision: {results['precision']:.3f}, "
              f"Recall: {results['recall']:.3f}, "
              f"F1: {results['f1']:.3f}")
        return results

    def save_best_model(self, path: str = 'models/'):
        """Save the single model with the highest test AUC for deployment.

        The ensemble is deliberately excluded from the comparison; only the
        individually trained models are candidates.

        Args:
            path: output directory, with or without a trailing slash. It is
                created if missing.

        Returns:
            The path of the written ``best_model.joblib`` file, as a string.

        Raises:
            ValueError: if no individual model has been trained yet.
        """
        candidate_auc = {
            name: result['test_auc']
            for name, result in self.results.items()
            if name != 'ensemble'
        }
        # Checked before touching the filesystem so a failed call leaves no
        # empty output directory behind.
        if not candidate_auc:
            raise ValueError("No trained models to save")
        best_name = max(candidate_auc, key=candidate_auc.get)

        out_dir = Path(path)
        out_dir.mkdir(parents=True, exist_ok=True)
        # Joining via Path keeps the file inside ``path`` even when the
        # caller omits the trailing slash.
        model_path = str(out_dir / 'best_model.joblib')
        joblib.dump(self.results[best_name]['model'], model_path)
        print(f"\nBest model ({best_name}) saved to {model_path}")
        return model_path
if __name__ == "__main__":
    # Reuses the Week 1-2 DataPipeline to load the data and build the splits.
    from data_pipeline import DataPipeline

    data_prep = DataPipeline("data/customers.csv")
    data_prep.load_and_validate()
    data_prep.prepare_features()

    splits = (
        data_prep.X_train, data_prep.X_test,
        data_prep.y_train, data_prep.y_test,
    )
    modeler = AdvancedModeler(*splits)

    # Same order as before: base models, ensemble, threshold search, save.
    workflow = (
        modeler.train_xgboost,
        modeler.train_lightgbm,
        modeler.build_ensemble,
        modeler.find_optimal_threshold,
        modeler.save_best_model,
    )
    for run_step in workflow:
        run_step()
Week 3-4: Deployment
# capstone/api/main.py
"""Production API for churn prediction model."""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import Optional
import joblib
import numpy as np
import pandas as pd
from datetime import datetime
import logging
# Log at INFO level so model-loading and prediction messages are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Churn Prediction API",
    description="Predicts customer churn probability",
    version="1.0.0",
)

# CORS is wide open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model loading: both globals are filled in by the startup hook and remain
# None when loading fails.
model = feature_names = None
class PredictionRequest(BaseModel):
    """Request body for ``POST /predict``: raw activity metrics for one customer.

    All fields are required; every numeric field must be non-negative
    (``ge=0``).
    """

    # ``json_schema_extra`` is a Pydantic v2 setting, and v2 deprecates the
    # nested ``class Config`` form, so the configuration is declared through
    # ``model_config`` instead. The example appears in the generated schema.
    model_config = {
        "json_schema_extra": {
            "example": {
                "customer_id": "CUST-001",
                "days_since_signup": 180,
                "total_purchases": 5,
                "total_spent": 299.99,
                "support_tickets": 2,
                "avg_session_minutes": 12.5,
                "days_since_last_purchase": 45
            }
        }
    }

    customer_id: str
    days_since_signup: int = Field(..., ge=0)
    total_purchases: int = Field(..., ge=0)
    total_spent: float = Field(..., ge=0)
    support_tickets: int = Field(..., ge=0)
    avg_session_minutes: float = Field(..., ge=0)
    days_since_last_purchase: int = Field(..., ge=0)
class PredictionResponse(BaseModel):
    """Response body returned by ``POST /predict``."""

    # Echo of the customer identifier from the request.
    customer_id: str
    # Predicted probability of the positive (churn) class.
    churn_probability: float
    # Risk tier derived from the probability: "high", "medium" or "low".
    risk_level: str
    # Human-readable reasons derived from the request values.
    top_factors: list[str]
    # Suggested retention action for the risk tier.
    recommended_action: str
    # Version string of the serving model.
    model_version: str
@app.on_event("startup")
async def load_model():
    """Load the serialized model into the module globals at startup.

    A failure is logged instead of raised, so the app still starts; in that
    case ``model`` keeps its previous value.
    """
    global model, feature_names
    try:
        model = joblib.load("models/best_model.joblib")
        # Not every estimator records its training columns; fall back to None.
        feature_names = getattr(model, 'feature_names_in_', None)
        logger.info("Model loaded successfully")
    except Exception as exc:
        logger.error(f"Failed to load model: {exc}")
@app.get("/health")
def health_check():
    """Report liveness, whether a model is loaded, and the current time.

    ``status`` is always "healthy"; use ``model_loaded`` to tell whether
    predictions can be served.
    """
    is_model_ready = model is not None
    checked_at = datetime.now().isoformat()
    return {
        "status": "healthy",
        "model_loaded": is_model_ready,
        "timestamp": checked_at,
    }
# Risk tiers as (minimum probability, level, recommended action), checked from
# the highest cut-off down; anything below the last cut-off is "low".
_RISK_TIERS = (
    (0.7, "high", "Immediate outreach with retention offer"),
    (0.4, "medium", "Engagement email with usage tips"),
)
_LOW_RISK = ("low", "Standard nurture sequence")


def _engineer_features(request) -> dict:
    """Build the model's feature row from the raw request fields.

    The three derived ratios use the same ``+ 1`` denominators as the
    training pipeline's feature engineering, so serving matches training.
    """
    signup_days = request.days_since_signup
    purchases = request.total_purchases
    return {
        'days_since_signup': signup_days,
        'total_purchases': purchases,
        'total_spent': request.total_spent,
        'support_tickets': request.support_tickets,
        'avg_session_minutes': request.avg_session_minutes,
        'days_since_last_purchase': request.days_since_last_purchase,
        'purchase_frequency': purchases / (signup_days + 1),
        'avg_order_value': request.total_spent / (purchases + 1),
        'avg_days_between_purchases': signup_days / (purchases + 1)
    }


def _classify_risk(churn_prob: float) -> tuple:
    """Map a churn probability to ``(risk_level, recommended_action)``."""
    for cutoff, level, action in _RISK_TIERS:
        if churn_prob >= cutoff:
            return level, action
    return _LOW_RISK


def _top_factors(request) -> list:
    """Return rule-based explanations (simplified; not model attributions)."""
    factors = []
    if request.days_since_last_purchase > 30:
        factors.append(f"No purchase in {request.days_since_last_purchase} days")
    if request.support_tickets > 2:
        factors.append(f"{request.support_tickets} support tickets")
    if request.avg_session_minutes < 5:
        factors.append("Low engagement sessions")
    return factors


@app.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest):
    """Score one customer and return the churn probability with a risk tier.

    Raises:
        HTTPException: 503 when no model is loaded; 500 when feature
            construction or inference fails. The 500 detail is generic; the
            underlying error and traceback go to the server log only.
    """
    if model is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        X = pd.DataFrame([_engineer_features(request)])
        if feature_names is not None:
            # Select and order the columns exactly as the model was trained.
            X = X[feature_names]
        # Column 1 holds the probability of the positive (churn) class.
        churn_prob = float(model.predict_proba(X)[0][1])
    except Exception as exc:
        # Log the full traceback server-side, but do not echo internal error
        # text back to API clients.
        logger.exception("Prediction error: %s", exc)
        raise HTTPException(status_code=500, detail="Prediction failed") from exc

    risk_level, action = _classify_risk(churn_prob)
    return PredictionResponse(
        customer_id=request.customer_id,
        churn_probability=round(churn_prob, 4),
        risk_level=risk_level,
        top_factors=_top_factors(request),
        recommended_action=action,
        model_version="1.0.0"
    )
@app.get("/model/info")
def model_info():
    """Describe the loaded model: class name, feature list and version.

    ``model_type`` is "NoneType" when no model has been loaded, and
    ``features`` is "unknown" when the model exposes no feature names.
    """
    # feature_names comes from the estimator's ``feature_names_in_``, which
    # scikit-learn stores as a NumPy array; truth-testing an array with more
    # than one element raises ValueError, so compare against None explicitly.
    if feature_names is not None:
        features = [str(name) for name in feature_names]
    else:
        features = "unknown"
    return {
        "model_type": type(model).__name__,
        "features": features,
        "version": "1.0.0",
        "trained_at": "2024-01-15"
    }
# capstone/Dockerfile
FROM python:3.10-slim

WORKDIR /app

# Install dependencies before copying the code so this layer is reused when
# only the application files change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY api/ ./api/
COPY models/ ./models/

EXPOSE 8000

# The slim base image does not include curl, so the probe uses the Python
# interpreter that is already present. urlopen raises (non-zero exit) on a
# connection failure or an HTTP error status.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)" || exit 1

CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]
Week 4-6: Documentation & Writeup
Definition: Project Documentation Structure
A comprehensive documentation structure for capstone projects.
Architecture Diagram
┌──────────────────────────────────────────────────────────────┐
│ Project Documentation Structure │
│ │
│ README.md │
│ ├── Problem Statement & Business Context │
│ ├── Approach Overview (high-level) │
│ ├── Results Summary (key metrics) │
│ ├── How to Run (quick start) │
│ └── Links (demo, blog post, slides) │
│ │
│ docs/ │
│ ├── technical_report.md │
│ │ ├── Data Description │
│ │ ├── Exploratory Analysis │
│ │ ├── Methodology │
│ │ ├── Model Selection & Tuning │
│ │ ├── Evaluation Results │
│ │ └── Limitations & Future Work │
│ │ │
│ ├── deployment_guide.md │
│ │ ├── Architecture Overview │
│ │ ├── API Documentation │
│ │ ├── Monitoring Setup │
│ │ └── Scaling Considerations │
│ │ │
│ └── presentation.pdf │
│ ├── Slide deck for stakeholders │
│ └── 10-20-30 format │
│ │
│ notebooks/ │
│ ├── 01_EDA.ipynb │
│ ├── 02_Feature_Engineering.ipynb │
│ ├── 03_Modeling.ipynb │
│ └── 04_Evaluation.ipynb │
└──────────────────────────────────────────────────────────────┘
# docs/technical_report.md
## 1. Executive Summary
We built a customer churn prediction model that identifies at-risk
customers 30 days before they leave. The model achieves 87% AUC-ROC
and identifies 80% of churners with only 15% false positive rate.
**Business Impact**: At scale, this system could save ~$1.8M annually
through targeted retention interventions.
## 2. Methodology
### 2.1 Data
- **Source**: Company CRM + product usage data
- **Size**: 50,000 customers, 12 months
- **Target**: Binary churn (inactive for 30+ days)
### 2.2 Feature Engineering
Key derived features:
- `purchase_frequency`: Purchases per day since signup
- `avg_order_value`: Total spent / (number of purchases + 1); the +1 avoids division by zero
- `engagement_trend`: Slope of session minutes over time
- `support_intensity`: Tickets per month of activity
### 2.3 Model Selection
| Model | CV AUC | Test AUC | Inference |
|-------|--------|----------|-----------|
| Logistic Regression | 0.78 | 0.77 | <1ms |
| Random Forest | 0.83 | 0.82 | <5ms |
| XGBoost | 0.87 | 0.86 | <10ms |
| LightGBM | 0.86 | 0.85 | <5ms |
| **Ensemble** | **0.88** | **0.87** | <20ms |
### 2.4 Final Model
- **Algorithm**: XGBoost + LightGBM soft voting ensemble
- **Threshold**: 0.42 (optimized for F1)
- **Features**: 12 engineered features from 8 raw columns
## 3. Results
### 3.1 Performance
```
              precision    recall  f1-score   support

    Retained       0.93      0.92      0.92      8000
     Churned       0.72      0.75      0.73      2000

    accuracy                           0.89     10000
```
Architecture Diagram
### 3.2 Business Impact
- **High-risk customers identified**: 80% recall at 15% FPR
- **Estimated saves**: 250 customers/month at 40% save rate
- **Revenue retained**: ~$150K/month = $1.8M annually
- **Intervention cost**: ~$25K/month (discounts + outreach)
## 4. Limitations
1. Model trained on historical data; may not capture sudden market shifts
2. Features limited to available data sources
3. Assumes interventions have consistent effectiveness
## 5. Future Work
- [ ] Real-time feature computation pipeline
- [ ] Causal inference for optimal intervention timing
- [ ] Multi-class prediction (reasons for churn)
- [ ] Integration with marketing automation platform
Project Checklist
Architecture Diagram
□ Problem Definition
□ Clear problem statement written
□ Business impact quantified
□ Success criteria defined
□ Data Pipeline
□ Data loaded and validated
□ EDA completed with visualizations
□ Feature engineering documented
□ Train/test split done properly
□ Modeling
□ Baseline model established
□ Multiple approaches tried
□ Hyperparameter tuning performed
□ Ensemble considered
□ Model interpretation done (SHAP, feature importance)
□ Deployment
□ API built with FastAPI
□ Input validation with Pydantic
□ Error handling implemented
□ Docker containerization done
□ Health check endpoint working
□ Documentation
□ README with overview and quick start
□ Technical report with methodology
□ Code well-commented and documented
□ Notebooks are clear and reproducible
□ Presentation slides created
□ Portfolio
□ Project added to GitHub profile
□ Blog post written
□ LinkedIn post about the project
□ Demo available (Streamlit, HuggingFace, etc.)
Presentation Template
# Seven-slide outline for the capstone talk. Slide 1 is just a title string;
# every other slide pairs a headline with the visual that supports it.
capstone_presentation = {
    "slide_1_title": "Customer Churn Prediction System",
    # Problem -> approach -> results -> impact
    "slide_2_problem": {
        "headline": "$2M Annual Revenue Loss from Preventable Churn",
        "visual": "Line chart showing churn trend",
    },
    "slide_3_approach": {
        "headline": "ML Model Identifies At-Risk Customers 30 Days Early",
        "visual": "Architecture diagram",
    },
    "slide_4_results": {
        "headline": "87% AUC, 80% Recall at 15% False Positive Rate",
        "visual": "ROC curve + confusion matrix",
    },
    "slide_5_impact": {
        "headline": "$1.8M Annual Savings with Targeted Interventions",
        "visual": "ROI calculation",
    },
    # Demo and roadmap close the talk.
    "slide_6_demo": {
        "headline": "Live Demo",
        "visual": "API call + prediction output",
    },
    "slide_7_next_steps": {
        "headline": "Roadmap: Real-time Pipeline → Causal Inference → Automation",
        "visual": "Timeline",
    },
}
Key Takeaways
📋 Summary: Capstone Project
- A great capstone solves a real business problem with measurable impact -- always quantify the "so what?"
- End-to-end matters: data -> model -> deployment -> documentation -- this demonstrates production readiness
- Start with a simple baseline, then iterate with complexity -- this shows disciplined engineering
- Deployment is not optional -- it proves you can ship, which is what separates practitioners from theorists
- Documentation is as important as code -- it's how others understand, reproduce, and extend your work
- Tell a story: problem -> approach -> results -> impact -> next steps -- narrative structure makes technical work accessible
- Make it accessible: non-technical stakeholders should understand the value -- this is the ultimate test of communication
Practice Exercises
- Define your problem: Write a one-paragraph problem statement using the template
- Build a baseline: Start with the simplest model that works
- Deploy it: Get your model serving predictions via API
- Write it up: Create a 2-page technical summary
- Present it: Record a 5-minute video walkthrough
- Share it: Publish on GitHub with a blog post
- Iterate: Add one improvement based on feedback