Data Science Capstone Masterclass

This is the culminating project that brings together everything you've learned. Follow this complete framework to execute an end-to-end data science project that stands out.

Project Lifecycle

\mathcal{L}_{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2

Phase 1: Problem Framing

# Problem framing template
# Captures the business context, objective, measurable success criteria and
# explicit scope boundaries for the churn project in one structure.
problem_statement = {
    "business_context": """
    E-commerce company X loses $5M annually to customer churn.
    Current retention efforts are untargeted, spending equally on all customers.
    """,
    "objective": """
    Build a churn prediction model to identify at-risk customers
    30 days before churn, enabling targeted retention campaigns.
    """,
    "success_metrics": {
        # The comparison sign was mis-encoded in the original text; the
        # target is "greater than or equal to 0.85".
        "primary": "AUC-ROC ≥ 0.85 on held-out test set",
        "business": "15% reduction in churn rate within 6 months",
        "constraints": [
            "Prediction latency < 100ms",
            "Must be interpretable for marketing team",
            "Cannot use PII in model features"
        ]
    },
    "scope": {
        "in": [
            "Customer behavior features",
            "Product interaction data",
            "Support ticket history"
        ],
        "out": [
            "Competitor pricing",
            "Customer survey responses"
        ]
    }
}

# Stakeholder alignment checklist
# Each entry starts with a check mark (U+2713); the original text carried a
# mis-decoded byte sequence in its place.
stakeholder_checklist = [
    "✓ Business problem clearly defined",
    "✓ Success metrics agreed upon",
    "✓ Data availability confirmed",
    "✓ Timeline and milestones set",
    "✓ Risks and dependencies identified",
    "✓ Go/No-go criteria established"
]

Phase 2: Data Collection & EDA

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

class DataExploration:
    """Exploratory-analysis helper around a single CSV-backed DataFrame.

    Attributes:
        df: DataFrame loaded from ``data_path`` with ``pd.read_csv``.
        report: Result of the most recent ``generate_eda_report`` call
            (an empty dict until that method has been called).
    """

    def __init__(self, data_path):
        """Load the dataset at ``data_path`` into ``self.df``."""
        self.df = pd.read_csv(data_path)
        self.report = {}

    def _numeric_frame(self):
        """Return only the numeric columns of ``self.df``."""
        return self.df.select_dtypes(include=[np.number])

    def data_overview(self):
        """Print and return a structural summary of the dataset.

        Returns:
            dict with keys ``shape``, ``dtypes``, ``missing`` (null count per
            column), ``missing_pct``, ``duplicates`` (fully duplicated rows)
            and ``memory_mb`` (deep memory usage in MiB).
        """
        missing_counts = self.df.isnull().sum()
        overview = {
            "shape": self.df.shape,
            "dtypes": self.df.dtypes.to_dict(),
            "missing": missing_counts.to_dict(),
            "missing_pct": (missing_counts / len(self.df) * 100).to_dict(),
            # Plain Python numbers so the overview can be serialized as-is.
            "duplicates": int(self.df.duplicated().sum()),
            "memory_mb": float(self.df.memory_usage(deep=True).sum() / 1024**2)
        }

        print(f"Dataset: {overview['shape'][0]:,} rows × {overview['shape'][1]} columns")
        print(f"Missing values: {sum(v > 0 for v in overview['missing'].values())} columns")
        print(f"Duplicates: {overview['duplicates']:,}")
        print(f"Memory: {overview['memory_mb']:.1f} MB")

        return overview

    def univariate_analysis(self, columns=None):
        """Plot a histogram and box plot for each column and print its stats.

        Args:
            columns: Iterable of column names to analyze. Defaults to every
                numeric column in ``self.df``.
        """
        if columns is None:
            columns = self._numeric_frame().columns

        for col in columns:
            series = self.df[col]
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))

            # Distribution
            series.hist(ax=axes[0], bins=30, edgecolor='black')
            axes[0].set_title(f'{col} Distribution')
            axes[0].set_xlabel(col)

            # Box plot
            self.df.boxplot(column=col, ax=axes[1])
            axes[1].set_title(f'{col} Box Plot')

            plt.tight_layout()
            plt.show()

            print(f"\n{col}:")
            print(f"  Mean: {series.mean():.2f}")
            print(f"  Std: {series.std():.2f}")
            print(f"  Skew: {series.skew():.2f}")

    def bivariate_analysis(self, target_col):
        """Print and plot the numeric features most correlated with the target.

        Prints the ten largest correlations with ``target_col`` and draws
        scatter plots of the top six features against it.

        Args:
            target_col: Name of a numeric column in ``self.df``.

        Raises:
            KeyError: If ``target_col`` is not a numeric column.
        """
        numeric_df = self._numeric_frame()
        if target_col not in numeric_df.columns:
            raise KeyError(f"{target_col} is not a numeric column of the dataset")

        correlations = numeric_df.corr()[target_col].drop(target_col)
        correlations = correlations.sort_values(ascending=False)

        print(f"\nTop correlations with {target_col}:")
        print(correlations.head(10))

        # Plot top correlations
        top_features = correlations.head(6).index
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))

        for idx, feat in enumerate(top_features):
            ax = axes[idx // 3, idx % 3]
            self.df.plot.scatter(x=feat, y=target_col, ax=ax, alpha=0.3)
            ax.set_title(f'{feat} vs {target_col}')

        # With fewer than six numeric features, hide the panels left empty.
        for unused_ax in axes.flat[len(top_features):]:
            unused_ax.set_visible(False)

        plt.tight_layout()
        plt.show()

    def generate_eda_report(self, target_col='churned'):
        """Assemble the overview, correlations, quality issues and insights.

        The result is also stored on ``self.report``.

        Args:
            target_col: Name of the target column (default ``'churned'``).

        Returns:
            dict with keys ``data_overview``, ``target_distribution``,
            ``feature_correlations``, ``data_quality_issues`` and ``insights``.
        """
        report = {
            "data_overview": self.data_overview(),
            "target_distribution": self.df[target_col].value_counts().to_dict(),
            "feature_correlations": self._compute_correlations(),
            "data_quality_issues": self._identify_quality_issues(),
            "insights": self._extract_insights(target_col)
        }

        self.report = report
        return report

    def _compute_correlations(self):
        """Return the pairwise correlation matrix of numeric columns as a dict."""
        return self._numeric_frame().corr().to_dict()

    def _identify_quality_issues(self):
        """List columns with more than 100 unique values or |skew| above 2."""
        issues = []

        # High cardinality
        for col in self.df.columns:
            unique_count = self.df[col].nunique()
            if unique_count > 100:
                issues.append(f"High cardinality: {col} ({unique_count} unique)")

        # Skewed features
        numeric_df = self._numeric_frame()
        for col in numeric_df.columns:
            skew = numeric_df[col].skew()
            if abs(skew) > 2:
                issues.append(f"Highly skewed: {col} (skew={skew:.2f})")

        return issues

    def _extract_insights(self, target_col='churned'):
        """Describe features whose correlation with the target exceeds ±0.3.

        Only numeric columns are correlated: calling ``DataFrame.corr()`` on a
        frame that still holds string columns raises in recent pandas.
        """
        insights = []

        # Correlation insights
        corr_with_target = self._numeric_frame().corr()[target_col].drop(target_col)
        strong_pos = corr_with_target[corr_with_target > 0.3]
        strong_neg = corr_with_target[corr_with_target < -0.3]

        if len(strong_pos) > 0:
            insights.append(f"Strong positive correlations: {list(strong_pos.index)}")
        if len(strong_neg) > 0:
            insights.append(f"Strong negative correlations: {list(strong_neg.index)}")

        return insights

# Usage
# Load the customer CSV, print the structural overview, then plot the three
# billing/tenure columns and the correlations against the churn label.
eda = DataExploration(data_path="customer_data.csv")
overview = eda.data_overview()
eda.univariate_analysis(columns=['tenure_months', 'monthly_charges', 'total_charges'])
eda.bivariate_analysis(target_col='churned')
# Full report dictionary (overview, correlations, quality issues, insights).
report = eda.generate_eda_report()

Phase 3: Feature Engineering & Modeling

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

class FeatureEngineer:
    """Derive churn-model features from raw customer columns.

    Attributes:
        feature_names: Names of the columns added by the most recent
            ``create_features`` call (empty until it has been called).
    """

    # Columns appended by create_features, in creation order.
    _ENGINEERED_FEATURES = (
        'tenure_group',
        'usage_trend_30d',
        'support_ticket_rate',
        'payment_delay_avg',
        'has_payment_issue',
        'ltv_to_date',
        'monthly_trend',
    )

    def __init__(self):
        self.feature_names = []

    def create_features(self, df):
        """Create predictive features from raw data.

        Args:
            df: DataFrame with columns ``tenure_months``, ``usage_last_7d``,
                ``usage_last_30d``, ``support_tickets_90d``,
                ``avg_days_past_due``, ``total_revenue``, ``revenue_last_3m``
                and ``revenue_first_3m``. It is not modified.

        Returns:
            A copy of ``df`` with the engineered columns appended.
        """
        features = df.copy()

        # Tenure features
        # include_lowest=True keeps brand-new customers (tenure 0) in the
        # first bucket; with right-closed bins they would otherwise be NaN.
        features['tenure_group'] = pd.cut(
            features['tenure_months'],
            bins=[0, 12, 24, 48, np.inf],
            labels=['0-1y', '1-2y', '2-4y', '4y+'],
            include_lowest=True
        )

        # Engagement features
        # Last-week usage relative to the average week of the last 30 days;
        # the epsilon avoids division by zero for inactive customers.
        features['usage_trend_30d'] = (
            features['usage_last_7d'] / (features['usage_last_30d'] / 4 + 1e-8)
        )

        # The +1 keeps the denominator non-zero for tenure 0.
        features['support_ticket_rate'] = (
            features['support_tickets_90d'] / (features['tenure_months'] + 1)
        )

        # Payment features
        features['payment_delay_avg'] = features['avg_days_past_due']
        features['has_payment_issue'] = (features['payment_delay_avg'] > 7).astype(int)

        # Value features
        features['ltv_to_date'] = features['total_revenue'] / (features['tenure_months'] + 1)
        # Difference between average monthly revenue in the last and first
        # three months.
        features['monthly_trend'] = (
            features['revenue_last_3m'] / 3 - features['revenue_first_3m'] / 3
        )

        self.feature_names = list(self._ENGINEERED_FEATURES)
        return features

    def transform(self, df):
        """Alias of ``create_features`` under the scikit-learn style name."""
        return self.create_features(df)

    def get_feature_names(self):
        """Return the engineered column names from the last ``create_features`` call."""
        return self.feature_names

class ModelPipeline:
    """Preprocessing and a LightGBM classifier wrapped in one scikit-learn Pipeline.

    Attributes:
        pipeline: The Pipeline created by ``build_pipeline`` (``None`` before).
        model: The fitted classifier step, set by ``train`` (``None`` before).
    """

    def __init__(self):
        self.pipeline = None
        self.model = None

    def _require_pipeline(self):
        """Return the pipeline, failing clearly if it has not been built."""
        if self.pipeline is None:
            raise RuntimeError("Pipeline not built; call build_pipeline() first")
        return self.pipeline

    def build_pipeline(self, numeric_features, categorical_features):
        """Build preprocessing + model pipeline.

        Args:
            numeric_features: Column names to standardize.
            categorical_features: Column names to one-hot encode; categories
                unseen during fitting are ignored at prediction time.

        Returns:
            The unfitted Pipeline, also stored on ``self.pipeline``.
        """
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ]
        )

        self.pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', lgb.LGBMClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                # LightGBM only applies row subsampling when the bagging
                # frequency is positive; with the default of 0 the
                # subsample=0.8 setting above is silently ignored.
                subsample_freq=1,
                colsample_bytree=0.8,
                random_state=42,
                is_unbalance=True
            ))
        ])

        return self.pipeline

    def train(self, X_train, y_train, X_val=None, y_val=None):
        """Fit the pipeline and report 5-fold cross-validated ROC AUC.

        No early stopping is performed. The pipeline is fitted on the full
        training data first; cross-validation then scores clones of it, so
        the fitted pipeline stays usable for prediction.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_val: Optional validation features.
            y_val: Optional validation labels.

        Returns:
            dict with ``cv_mean`` and ``cv_std``; when both ``X_val`` and
            ``y_val`` are given it also contains ``val_auc``.

        Raises:
            RuntimeError: If ``build_pipeline`` has not been called.
        """
        pipeline = self._require_pipeline()
        pipeline.fit(X_train, y_train)
        self.model = pipeline.named_steps['classifier']

        # Cross-validation
        cv_scores = cross_val_score(
            pipeline, X_train, y_train,
            cv=5, scoring='roc_auc', n_jobs=-1
        )

        results = {
            "cv_mean": cv_scores.mean(),
            "cv_std": cv_scores.std()
        }

        if X_val is not None and y_val is not None:
            from sklearn.metrics import roc_auc_score

            val_scores = pipeline.predict_proba(X_val)[:, 1]
            results["val_auc"] = roc_auc_score(y_val, val_scores)

        return results

    def predict(self, X):
        """Generate class predictions for ``X``."""
        return self._require_pipeline().predict(X)

    def predict_proba(self, X):
        """Return the predicted probability of the positive class for ``X``."""
        return self._require_pipeline().predict_proba(X)[:, 1]

# Training pipeline
def run_training_pipeline(data_path):
    """Train and evaluate the churn model end to end.

    Loads the CSV, engineers features, makes a stratified 80/20 split,
    trains the model pipeline and prints cross-validation and test metrics.

    Args:
        data_path: Path of a CSV containing ``customer_id``, ``churned`` and
            the raw columns read by ``FeatureEngineer.create_features``.

    Returns:
        ``(pipeline, results)`` where ``pipeline`` is the trained
        ``ModelPipeline`` and ``results`` holds ``cv_mean`` and ``cv_std``
        plus the held-out ``test_auc``, ``y_test`` and ``y_pred_proba``.
    """
    from sklearn.metrics import roc_auc_score, classification_report

    # Load and prepare data
    df = pd.read_csv(data_path)

    engineer = FeatureEngineer()
    features = engineer.create_features(df)

    # Split features and target
    X = features.drop(['customer_id', 'churned'], axis=1)
    y = features['churned']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Build and train model
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    pipeline = ModelPipeline()
    pipeline.build_pipeline(numeric_features, categorical_features)

    results = pipeline.train(X_train, y_train)

    # Evaluate
    y_pred_proba = pipeline.predict_proba(X_test)
    y_pred = pipeline.predict(X_test)

    test_auc = roc_auc_score(y_test, y_pred_proba)

    # Keep the held-out outputs: they were previously printed and dropped,
    # leaving later evaluation code with nothing to work on.
    results["test_auc"] = test_auc
    results["y_test"] = y_test
    results["y_pred_proba"] = y_pred_proba

    print(f"CV AUC: {results['cv_mean']:.4f} ± {results['cv_std']:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return pipeline, results

pipeline, results = run_training_pipeline("customer_data.csv")
# The Phase 4 evaluation snippet reads these two names at module level.
y_test, y_pred_proba = results["y_test"], results["y_pred_proba"]

Phase 4: Evaluation & Iteration

from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import numpy as np

class ModelEvaluator:
    """Evaluate binary churn predictions from statistical and business angles.

    Attributes:
        y_true: Ground-truth labels as passed in (expected to be 0/1).
        y_pred_proba: Predicted positive-class probabilities as a float array.
        y_pred: Hard 0/1 predictions, ``y_pred_proba >= threshold``.
        threshold: Decision threshold used to derive ``y_pred``.
    """

    def __init__(self, y_true, y_pred_proba, threshold=0.5):
        """Store the labels and scores and derive thresholded predictions.

        Args:
            y_true: Array-like of 0/1 labels.
            y_pred_proba: Array-like of positive-class probabilities; plain
                lists are accepted.
            threshold: Probability at or above which a sample is predicted
                positive.
        """
        self.y_true = y_true
        # Converting up front lets plain lists be compared with the threshold.
        self.y_pred_proba = np.asarray(y_pred_proba, dtype=float)
        self.y_pred = (self.y_pred_proba >= threshold).astype(int)
        self.threshold = threshold

    def _confusion(self):
        """Return the 2x2 confusion matrix with rows/columns ordered 0 then 1.

        Fixing ``labels`` keeps the matrix 2x2 even when only one class is
        present in the data.
        """
        return confusion_matrix(self.y_true, self.y_pred, labels=[0, 1])

    def full_evaluation(self):
        """Comprehensive model evaluation.

        Returns:
            dict with ``auc_roc``, ``avg_precision``, ``classification_report``
            (as a dict), ``confusion_matrix`` (nested lists) and ``business``.
        """
        metrics = {
            "auc_roc": roc_auc_score(self.y_true, self.y_pred_proba),
            "avg_precision": average_precision_score(self.y_true, self.y_pred_proba),
            "classification_report": classification_report(
                self.y_true, self.y_pred, output_dict=True
            ),
            "confusion_matrix": self._confusion().tolist()
        }

        # Business metrics
        metrics["business"] = self._compute_business_metrics()

        return metrics

    def _compute_business_metrics(self, cost_per_retention_offer=50,
                                  value_of_retained_customer=500):
        """Translate model metrics to business impact.

        Every predicted positive receives an offer, and every true positive
        is counted as retained.

        Args:
            cost_per_retention_offer: Cost in dollars of one offer.
            value_of_retained_customer: Value in dollars of one retained
                customer.

        Returns:
            dict with customer counts and the expected savings, costs and
            net impact in dollars.
        """
        tn, fp, fn, tp = self._confusion().ravel()

        retained_customers = tp  # True positives we can target
        wasted_offers = fp  # False positives (wasted offers)
        missed_churners = fn  # False negatives (missed opportunities)

        expected_savings = retained_customers * value_of_retained_customer
        expected_costs = (retained_customers + wasted_offers) * cost_per_retention_offer
        net_impact = expected_savings - expected_costs

        return {
            "retained_customers": int(retained_customers),
            "wasted_offers": int(wasted_offers),
            "missed_churners": int(missed_churners),
            "expected_savings": float(expected_savings),
            "expected_costs": float(expected_costs),
            "net_impact": float(net_impact)
        }

    def plot_evaluation(self):
        """Draw ROC, precision-recall, confusion-matrix and F1-vs-threshold plots.

        The 2x2 figure is saved to ``evaluation_report.png`` and then shown.
        """
        from sklearn.metrics import f1_score, roc_curve

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # ROC Curve
        fpr, tpr, _ = roc_curve(self.y_true, self.y_pred_proba)
        axes[0, 0].plot(fpr, tpr, label=f'AUC = {roc_auc_score(self.y_true, self.y_pred_proba):.3f}')
        axes[0, 0].plot([0, 1], [0, 1], 'k--')
        axes[0, 0].set_xlabel('False Positive Rate')
        axes[0, 0].set_ylabel('True Positive Rate')
        axes[0, 0].set_title('ROC Curve')
        axes[0, 0].legend()

        # Precision-Recall Curve
        precision, recall, _ = precision_recall_curve(self.y_true, self.y_pred_proba)
        axes[0, 1].plot(recall, precision)
        axes[0, 1].set_xlabel('Recall')
        axes[0, 1].set_ylabel('Precision')
        axes[0, 1].set_title('Precision-Recall Curve')

        # Confusion Matrix
        sns.heatmap(self._confusion(), annot=True, fmt='d', cmap='Blues', ax=axes[1, 0])
        axes[1, 0].set_xlabel('Predicted')
        axes[1, 0].set_ylabel('Actual')
        axes[1, 0].set_title('Confusion Matrix')

        # Threshold Analysis
        thresholds = np.arange(0.1, 0.9, 0.05)
        f1_scores = [
            f1_score(self.y_true, (self.y_pred_proba >= t).astype(int))
            for t in thresholds
        ]

        axes[1, 1].plot(thresholds, f1_scores)
        axes[1, 1].axvline(x=self.threshold, color='r', linestyle='--', label=f'Current: {self.threshold}')
        axes[1, 1].set_xlabel('Threshold')
        axes[1, 1].set_ylabel('F1 Score')
        axes[1, 1].set_title('F1 Score vs Threshold')
        axes[1, 1].legend()

        plt.tight_layout()
        plt.savefig('evaluation_report.png', dpi=150)
        plt.show()

# Run evaluation
# y_test and y_pred_proba must already be bound in this scope.
evaluator = ModelEvaluator(y_true=y_test, y_pred_proba=y_pred_proba)
metrics = evaluator.full_evaluation()
evaluator.plot_evaluation()

# Headline numbers: ranking quality first, then the dollar estimate.
for summary_line in (
    f"AUC-ROC: {metrics['auc_roc']:.4f}",
    f"Expected Net Impact: ${metrics['business']['net_impact']:,.0f}",
):
    print(summary_line)

Phase 5: Deployment

# Dockerfile
# Image definition for the prediction service, held here as a Python string.
# It installs requirements.txt, copies model/, app.py and config.yaml into
# /app, exposes port 8000 and serves "app:app" with uvicorn on 0.0.0.0:8000.
# The string starts with a newline because of the opening triple quote.
dockerfile = """
FROM python:3.10-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY model/ ./model/
COPY app.py .
COPY config.yaml .

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
"""

# FastAPI deployment
from fastapi import FastAPI
from pydantic import BaseModel
import pandas as pd
import joblib

# ASGI application object; the Dockerfile's uvicorn command serves it as "app:app".
app = FastAPI(title="Churn Prediction API")

# Both artifacts are deserialized once at import time and reused per request.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
model = joblib.load("model/churn_model.pkl")
feature_engineer = joblib.load("model/feature_engineer.pkl")

# Request body schema for POST /predict: one customer's raw attributes.
# NOTE(review): FeatureEngineer.create_features in this file also reads
# avg_days_past_due, revenue_last_3m and revenue_first_3m, which are not
# fields here - confirm the serialized feature engineer accepts this schema.
class CustomerData(BaseModel):
    tenure_months: float
    monthly_charges: float
    total_charges: float
    usage_last_7d: float
    usage_last_30d: float
    support_tickets_90d: int
    payment_delay_avg: float
    total_revenue: float

@app.post("/predict")
def predict_churn(customer: CustomerData):
    # Scores one customer: request body -> one-row frame -> engineered
    # features -> positive-class probability -> risk band and action.
    # NOTE(review): indexing [0, 1] assumes predict_proba returns a 2-D
    # (rows, classes) array - confirm against the serialized model.
    payload = customer.dict()
    frame = pd.DataFrame([payload])

    engineered = feature_engineer.transform(frame)
    churn_score = model.predict_proba(engineered)[0, 1]

    # Bands: above 0.7 is high, above 0.3 is medium, everything else is low.
    if churn_score > 0.7:
        risk_level = "high"
    elif churn_score > 0.3:
        risk_level = "medium"
    else:
        risk_level = "low"

    response = {
        "churn_probability": float(churn_score),
        "risk_level": risk_level,
        "recommended_action": get_recommendation(risk_level)
    }
    return response

def get_recommendation(risk_level):
    """Return the retention action for a risk level ("Monitor" if unrecognized)."""
    action_by_risk = dict(
        high="Immediate outreach with retention offer",
        medium="Engagement campaign with personalized content",
        low="Standard nurture sequence",
    )
    try:
        return action_by_risk[risk_level]
    except KeyError:
        # Any level outside high/medium/low falls back to passive monitoring.
        return "Monitor"

@app.get("/health")
def health():
    # Liveness probe: always reports a fixed status payload.
    return dict(status="healthy")

Phase 6: Presentation

# Presentation structure
# Seven-slide outline keyed by "slide_<n>_<topic>". The title slide carries
# title/subtitle/author/date; every other slide has a title and a list of
# bullet strings under "content".
# NOTE(review): "Your Name", "Date" and the numeric claims in the bullets are
# hard-coded strings, not values derived from the evaluation code - replace
# them with measured results before presenting.
presentation_outline = {
    "slide_1_title": {
        "title": "Customer Churn Prediction System",
        "subtitle": "Identifying at-risk customers for targeted retention",
        "author": "Your Name",
        "date": "Date"
    },
    "slide_2_problem": {
        "title": "Business Problem",
        "content": [
            "$5M annual revenue loss from churn",
            "Untargeted retention spending",
            "Need for predictive, actionable system"
        ]
    },
    "slide_3_approach": {
        "title": "Our Approach",
        "content": [
            "End-to-end ML pipeline",
            "Feature engineering from behavioral data",
            "LightGBM model with 0.89 AUC",
            "Real-time scoring API"
        ]
    },
    "slide_4_results": {
        "title": "Results",
        "content": [
            "AUC-ROC: 0.89 (vs 0.82 baseline)",
            "Precision @ 10%: 72%",
            "Estimated impact: $2.3M annual savings",
            "Latency: <50ms per prediction"
        ]
    },
    "slide_5_demo": {
        "title": "Live Demo",
        "content": [
            "Walk through the prediction API",
            "Show the dashboard",
            "Demonstrate interpretability"
        ]
    },
    "slide_6_business_impact": {
        "title": "Business Impact",
        "content": [
            "Targeted campaigns for top 10% risk",
            "30% reduction in churn for targeted segment",
            "ROI: 4.6x within 6 months",
            "Scalable to other retention use cases"
        ]
    },
    "slide_7_next_steps": {
        "title": "Next Steps",
        "content": [
            "A/B test retention campaigns",
            "Add real-time features from streaming data",
            "Build recommendation engine for offers",
            "Expand to other customer segments"
        ]
    }
}

# Key metrics to highlight
# Headline figures grouped by audience: model quality, business outcome and
# operational cost. Units follow the key names (ms, hours, MB).
# NOTE(review): these are literal values - presumably annual_savings is in
# dollars and the rate/precision/recall entries are fractions; confirm and
# update from real evaluation output.
key_metrics = {
    "model_performance": {
        "auc_roc": 0.89,
        "precision_at_10pct": 0.72,
        "recall_at_10pct": 0.45
    },
    "business_impact": {
        "annual_savings": 2300000,
        "retention_rate_improvement": 0.15,
        "roi": 4.6
    },
    "technical_metrics": {
        "inference_latency_ms": 45,
        "training_time_hours": 2.5,
        "model_size_mb": 15
    }
}

Final Checklist

# Final sign-off checklist, grouped by project phase. Each entry starts with
# a check mark (U+2713); the original text carried a mis-decoded byte
# sequence in its place.
capstone_checklist = {
    "problem_framing": [
        "✓ Business problem clearly defined",
        "✓ Success metrics agreed with stakeholders",
        "✓ Scope and constraints documented",
        "✓ Timeline and milestones set"
    ],
    "data": [
        "✓ Data sources identified and accessed",
        "✓ EDA completed with insights documented",
        "✓ Data quality issues addressed",
        "✓ Feature engineering rationale explained"
    ],
    "modeling": [
        "✓ Multiple approaches compared",
        "✓ Cross-validation performed",
        "✓ Final model selected with justification",
        "✓ Interpretability analysis completed"
    ],
    "evaluation": [
        "✓ Test set performance reported",
        "✓ Business metrics computed",
        "✓ Edge cases analyzed",
        "✓ Failure modes documented"
    ],
    "deployment": [
        "✓ API endpoints working",
        "✓ Docker containerization complete",
        "✓ Health checks implemented",
        "✓ Monitoring configured"
    ],
    "documentation": [
        "✓ README with clear instructions",
        "✓ Code well-documented",
        "✓ Architecture diagram included",
        "✓ Results reproducible"
    ],
    "presentation": [
        "✓ Story arc compelling",
        "✓ Slides clean and focused",
        "✓ Demo working",
        "✓ Q&A preparation complete"
    ]
}

Key Takeaways

Spend 40% of time on problem framing – a well-defined problem is half-solved
Document everything – your future self and team will thank you
Start simple – baseline first, then iterate
Quantify business impact – this is what stakeholders care about
Practice your presentation – rehearse until it flows naturally