Explainable AI (XAI)

Black-box models achieve high accuracy but resist understanding. Explainable AI makes predictions transparent – showing why a model made a specific decision, which features matter most, and how to change outcomes. Trust and compliance require explainability.

SHAP Waterfall Explanation

Why XAI Matters

A loan application is denied. The applicant deserves to know why. A doctor needs to trust a diagnosis model. Regulators require explanations. XAI transforms opaque predictions into actionable insights.

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

Generate Interpretable Dataset

# Seed NumPy's global RNG so the synthetic data below is reproducible.
# NOTE: the draws happen in dict-literal order; reordering the columns
# would change every generated value.
np.random.seed(42)
n = 2000

# Synthetic loan-applicant features; each column is drawn independently
# and the bounded ones are clipped to the ranges given.
X = pd.DataFrame({
    'income': np.random.lognormal(10, 0.7, n),
    'age': np.random.normal(40, 12, n).clip(18, 80),
    'credit_score': np.random.normal(650, 80, n).clip(300, 850),
    'debt_ratio': np.random.beta(2, 5, n),
    'years_employed': np.random.exponential(8, n).clip(0, 40),
    'num_accounts': np.random.poisson(3, n) + 1
})

# Target with known relationships
# Log-odds = intercept (-2) + threshold indicators on income, age,
# credit_score and debt_ratio + a linear term in years_employed.
# 'num_accounts' does not enter the target at all.
log_odds = (
    -2
    + 0.3 * (X['income'] > 50000).astype(int)
    - 0.5 * (X['age'] < 30).astype(int)
    + 0.8 * (X['credit_score'] > 700).astype(int)
    - 1.2 * (X['debt_ratio'] > 0.4).astype(int)
    + 0.2 * X['years_employed']
)
# Sigmoid turns log-odds into a probability; y is one Bernoulli draw per row.
prob = 1 / (1 + np.exp(-log_odds))
y = np.random.binomial(1, prob)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42)
model.fit(X_train, y_train)
# score() is evaluated on the held-out split.
print(f"Model accuracy: {model.score(X_test, y_test):.3f}")

SHAP (SHapley Additive exPlanations)

SHAP values explain each prediction by attributing it to individual features based on game theory. The SHAP value for feature $i$ is:

$$\phi_i = \sum_{S \subseteq N \setminus \{i\}} \frac{|S|!\,(|N|-|S|-1)!}{|N|!}\,\bigl[f(S \cup \{i\}) - f(S)\bigr]$$

where $N$ is the set of all features, $S$ is a subset that excludes feature $i$, and $f(S)$ is the model prediction using only the features in $S$. The Shapley value is the unique attribution that satisfies the efficiency, symmetry, dummy, and additivity axioms.

import shap

# Create SHAP explainer
# Build a tree explainer for the fitted model and compute SHAP values for
# every row of the held-out set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# NOTE(review): the two prints below assume `shap_values` is a single array
# (has `.shape`) and `expected_value` is a scalar (accepts `:.4f`) -- confirm
# for the installed shap version / model type.
print(f"SHAP values shape: {shap_values.shape}")
print(f"Base value: {explainer.expected_value:.4f}")

# SHAP summary plot
# show=False: the figure is not displayed here, and nothing in this snippet
# saves it either.
shap.summary_plot(shap_values, X_test, show=False)
print("SHAP summary plot generated")

# SHAP bar plot (global importance)
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
print("SHAP bar plot generated")

# SHAP dependence plot
# Plots the SHAP values of 'credit_score' against its feature values.
shap.dependence_plot("credit_score", shap_values, X_test, show=False)
print("SHAP dependence plot for credit_score")

Individual Prediction Explanation

def explain_prediction(model, explainer, X_instance, feature_names):
    """Explain a single prediction as a base value plus per-feature SHAP terms.

    Prints the model's class-1 probability, the base rate implied by the
    explainer's expected value, and the signed contribution of every feature
    (largest first), then returns the contributions.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is reported as the prediction.
        explainer: Object exposing ``shap_values(X)`` and ``expected_value``
            (for example a ``shap.TreeExplainer``).
        X_instance: Single-row, 2-D input such as ``X.iloc[[i]]``.
        feature_names: Labels for the contributions, one per feature.

    Returns:
        pandas.Series with the SHAP contributions of the first row, indexed
        by ``feature_names`` in their original (unsorted) order.

    Notes:
        The "Base rate" line applies a sigmoid to the expected value, i.e. it
        assumes the explainer works in log-odds (margin) space. The printed
        contributions are in the explainer's output space, not probability
        differences.
    """
    raw = explainer.shap_values(X_instance)

    # Explainers differ in output layout: a single (rows, features) array, a
    # per-class list of such arrays, or a (rows, features, classes) array.
    # Pick class 1 when there is a class axis, matching predict_proba[:, 1].
    if isinstance(raw, list):
        raw = raw[1] if len(raw) > 1 else raw[0]
    raw = np.atleast_2d(np.asarray(raw))
    if raw.ndim == 3:
        raw = raw[:, :, 1] if raw.shape[2] > 1 else raw[:, :, 0]
    contributions = pd.Series(raw[0], index=feature_names)

    # expected_value may be a scalar, a length-1 array or one value per
    # class; reduce it to a plain float so it can be formatted.
    expected = np.ravel(explainer.expected_value)
    base_value = float(expected[1] if expected.size > 1 else expected[0])

    prediction = model.predict_proba(X_instance)[0, 1]

    print(f"Prediction: {prediction:.4f}")
    print(f"Base rate: {1/(1+np.exp(-base_value)):.4f}")
    print(f"\nFeature contributions:")
    for feat, contrib in contributions.sort_values(ascending=False).items():
        # Only strictly negative contributions get a minus sign; an exact
        # zero must not be shown as pushing the prediction down.
        direction = "-" if contrib < 0 else "+"
        print(f"  {feat}: {direction}{abs(contrib):.4f}")

    return contributions

# Explain a specific prediction
# `iloc[[idx]]` (double brackets) keeps the row as a one-row DataFrame, so
# the model and explainer receive 2-D input.
idx = 0
contributions = explain_prediction(model, explainer, X_test.iloc[[idx]], X_test.columns)

LIME (Local Interpretable Model-Agnostic Explanations)

from sklearn.linear_model import Ridge

class LIMEExplainer:
    """Simplified LIME: fit a weighted linear surrogate around one instance.

    Perturbed samples are drawn from independent normals matching the
    per-feature mean and standard deviation of the training data, scored by
    the black-box model, weighted by their proximity to the instance, and
    fitted with a ridge regression whose coefficients are the explanation.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 is the
            quantity being explained.
        X_train: pandas DataFrame of training features. Its columns name the
            features and its mean/std define the sampling distribution.
        n_samples: Number of perturbed samples drawn per explanation.
        random_state: Optional integer seed. When ``None`` (default) the
            global NumPy RNG is used, so ``np.random.seed`` still controls
            reproducibility.
    """

    def __init__(self, model, X_train, n_samples=1000, random_state=None):
        self.model = model
        self.X_train = X_train
        self.n_samples = n_samples
        self.random_state = random_state
        self.mean = X_train.mean()
        # Small epsilon keeps constant columns from producing a zero scale.
        self.std = X_train.std() + 1e-10

    def explain(self, instance, n_features=5):
        """Generate a local explanation for a single instance.

        Args:
            instance: One observation (pandas Series or 1-D array-like) with
                the features in the same order as ``X_train.columns``.
            n_features: Number of top features to return.

        Returns:
            Tuple ``(importance, local_model)``: ``importance`` is a Series
            of ``|coefficient| * std`` for the ``n_features`` strongest
            features, sorted descending; ``local_model`` is the fitted
            ``Ridge`` surrogate (trained on raw, unscaled feature values).

        Raises:
            ValueError: If ``instance`` does not have one value per column.
        """
        columns = self.X_train.columns
        # Work with plain arrays: mixing a 2-D ndarray with a pandas Series
        # in arithmetic does not broadcast row-wise reliably.
        mean = self.mean.to_numpy(dtype=float)
        std = self.std.to_numpy(dtype=float)
        point = np.asarray(instance, dtype=float).ravel()
        if point.shape[0] != len(columns):
            raise ValueError(
                f"instance has {point.shape[0]} values, expected {len(columns)}"
            )

        # Sample neighborhood
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        noise = rng.standard_normal(size=(self.n_samples, len(columns)))
        samples = noise * std + mean

        # Get model predictions. A DataFrame keeps the column names that
        # X_train carries (presumably the model was fitted on them).
        y_sample = self.model.predict_proba(
            pd.DataFrame(samples, columns=columns)
        )[:, 1]

        # Compute distances (kernel weights) in standardized units so that
        # large-scale features do not dominate the proximity kernel.
        distances = np.sqrt((((samples - point) / std) ** 2).sum(axis=1))
        weights = np.exp(-distances ** 2 / (np.median(distances) ** 2))

        # Fit local linear model
        local_model = Ridge(alpha=1.0)
        local_model.fit(samples, y_sample, sample_weight=weights)

        # Feature importance from local model: scaling each coefficient by
        # the feature's std makes the magnitudes comparable across features.
        importance = pd.Series(
            np.abs(local_model.coef_) * std,
            index=columns
        ).sort_values(ascending=False)

        return importance.head(n_features), local_model

# Build the explainer from the training data and explain the first test row.
# `explain` returns the top local importances and the fitted surrogate model.
lime = LIMEExplainer(model, X_train)
importance, local_model = lime.explain(X_test.iloc[0])
print("LIME feature importance (local):")
print(importance)

Partial Dependence Plots

from sklearn.inspection import partial_dependence

# Partial dependence for individual features.
# Each feature is requested on its own: passing the whole list to a single
# partial_dependence call asks for the *joint* dependence over the grid
# product of all listed features, not one curve per feature.
features = ['credit_score', 'income', 'debt_ratio']
pd_results = {
    feature: partial_dependence(model, X_train, [feature], grid_resolution=50)
    for feature in features
}

for feature, result in pd_results.items():
    # 'average' has one entry per model output; index 0 is the only output
    # of a binary classifier.
    pd_values = result['average'][0]
    # Newer scikit-learn exposes the grid as 'grid_values'; older releases
    # used 'values'.
    grid_key = 'grid_values' if 'grid_values' in result else 'values'
    pd_grid = result[grid_key][0]
    print(f"\n{feature} partial dependence:")
    print(f"  Range: {pd_grid.min():.1f} to {pd_grid.max():.1f}")
    print(f"  Effect range: {pd_values.min():.4f} to {pd_values.max():.4f}")

Global Feature Importance

# Permutation importance (model-agnostic)
# Computed on the held-out split with 10 shuffles per feature.
perm_imp = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)

# One row per feature: mean and standard deviation of the importance over
# the repeats, ordered from most to least important.
importance_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': perm_imp.importances_mean,
    'importance_std': perm_imp.importances_std
}).sort_values('importance_mean', ascending=False)

print("Permutation Feature Importance:")
print(importance_df.to_string(index=False))

# Tree-specific importance
# Read directly from the fitted model's `feature_importances_` attribute.
tree_imp = pd.Series(model.feature_importances_, index=X_test.columns).sort_values(ascending=False)
print("\nTree-based Feature Importance:")
print(tree_imp)

Counterfactual Explanations

def generate_counterfactual(model, instance, target_class, X_train, n_iterations=1000,
                            learning_rate=0.1, random_state=None):
    """Search for a small change to ``instance`` that flips it to ``target_class``.

    Random hill climbing: a candidate is accepted only when it raises the
    probability of ``target_class``. The search stops as soon as
    ``target_class`` becomes the most likely class, after which every feature
    that is not needed for the flip is reset to its original value.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        instance: pandas Series holding one observation; it is not modified.
        target_class: Column index into ``predict_proba`` output to reach.
        X_train: Training features (DataFrame or 2-D array) used to scale the
            random steps per feature and to keep the counterfactual inside
            the observed value range. ``None`` falls back to unit steps and
            no range limit.
        n_iterations: Maximum number of random proposals.
        learning_rate: Step size as a fraction of each feature's training
            standard deviation.
        random_state: Optional integer seed; ``None`` uses the global NumPy
            RNG (so ``np.random.seed`` still applies).

    Returns:
        Tuple ``(cf_instance, changed_features)``: the counterfactual as a
        Series indexed like ``instance``, and the differences
        ``cf_instance - instance`` whose absolute value exceeds 0.01. If the
        prediction could not be flipped within ``n_iterations``, the best
        point found so far is returned.
    """
    columns = instance.index
    start = instance.to_numpy(dtype=float)

    if X_train is None:
        scale = np.ones_like(start)
        lower = np.full_like(start, -np.inf)
        upper = np.full_like(start, np.inf)
    else:
        train = X_train[list(columns)] if isinstance(X_train, pd.DataFrame) else X_train
        train = np.asarray(train, dtype=float)
        # Per-feature spread: a fixed absolute step is meaningless when the
        # features live on very different scales.
        scale = train.std(axis=0)
        # Bounds always include the starting point so an out-of-range
        # instance is never forced to move by clipping alone.
        lower = np.minimum(train.min(axis=0), start)
        upper = np.maximum(train.max(axis=0), start)

    def _proba(values):
        # One-row frame keeps the feature names the instance carries.
        return model.predict_proba(pd.DataFrame([values], columns=columns))[0]

    def _is_target(probs):
        return int(np.argmax(probs)) == target_class

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    cf = start.copy()
    probs = _proba(cf)
    best = probs[target_class]
    flipped = _is_target(probs)

    for _ in range(n_iterations):
        if flipped:
            break  # stop at the first flip: further climbing only adds change
        step = rng.standard_normal(cf.shape) * learning_rate * scale
        candidate = np.clip(cf + step, lower, upper)
        probs = _proba(candidate)
        if probs[target_class] > best:
            cf, best = candidate, probs[target_class]
            flipped = _is_target(probs)

    if flipped:
        # Greedy sparsification: undo every feature change that is not
        # required to keep the prediction flipped.
        for j in range(cf.size):
            if cf[j] == start[j]:
                continue
            trial = cf.copy()
            trial[j] = start[j]
            if _is_target(_proba(trial)):
                cf = trial

    cf_instance = pd.Series(cf, index=columns)

    # What changed?
    changes = cf_instance - instance
    changed_features = changes[changes.abs() > 0.01]

    return cf_instance, changed_features

# Generate counterfactual for a denied application.
# Score the whole test set in one vectorised call instead of issuing one
# predict_proba call per row.
approval_prob = model.predict_proba(X_test)[:, 1]
denied_mask = approval_prob < 0.5
if not denied_mask.any():
    raise ValueError("No denied application (P(class 1) < 0.5) found in X_test")
denied_idx = X_test.index[denied_mask][0]
instance = X_test.loc[denied_idx]

cf, changes = generate_counterfactual(model, instance, target_class=1, X_train=X_train)
print(f"Original prediction: {model.predict_proba(instance.values.reshape(1,-1))[0,1]:.4f}")
print(f"Counterfactual prediction: {model.predict_proba(cf.values.reshape(1,-1))[0,1]:.4f}")
print(f"\nChanges needed:")
# Only the feature names are needed; old and new values are read from the
# original instance and the counterfactual.
for feat in changes.index:
    print(f"  {feat}: {instance[feat]:.2f} → {cf[feat]:.2f}")

Model Cards

def create_model_card(model, X_test, y_test, model_name="Credit Scoring Model"):
    """Build, print and return a Markdown model card for a binary classifier.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Evaluation features; the number of columns is reported.
        y_test: True labels for ``X_test``.
        model_name: Title used in the card header.

    Returns:
        The card text (also printed to stdout). It reports the evaluation
        metrics and confusion-matrix counts, followed by fixed fairness,
        limitation and ethics sections.
    """
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    import sklearn.metrics as skm

    # Binary problem: the 2x2 matrix flattens to tn, fp, fn, tp.
    tn, fp, fn, tp = skm.confusion_matrix(y_test, predicted_labels).ravel()

    model_type = type(model).__name__
    n_features = X_test.shape[1]
    accuracy = skm.accuracy_score(y_test, predicted_labels)
    precision = skm.precision_score(y_test, predicted_labels)
    recall = skm.recall_score(y_test, predicted_labels)
    f1 = skm.f1_score(y_test, predicted_labels)
    # ROC-AUC is ranking-based, so it uses the class-1 scores, not the labels.
    roc_auc = skm.roc_auc_score(y_test, positive_scores)

    card = f"""
# Model Card: {model_name}

## Model Details
- Type: {model_type}
- Features: {n_features}
- Training samples: Not specified

## Performance Metrics
- Accuracy: {accuracy:.4f}
- Precision: {precision:.4f}
- Recall: {recall:.4f}
- F1 Score: {f1:.4f}
- ROC-AUC: {roc_auc:.4f}

## Confusion Matrix
- True Negatives: {tn}
- False Positives: {fp}
- False Negatives: {fn}
- True Positives: {tp}

## Fairness Considerations
- [ ] Tested across demographic groups
- [ ] No disparate impact detected
- [ ] Regular monitoring plan in place

## Limitations
- Model trained on historical data
- May not generalize to population shifts
- Requires periodic retraining

## Ethical Considerations
- Used for credit decisions
- Applicants can request explanation
- Human review available for edge cases
"""
    print(card)
    return card

# Build the card from the held-out split; the call prints it and also
# returns the text.
card = create_model_card(model, X_test, y_test)

Best Practices

SHAP for global and local explanations – the gold standard for tree models
LIME for model-agnostic explanations – works with any model
Counterfactuals for actionability – tells users what to change
Partial dependence plots – show feature-response relationships
Document with model cards – transparency and accountability
Combine methods – no single explanation tells the whole story

Summary

Explainable AI builds trust and meets regulatory requirements. SHAP provides theoretically grounded explanations, LIME offers local approximations, and counterfactuals show actionable changes. Master these techniques to deploy models that are both accurate and transparent.