Explainable AI (XAI)
Black-box models achieve high accuracy but resist understanding. Explainable AI makes predictions transparent — showing why a model made a specific decision, which features matter most, and how to change outcomes. Trust and compliance require explainability.
SHAP Waterfall Explanation
Why XAI Matters
A loan application is denied. The applicant deserves to know why. A doctor needs to trust a diagnosis model. Regulators require explanations. XAI transforms opaque predictions into actionable insights.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')
Generate Interpretable Dataset
# Seed the global NumPy RNG so the synthetic applicants are reproducible.
np.random.seed(42)
n = 2000

# Every column draws from the same global RNG stream, so the order of these
# draws is significant: reordering them would change all subsequent values.
_income = np.random.lognormal(10, 0.7, n)
_age = np.clip(np.random.normal(40, 12, n), 18, 80)
_credit_score = np.clip(np.random.normal(650, 80, n), 300, 850)
_debt_ratio = np.random.beta(2, 5, n)
_years_employed = np.clip(np.random.exponential(8, n), 0, 40)
_num_accounts = np.random.poisson(3, n) + 1

X = pd.DataFrame({
    'income': _income,
    'age': _age,
    'credit_score': _credit_score,
    'debt_ratio': _debt_ratio,
    'years_employed': _years_employed,
    'num_accounts': _num_accounts,
})

# Target with known relationships: an intercept plus four threshold
# indicators and one linear term, accumulated in a fixed order.
log_odds = -2 + 0.3 * (X['income'] > 50000).astype(int)
log_odds = log_odds - 0.5 * (X['age'] < 30).astype(int)
log_odds = log_odds + 0.8 * (X['credit_score'] > 700).astype(int)
log_odds = log_odds - 1.2 * (X['debt_ratio'] > 0.4).astype(int)
log_odds = log_odds + 0.2 * X['years_employed']

# Logistic link turns log-odds into probabilities; labels are Bernoulli draws.
prob = 1 / (1 + np.exp(-log_odds))
y = np.random.binomial(1, prob)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train model (fit() returns the estimator itself, so chaining is equivalent).
model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=42,
).fit(X_train, y_train)
print(f"Model accuracy: {model.score(X_test, y_test):.3f}")
SHAP (SHapley Additive exPlanations)
SHAP values explain each prediction by attributing it to individual features based on game theory. The SHAP value for feature $i$ is:
$$\phi_i = \sum_{S \subseteq F \setminus \{i\}} \frac{|S|!\,(|F| - |S| - 1)!}{|F|!} \left[ f(S \cup \{i\}) - f(S) \right]$$
where $F$ is the set of all features, $S$ is a subset without feature $i$, and $f(S)$ is the model prediction using features in $S$. This is the unique solution satisfying the efficiency, symmetry, dummy, and additivity axioms.
import shap
# Create SHAP explainer for the fitted tree ensemble and attribute every
# test-set prediction to the input features.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
print(f"SHAP values shape: {shap_values.shape}")
# Depending on the shap version and model type, expected_value may be a plain
# float or a length-1 array; an array cannot be formatted with ":.4f", so it is
# reduced to a Python scalar first (.item() raises if there is more than one
# element rather than silently picking one).
print(f"Base value: {np.asarray(explainer.expected_value).item():.4f}")

# NOTE(review): with show=False the plots are not displayed or closed here;
# consecutive summary plots may end up drawn onto the same matplotlib figure.
# Confirm and save/close each figure between calls if separate images are needed.

# SHAP summary plot
shap.summary_plot(shap_values, X_test, show=False)
print("SHAP summary plot generated")

# SHAP bar plot (global importance)
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
print("SHAP bar plot generated")

# SHAP dependence plot
shap.dependence_plot("credit_score", shap_values, X_test, show=False)
print("SHAP dependence plot for credit_score")
Individual Prediction Explanation
def explain_prediction(model, explainer, X_instance, feature_names):
    """Explain a single prediction as a base value plus per-feature contributions.

    Prints the model's class-1 probability, the base rate (the explainer's
    expected value passed through a sigmoid), and every feature contribution
    sorted from most positive to most negative.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        explainer: Object exposing ``shap_values(X)`` and ``expected_value``.
            The expected value is assumed to be in log-odds units, since it is
            converted with a sigmoid -- confirm for other explainers.
        X_instance: Single-row input (the first row of the result is used).
        feature_names: Labels for the contributions, one per feature.

    Returns:
        pandas.Series of contributions indexed by ``feature_names``, in the
        original feature order.

    Raises:
        ValueError: If ``explainer.expected_value`` holds more than one element.
    """
    instance_shap = explainer.shap_values(X_instance)

    # expected_value may be a float or a length-1 array depending on the
    # explainer; normalise to a scalar so the ":.4f" format below cannot fail.
    base_value = np.asarray(explainer.expected_value).item()
    contributions = pd.Series(instance_shap[0], index=feature_names)

    prediction = model.predict_proba(X_instance)[0, 1]
    print(f"Prediction: {prediction:.4f}")
    print(f"Base rate: {1 / (1 + np.exp(-base_value)):.4f}")
    print("\nFeature contributions:")
    for feat, contrib in contributions.sort_values(ascending=False).items():
        # The "+" flag always prints an explicit sign (zero shows as +0.0000).
        print(f" {feat}: {contrib:+.4f}")
    return contributions
# Explain a specific prediction: slice a one-row DataFrame (not a Series) so
# the explainer and the model both receive 2-D input.
idx = 0
contributions = explain_prediction(
    model,
    explainer,
    X_test.iloc[idx:idx + 1],
    X_test.columns,
)
LIME (Local Interpretable Model-Agnostic Explanations)
from sklearn.linear_model import Ridge
class LIMEExplainer:
    """Simplified LIME: explain one prediction with a weighted linear surrogate.

    Synthetic points are drawn from an independent normal distribution matching
    the training data's per-feature mean and standard deviation, labelled by
    the black-box model, weighted by proximity to the instance, and fitted
    with a ridge regression whose coefficients act as local importances.
    """

    def __init__(self, model, X_train, n_samples=1000):
        """Store the model and per-feature statistics of the training data.

        Args:
            model: Fitted classifier exposing ``predict_proba``.
            X_train: Training DataFrame; its columns name the features.
            n_samples: Number of synthetic points drawn per explanation.
        """
        self.model = model
        self.X_train = X_train
        self.n_samples = n_samples
        self.mean = X_train.mean()
        # The epsilon keeps constant columns from producing a zero scale.
        self.std = X_train.std() + 1e-10

    def explain(self, instance, n_features=5):
        """Generate local explanation for a single instance.

        Args:
            instance: One observation (e.g. a DataFrame row as a Series) with
                features in the same order as ``X_train``'s columns.
            n_features: How many of the top features to return.

        Returns:
            Tuple ``(importance, local_model)``: the ``n_features`` largest
            values of ``|coefficient| * feature std`` as a Series sorted in
            descending order, and the fitted ``Ridge`` surrogate (trained on
            raw, unscaled feature values).
        """
        # Work on plain float arrays: mixing a 2-D ndarray with pandas Series
        # in arithmetic does not broadcast like NumPy.
        center = self.mean.to_numpy(dtype=float)
        scale = self.std.to_numpy(dtype=float)
        point = np.asarray(instance, dtype=float).ravel()

        # Sample neighborhood from the training distribution.
        noise = np.random.randn(self.n_samples, point.size)
        X_sample = noise * scale + center

        # Get model predictions (probability of class 1).
        y_sample = self.model.predict_proba(X_sample)[:, 1]

        # Compute distances in standardized units so that large-magnitude
        # features (e.g. income) do not dominate the kernel weights.
        z_offsets = (X_sample - point) / scale
        distances = np.sqrt((z_offsets ** 2).sum(axis=1))
        kernel_width = np.median(distances)
        if not kernel_width > 0:
            # Degenerate neighbourhood: avoid dividing by zero below.
            kernel_width = 1.0
        weights = np.exp(-distances ** 2 / kernel_width ** 2)

        # Fit local linear model.
        local_model = Ridge(alpha=1.0)
        local_model.fit(X_sample, y_sample, sample_weight=weights)

        # Scale coefficients by feature std so importances are comparable
        # across features measured in different units.
        importance = pd.Series(
            np.abs(local_model.coef_) * scale,
            index=self.X_train.columns,
        ).sort_values(ascending=False)
        return importance.head(n_features), local_model
# Build the explainer from the training data, then explain the first
# test-set row; the surrogate model is kept for further inspection.
lime = LIMEExplainer(model, X_train)
importance, local_model = lime.explain(X_test.iloc[0])
print("LIME feature importance (local):", importance, sep="\n")
Partial Dependence Plots
from sklearn.inspection import partial_dependence
# Partial dependence for individual features.
# partial_dependence() treats the `features` argument as ONE set of
# interacting features, so passing all three names at once would compute a
# joint 3-D grid whose 'average' is indexed by model output, not by feature.
# Each feature therefore gets its own call.
features = ['credit_score', 'income', 'debt_ratio']
pd_results = {
    feature: partial_dependence(model, X_train, [feature], grid_resolution=50)
    for feature in features
}
for feature in features:
    # Index 0 selects the single model output / the single requested feature.
    pd_values = pd_results[feature]['average'][0]
    pd_grid = pd_results[feature]['grid_values'][0]
    print(f"\n{feature} partial dependence:")
    print(f" Range: {pd_grid.min():.1f} to {pd_grid.max():.1f}")
    print(f" Effect range: {pd_values.min():.4f} to {pd_values.max():.4f}")
Global Feature Importance
# Permutation importance (model-agnostic): repeatedly shuffle each feature of
# the test set and record the resulting change in score.
perm_imp = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)
importance_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance_mean': perm_imp.importances_mean,
    'importance_std': perm_imp.importances_std,
})
importance_df = importance_df.sort_values(by='importance_mean', ascending=False)
print("Permutation Feature Importance:")
print(importance_df.to_string(index=False))

# Tree-specific importance, as reported by the fitted ensemble itself.
tree_imp = pd.Series(model.feature_importances_, index=X_test.columns)
tree_imp = tree_imp.sort_values(ascending=False)
print("\nTree-based Feature Importance:", tree_imp, sep="\n")
Counterfactual Explanations
def generate_counterfactual(model, instance, target_class, X_train, n_iterations=1000, learning_rate=0.1):
    """Generate counterfactual explanation: minimal change to flip prediction.

    Runs a random hill-climb from ``instance``: each iteration proposes a
    Gaussian perturbation and keeps it only if the probability of
    ``target_class`` strictly increases. The search stops as soon as
    ``target_class`` becomes the most probable class, so the result stays
    close to the original instance.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        instance: pandas Series holding one observation.
        target_class: Column index of the desired class in ``predict_proba``.
        X_train: Training data used to scale the perturbations per feature
            and to keep the counterfactual inside the observed value range.
            Pass ``None`` for unit-scale, unbounded steps.
        n_iterations: Maximum number of proposals.
        learning_rate: Step size as a fraction of each feature's standard
            deviation.

    Returns:
        Tuple ``(cf_instance, changed_features)``: the counterfactual as a
        Series with ``instance``'s index, and the differences
        ``cf_instance - instance`` restricted to features that moved by more
        than 1% of their scale.
    """
    columns = instance.index
    start = instance.to_numpy(dtype=float)

    if X_train is None:
        scale = np.ones_like(start)
        lower = np.full_like(start, -np.inf)
        upper = np.full_like(start, np.inf)
    else:
        train = pd.DataFrame(X_train, columns=columns).astype(float)
        scale = train.std().to_numpy()
        lower = train.min().to_numpy()
        upper = train.max().to_numpy()
        # Constant or missing columns have no usable spread; fall back to 1.
        scale = np.where(scale > 0, scale, 1.0)
        # Unknown bounds become unbounded, and the bounds are widened to
        # include the instance itself so clipping never moves the start point.
        lower = np.minimum(np.where(np.isnan(lower), -np.inf, lower), start)
        upper = np.maximum(np.where(np.isnan(upper), np.inf, upper), start)

    # Start from instance.
    cf = start.copy()
    proba = model.predict_proba(cf.reshape(1, -1))[0]
    best = proba[target_class]

    for _ in range(n_iterations):
        # Stop at the first point where the prediction has flipped.
        if np.argmax(proba) == target_class:
            break
        # Propose a perturbation proportional to each feature's spread, so
        # features measured in large units can actually move.
        step = np.random.randn(*cf.shape) * learning_rate * scale
        candidate = np.clip(cf + step, lower, upper)
        candidate_proba = model.predict_proba(candidate.reshape(1, -1))[0]
        # Keep only proposals that move toward the target class.
        if candidate_proba[target_class] > best:
            cf = candidate
            proba = candidate_proba
            best = candidate_proba[target_class]

    cf_instance = pd.Series(cf, index=columns)

    # What changed? Ignore movements below 1% of a feature's scale.
    changes = cf_instance - instance
    changed_features = changes[changes.abs() > 0.01 * scale]
    return cf_instance, changed_features
# Generate counterfactual for a denied application.
# Score the whole test set in one call instead of one predict_proba call per row.
denied_mask = model.predict_proba(X_test)[:, 1] < 0.5
if denied_mask.any():
    # First test-set row whose approval probability is below 0.5.
    denied_idx = X_test.index[denied_mask][0]
    instance = X_test.loc[denied_idx]
    cf, changes = generate_counterfactual(model, instance, target_class=1, X_train=X_train)
    print(f"Original prediction: {model.predict_proba(instance.values.reshape(1, -1))[0, 1]:.4f}")
    print(f"Counterfactual prediction: {model.predict_proba(cf.values.reshape(1, -1))[0, 1]:.4f}")
    print("\nChanges needed:")
    for feat in changes.index:
        print(f" {feat}: {instance[feat]:.2f} → {cf[feat]:.2f}")
else:
    # Without this guard, selecting the first denied row raises IndexError.
    print("No denied applications found in the test set.")
Model Cards
def create_model_card(model, X_test, y_test, model_name="Credit Scoring Model", n_train_samples=None):
    """Generate a model card for documentation.

    Evaluates a binary classifier on the held-out data, renders the metrics
    into a Markdown model card, prints the card and returns it.

    Args:
        model: Fitted binary classifier exposing ``predict`` and
            ``predict_proba``.
        X_test: Held-out features.
        y_test: Held-out true labels.
        model_name: Title used in the card header.
        n_train_samples: Optional number of training samples to report;
            rendered as "Not specified" when omitted.

    Returns:
        The model card as a string.
    """
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Pin the label set to the classes the model knows, so the matrix stays
    # 2x2 (and the unpacking below works) even if one class is absent from
    # both y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=getattr(model, "classes_", None))
    tn, fp, fn, tp = cm.ravel()

    # ROC-AUC is undefined when y_test holds a single class; report that
    # instead of aborting the whole card.
    try:
        roc_auc = f"{roc_auc_score(y_test, y_prob):.4f}"
    except ValueError:
        roc_auc = "N/A"

    training_samples = "Not specified" if n_train_samples is None else n_train_samples

    card = f"""
# Model Card: {model_name}
## Model Details
- Type: {type(model).__name__}
- Features: {X_test.shape[1]}
- Training samples: {training_samples}
## Performance Metrics
- Accuracy: {accuracy_score(y_test, y_pred):.4f}
- Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}
- Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}
- F1 Score: {f1_score(y_test, y_pred, zero_division=0):.4f}
- ROC-AUC: {roc_auc}
## Confusion Matrix
- True Negatives: {tn}
- False Positives: {fp}
- False Negatives: {fn}
- True Positives: {tp}
## Fairness Considerations
- [ ] Tested across demographic groups
- [ ] No disparate impact detected
- [ ] Regular monitoring plan in place
## Limitations
- Model trained on historical data
- May not generalize to population shifts
- Requires periodic retraining
## Ethical Considerations
- Used for credit decisions
- Applicants can request explanation
- Human review available for edge cases
"""
    print(card)
    return card
# Render (and print) the model card for the trained classifier, using the
# held-out split for all reported metrics.
card = create_model_card(model=model, X_test=X_test, y_test=y_test)
Best Practices
- SHAP for global and local — the gold standard for tree models
- LIME for model-agnostic — works with any model
- Counterfactuals for actionability — tells users what to change
- Partial dependence plots — show feature-response relationships
- Document with model cards — transparency and accountability
- Combine methods — no single explanation tells the whole story
Summary
Explainable AI builds trust and meets regulatory requirements. SHAP provides theoretically grounded explanations, LIME offers local approximations, and counterfactuals show actionable changes. Master these techniques to deploy models that are both accurate and transparent.