Privacy & Differential Privacy
Privacy isn't optional. Learn to build ML systems that protect individual data while still extracting useful patterns.
Differential Privacy Fundamentals
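Differential privacy provides a mathematical guarantee: adding or removing any single individual's data changes the probability of any output by at most a factor of $e^{\varepsilon}$ (plus a small slack $\delta$), so the output reveals almost nothing about whether that individual was in the dataset.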
import numpy as np
from typing import Callable, Tuple
from scipy import stats
class DifferentialPrivacy:
    """Basic noise-adding and selection mechanisms sharing one privacy budget.

    Args:
        epsilon: Privacy-loss parameter. The Laplace and Gaussian noise
            scales are divided by it, so smaller values add more noise.
        delta: Failure probability, read only by ``gaussian_mechanism``.
            The default of 0.0 is suitable for the Laplace and exponential
            mechanisms but not for the Gaussian one.
    """

    def __init__(self, epsilon: float, delta: float = 0.0):
        self.epsilon = epsilon
        self.delta = delta

    def laplace_mechanism(self, value: float, sensitivity: float) -> float:
        """Add Laplace noise for pure differential privacy.

        Args:
            value: True query result.
            sensitivity: Maximum change of the query when one individual's
                record is added or removed.

        Returns:
            ``value`` plus a Laplace sample with scale ``sensitivity / epsilon``.
        """
        scale = sensitivity / self.epsilon
        return value + np.random.laplace(0, scale)

    def gaussian_mechanism(self, value: float, sensitivity: float) -> float:
        """Add Gaussian noise for approximate differential privacy.

        Args:
            value: True query result.
            sensitivity: Maximum change of the query when one individual's
                record is added or removed.

        Returns:
            ``value`` plus a zero-mean Gaussian sample with standard deviation
            ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

        Raises:
            ValueError: If ``delta`` is not positive. The noise formula
                divides by ``delta``, so pure DP (``delta == 0``) has no
                finite Gaussian noise scale.
        """
        if self.delta <= 0:
            raise ValueError(
                "gaussian_mechanism requires delta > 0; "
                "use laplace_mechanism for pure epsilon-DP"
            )
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / self.delta)) / self.epsilon
        return value + np.random.normal(0, sigma)

    def exponential_mechanism(self, scores: np.ndarray, sensitivity: float) -> int:
        """Select a candidate with probability proportional to exp(utility).

        Args:
            scores: Utility score of each candidate.
            sensitivity: Maximum change of any score when one individual's
                record is added or removed.

        Returns:
            Index of the sampled candidate; candidate ``i`` is drawn with
            probability proportional to
            ``exp(epsilon * scores[i] / (2 * sensitivity))``.
        """
        logits = self.epsilon * np.asarray(scores, dtype=float) / (2 * sensitivity)
        # Shifting by the maximum leaves the normalized probabilities
        # unchanged but keeps exp() from overflowing to inf (large scores) or
        # underflowing to an all-zero vector (very negative scores).
        logits = logits - logits.max()
        probabilities = np.exp(logits)
        probabilities /= probabilities.sum()
        return np.random.choice(len(probabilities), p=probabilities)
# Usage
dp = DifferentialPrivacy(epsilon=1.0, delta=1e-5)

# Count query with noise: one individual changes a count by at most 1,
# hence sensitivity=1.0.
true_count = 42000
noisy_count = dp.laplace_mechanism(true_count, sensitivity=1.0)
print(f"True: {true_count}, Noisy: {noisy_count:.0f}")

# Mean query
data = np.random.normal(50, 10, 1000)
# The sensitivity must come from bounds chosen WITHOUT looking at the data.
# Deriving it from np.max(data) - np.min(data) would make the noise scale
# itself depend on individual records and void the privacy guarantee.
lower_bound, upper_bound = 0.0, 100.0
# Clipping enforces the assumed bounds on every record.
clipped_data = np.clip(data, lower_bound, upper_bound)
true_mean = np.mean(clipped_data)
# Sensitivity of a bounded mean = (upper - lower) / n
sensitivity = (upper_bound - lower_bound) / len(clipped_data)
noisy_mean = dp.laplace_mechanism(true_mean, sensitivity)
Differentially Private SGD (DP-SGD)
import numpy as np
from typing import Tuple
class DPSGD:
    """Differentially private SGD: per-sample clipping plus Gaussian noise.

    Args:
        learning_rate: Step size passed to ``model.update``.
        noise_multiplier: Ratio of the noise standard deviation to the
            clipping norm (before dividing by the batch size).
        batch_size: Number of samples per training step.
        l2_norm_clip: Maximum L2 norm allowed for each per-sample gradient.
        dataset_size: Number of training examples, used for the sampling
            rate in the privacy accounting. Defaults to 10000.
        delta: Target delta for the reported (epsilon, delta) guarantee.
            Defaults to 1e-5.
    """

    def __init__(self, learning_rate: float, noise_multiplier: float,
                 batch_size: int, l2_norm_clip: float,
                 dataset_size: int = 10000, delta: float = 1e-5):
        self.lr = learning_rate
        self.noise_multiplier = noise_multiplier
        self.batch_size = batch_size
        self.l2_norm_clip = l2_norm_clip
        self.dataset_size = dataset_size
        self.delta = delta

    def clip_gradients(self, gradients: np.ndarray) -> np.ndarray:
        """Clip per-sample gradients to bound sensitivity.

        Args:
            gradients: 2-D array with one gradient per row (norms are taken
                along axis 1).

        Returns:
            Array of the same shape in which every row has L2 norm at most
            ``l2_norm_clip``; rows already within the bound are unchanged.
        """
        grad_norm = np.linalg.norm(gradients, axis=1, keepdims=True)
        # The 1e-8 term avoids division by zero for all-zero gradients.
        clip_factor = np.minimum(1.0, self.l2_norm_clip / (grad_norm + 1e-8))
        return gradients * clip_factor

    def add_noise(self, clipped_gradients: np.ndarray) -> np.ndarray:
        """Average the clipped gradients and add calibrated Gaussian noise.

        Args:
            clipped_gradients: Output of ``clip_gradients``.

        Returns:
            Mean gradient over axis 0 plus zero-mean Gaussian noise with
            standard deviation ``l2_norm_clip * noise_multiplier / batch_size``.
        """
        avg_grad = np.mean(clipped_gradients, axis=0)
        noise_scale = self.l2_norm_clip * self.noise_multiplier / self.batch_size
        noise = np.random.normal(0, noise_scale, avg_grad.shape)
        return avg_grad + noise

    def step(self, model, batch_X, batch_y, epoch: int):
        """Run a single DP-SGD training step.

        Args:
            model: Object providing ``forward``,
                ``compute_per_sample_gradients`` and ``update``.
            batch_X: Batch inputs.
            batch_y: Batch targets.
            epoch: Number of epochs run so far, forwarded to
                ``privacy_spent``.

        Returns:
            The ``(epsilon, delta)`` tuple from ``privacy_spent(epoch)``.
        """
        # The forward result is not used here; the call is kept in case the
        # model relies on it before computing gradients.
        model.forward(batch_X)
        per_sample_grads = model.compute_per_sample_gradients(batch_X, batch_y)
        clipped = self.clip_gradients(per_sample_grads)
        noisy_grad = self.add_noise(clipped)
        model.update(noisy_grad, self.lr)
        return self.privacy_spent(epoch)

    def privacy_spent(self, epoch: int) -> Tuple[float, float]:
        """Estimate the privacy budget spent using simplified RDP accounting.

        The per-step RDP cost is composed additively over every step taken,
        so the reported epsilon grows with the number of epochs. This is a
        simplified bound, not a tight accountant.

        Args:
            epoch: 1-based count of epochs run so far. Values below 1 are
                treated as 1 so the estimate never covers less than one
                epoch.

        Returns:
            ``(epsilon, delta)`` where ``delta`` is the configured target.
        """
        # Sampling rate; capped at 1 for batches larger than the dataset.
        q = min(1.0, self.batch_size / self.dataset_size)
        sigma = self.noise_multiplier
        log_inv_delta = np.log(1 / self.delta)
        # RDP at alpha = 1 + ln(1/delta)
        alpha = 1 + log_inv_delta
        rdp_per_step = (q ** 2 * alpha) / (2 * sigma ** 2)
        # RDP composes additively: total cost = per-step cost * step count.
        steps_per_epoch = int(np.ceil(self.dataset_size / self.batch_size))
        num_steps = max(epoch, 1) * steps_per_epoch
        rdp = num_steps * rdp_per_step
        # Convert RDP to (epsilon, delta)-DP
        epsilon = rdp + log_inv_delta / (alpha - 1)
        return epsilon, self.delta
Federated Learning
import numpy as np
from typing import List, Dict
import copy
class FederatedClient:
    """A federated-learning participant holding a model and local data."""

    def __init__(self, model, data, labels):
        self.model, self.X, self.y = model, data, labels

    def train(self, global_weights, local_epochs=5):
        """Load the global weights, train locally, and return the new weights.

        Args:
            global_weights: Weights handed to ``model.set_weights``.
            local_epochs: Number of times ``model.train`` is called on the
                local data.

        Returns:
            Whatever ``model.get_weights()`` returns after local training.
        """
        local_model = self.model
        local_model.set_weights(global_weights)
        for _epoch in range(local_epochs):
            local_model.train(self.X, self.y)
        return local_model.get_weights()

    def compute_update(self, global_weights, local_weights):
        """Return the per-key difference ``local - global``.

        Only keys present in ``global_weights`` appear in the result.
        """
        delta = {}
        for name, base in global_weights.items():
            delta[name] = local_weights[name] - base
        return delta
class FederatedAveraging:
    """Server-side FedAvg coordinator.

    Args:
        global_model: Object providing ``get_weights`` and ``set_weights``.
        clients: Participants; each must expose ``X`` (its local data, whose
            length sets its averaging weight), ``train`` and
            ``compute_update``.
    """

    # The client type is a forward-reference string so this class does not
    # depend on where FederatedClient is defined.
    def __init__(self, global_model, clients: "List[FederatedClient]"):
        self.global_model = global_model
        self.clients = clients
        self.round_num = 0

    def aggregate(self, client_updates: List[Dict]) -> Dict:
        """FedAvg: sample-count-weighted average of client updates.

        Args:
            client_updates: One update dict per client, in the same order as
                ``self.clients``. Keys are taken from the first update.

        Returns:
            Dict mapping each key to the weighted average of that key's
            updates, with weights ``len(client.X) / total_samples``.
        """
        total_samples = sum(len(c.X) for c in self.clients)
        weights = [len(c.X) / total_samples for c in self.clients]

        averaged = {}
        for key, first in client_updates[0].items():
            first = np.asarray(first)
            # Accumulate in a float-capable dtype: an integer accumulator
            # cannot receive the float-weighted terms in place. Float inputs
            # keep their own dtype.
            weighted_sum = np.zeros_like(first, dtype=np.result_type(first, 1.0))
            for i, weight in enumerate(weights):
                weighted_sum += client_updates[i][key] * weight
            averaged[key] = weighted_sum
        return averaged

    def train_round(self) -> float:
        """Run one round of federated training.

        Sends the current global weights to every client, collects their
        updates, applies the aggregated update to the global model, and
        returns the result of ``evaluate()``.
        """
        self.round_num += 1
        global_weights = self.global_model.get_weights()

        # Distribute the global model and collect each client's delta.
        client_updates = []
        for client in self.clients:
            local_weights = client.train(global_weights)
            client_updates.append(
                client.compute_update(global_weights, local_weights)
            )

        avg_update = self.aggregate(client_updates)

        new_weights = {k: global_weights[k] + avg_update[k] for k in global_weights}
        self.global_model.set_weights(new_weights)

        return self.evaluate()

    def evaluate(self):
        """Evaluate the global model on held-out data.

        Placeholder: currently does nothing and returns None, so
        ``train_round`` returns None as well.
        """
        # In practice, evaluate on a server-side validation set
        pass
# Secure aggregation - mask updates before sending
class SecureAggregator:
    """Masks client updates with per-client pseudo-random noise.

    NOTE(review): each client's mask is independent and the key is returned
    together with the masked update, so the masks do not cancel when updates
    are summed. This illustrates masking only; it is not a complete secure
    aggregation protocol.

    Args:
        num_clients: Number of clients; one 32-byte key is generated for each.
    """

    def __init__(self, num_clients: int):
        # Local import keeps the class self-contained; `secrets` draws from
        # the OS CSPRNG, unlike np.random.bytes.
        import secrets

        self.num_clients = num_clients
        self.keys = [secrets.token_bytes(32) for _ in range(num_clients)]

    def mask_update(self, client_id: int, update: Dict) -> "Tuple[Dict, bytes]":
        """Mask a client update with noise derived from the client's key.

        Args:
            client_id: Index into ``self.keys``.
            update: Mapping from parameter name to a value exposing
                ``.shape`` (e.g. a NumPy array).

        Returns:
            ``(masked, key)``: the update with standard-normal noise added to
            every entry, and the key the noise was derived from. The same key
            always yields the same masks.
        """
        key = self.keys[client_id]
        # A private Generator seeded from the full key. Seeding the global
        # np.random state here would make every later np.random draw in the
        # process (including differential-privacy noise) predictable.
        rng = np.random.default_rng(int.from_bytes(key, 'big'))
        masked = {
            name: value + rng.normal(0, 1, value.shape)
            for name, value in update.items()
        }
        return masked, key
GDPR Compliance for ML
from dataclasses import dataclass
from typing import Optional
from datetime import datetime
@dataclass
class DataSubjectRequest:
    """A data-subject request identified by subject and request type."""

    # Identifier of the person the request concerns.
    subject_id: str
    request_type: str  # "access", "deletion", "portability"
    # None means "not supplied"; __post_init__ replaces it with the current
    # local time, so the field is a datetime after construction.
    timestamp: Optional[datetime] = None

    def __post_init__(self):
        """Default ``timestamp`` to the creation time when it is omitted."""
        if self.timestamp is None:
            self.timestamp = datetime.now()
class GDPRCompliantML:
    """Handles data-subject requests against a data store and a model store.

    Args:
        model_store: Object providing ``evaluate(data)`` and ``retrain(data)``.
        data_store: Object providing ``delete``, ``get_all`` and
            ``get_similar``.
    """

    def __init__(self, model_store, data_store):
        self.model_store = model_store
        self.data_store = data_store

    def handle_deletion_request(self, subject_id: str):
        """Right to erasure - remove subject's data and retrain if needed.

        Args:
            subject_id: Identifier of the subject whose data is erased.

        Returns:
            ``{"status": "completed", "retrained": <bool>}`` where
            ``retrained`` tells whether the memorization estimate exceeded
            0.1 and a retrain was triggered.
        """
        # 1. Assess memorization BEFORE deleting: the assessment reads the
        #    subject's records, which no longer exist after step 2.
        try:
            memorization_risk = self._assess_memorization(subject_id)
        finally:
            # 2. Delete from data store. This runs even if the assessment
            #    fails, so erasure is never skipped.
            self.data_store.delete(subject_id)

        needs_retrain = memorization_risk > 0.1
        if needs_retrain:
            # 3. Retrain without this subject's data
            remaining_data = self.data_store.get_all(exclude=[subject_id])
            self.model_store.retrain(remaining_data)

        return {"status": "completed", "retrained": needs_retrain}

    def handle_access_request(self, subject_id: str):
        """Right to access - return what the data store holds for the subject."""
        return self.data_store.get_all(subject_id)

    def handle_portability_request(self, subject_id: str):
        """Right to data portability - wrap the subject's data for export.

        Returns:
            ``{"format": "json", "data": <data_store.get_all(subject_id)>}``.
        """
        data = self.data_store.get_all(subject_id)
        return {"format": "json", "data": data}

    def _assess_memorization(self, subject_id: str) -> float:
        """Estimate whether the model memorizes this individual's data.

        Compares the model's loss on the subject's data with its loss on
        similar data from other subjects (a membership-inference style
        heuristic).

        Returns:
            The relative loss gap
            ``max(0, non_subject_loss - subject_loss) / non_subject_loss``,
            or 0.0 when ``non_subject_loss`` is not positive.
        """
        subject_data = self.data_store.get_all(subject_id)
        subject_loss = self.model_store.evaluate(subject_data)

        similar_data = self.data_store.get_similar(subject_id, exclude=[subject_id])
        non_subject_loss = self.model_store.evaluate(similar_data)

        # A non-positive reference loss leaves no meaningful relative gap and
        # would otherwise divide by zero.
        if non_subject_loss <= 0:
            return 0.0

        # A lower loss on the subject's data than on comparable data
        # suggests memorization.
        return max(0, non_subject_loss - subject_loss) / non_subject_loss
Best Practices
- Use differential privacy for aggregate statistics and model training
- Implement federated learning when data cannot leave devices
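- Combine differential privacy with federated learning for layered protection: federation keeps raw data on devices, while DP noise limits what the shared model updates reveal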
- Conduct privacy impact assessments before deploying ML systems
- Provide transparency about what data is collected and how it's used