Privacy & Differential Privacy

Privacy isn't optional. Learn to build ML systems that protect individual data while still extracting useful patterns.

Differential Privacy Fundamentals

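Differential privacy provides a mathematical guarantee: adding or removing any single individual's data does not significantly change the distribution of the output. Formally, a randomized mechanism \mathcal{M} is \epsilon-differentially private if, for every pair of datasets D and D' that differ in one individual's record and for every set of possible outputs S: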

\Pr[\mathcal{M}(D) \in S] \leq e^{\epsilon} \Pr[\mathcal{M}(D') \in S]

import numpy as np
from typing import Callable, Tuple
from scipy import stats

class DifferentialPrivacy:
    """Basic noise-adding mechanisms for differential privacy.

    Attributes:
        epsilon: Privacy-loss parameter shared by all mechanisms; smaller
            values produce more noise.
        delta: Failure probability for approximate (epsilon, delta)-DP.
            Only the Gaussian mechanism reads it.
    """

    def __init__(self, epsilon: float, delta: float = 0.0):
        self.epsilon = epsilon
        self.delta = delta

    def laplace_mechanism(self, value: float, sensitivity: float) -> float:
        """Add Laplace noise for pure differential privacy.

        Args:
            value: True query answer.
            sensitivity: Maximum change in the answer caused by one individual.

        Returns:
            ``value`` plus Laplace noise with scale ``sensitivity / epsilon``.
        """
        scale = sensitivity / self.epsilon
        return value + np.random.laplace(0, scale)

    def gaussian_mechanism(self, value: float, sensitivity: float) -> float:
        """Add Gaussian noise for approximate differential privacy.

        Uses the classical calibration
        ``sigma = sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
        NOTE(review): the textbook proof of this calibration assumes
        ``0 < epsilon < 1``; confirm it is appropriate for larger epsilon.

        Args:
            value: True query answer.
            sensitivity: Maximum (L2) change in the answer caused by one
                individual.

        Returns:
            ``value`` plus zero-mean Gaussian noise with standard deviation
            ``sigma``.

        Raises:
            ValueError: If ``delta`` is not strictly between 0 and 1. The
                constructor default ``delta=0.0`` would otherwise divide by
                zero here.
        """
        if not 0.0 < self.delta < 1.0:
            raise ValueError(
                "gaussian_mechanism requires 0 < delta < 1, "
                f"got delta={self.delta!r}"
            )
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / self.delta)) / self.epsilon
        return value + np.random.normal(0, sigma)

    def exponential_mechanism(self, scores: np.ndarray, sensitivity: float) -> int:
        """Select a candidate index with the exponential mechanism.

        Candidate ``i`` is chosen with probability proportional to
        ``exp(epsilon * scores[i] / (2 * sensitivity))``.

        Args:
            scores: Utility score of every candidate (array or sequence).
            sensitivity: Maximum change in any score caused by one individual.

        Returns:
            Index of the selected candidate.
        """
        logits = self.epsilon * np.asarray(scores, dtype=float) / (2 * sensitivity)
        # Shifting by the maximum leaves the normalised probabilities unchanged
        # but keeps exp() from overflowing to inf (inf / inf = nan) for large
        # scores.
        logits = logits - logits.max()
        probabilities = np.exp(logits)
        probabilities /= probabilities.sum()
        return np.random.choice(len(probabilities), p=probabilities)

# Usage
dp = DifferentialPrivacy(epsilon=1.0, delta=1e-5)

# Count query with noise: one individual changes a count by at most 1,
# so the sensitivity is 1.
true_count = 42000
noisy_count = dp.laplace_mechanism(true_count, sensitivity=1.0)
print(f"True: {true_count}, Noisy: {noisy_count:.0f}")

# Mean query
# The value range must be fixed in advance from public knowledge. Deriving it
# from the data (max - min) would make the noise scale itself depend on
# individual records and void the privacy guarantee.
lower, upper = 0.0, 100.0
data = np.clip(np.random.normal(50, 10, 1000), lower, upper)
true_mean = np.mean(data)
# Sensitivity of a mean over values clipped to [lower, upper] = (upper - lower) / n
sensitivity = (upper - lower) / len(data)
noisy_mean = dp.laplace_mechanism(true_mean, sensitivity)

Differentially Private SGD (DP-SGD)

import numpy as np
from typing import Tuple

class DPSGD:
    """Differentially private SGD: per-sample clipping plus Gaussian noise.

    Attributes:
        lr: Learning rate handed to ``model.update``.
        noise_multiplier: Ratio of the noise standard deviation to the
            clipping norm.
        batch_size: Nominal batch size, used for the sampling rate.
        l2_norm_clip: Maximum L2 norm allowed for each per-sample gradient.
        dataset_size: Number of training examples, used for the sampling rate.
        delta: Target delta reported by ``privacy_spent``.
    """

    def __init__(self, learning_rate: float, noise_multiplier: float,
                 batch_size: int, l2_norm_clip: float,
                 dataset_size: int = 10000, delta: float = 1e-5):
        self.lr = learning_rate
        self.noise_multiplier = noise_multiplier
        self.batch_size = batch_size
        self.l2_norm_clip = l2_norm_clip
        # Previously hard-coded inside privacy_spent; the defaults keep the
        # old values.
        self.dataset_size = dataset_size
        self.delta = delta

    def clip_gradients(self, gradients: np.ndarray) -> np.ndarray:
        """Clip per-sample gradients to bound sensitivity.

        Args:
            gradients: 2-D array with one gradient per row.

        Returns:
            Array of the same shape in which every row has L2 norm at most
            ``l2_norm_clip``; rows already within the bound are unchanged.
        """
        grad_norm = np.linalg.norm(gradients, axis=1, keepdims=True)
        # The small constant avoids division by zero for all-zero gradients.
        clip_factor = np.minimum(1.0, self.l2_norm_clip / (grad_norm + 1e-8))
        return gradients * clip_factor

    def add_noise(self, clipped_gradients: np.ndarray) -> np.ndarray:
        """Average clipped gradients and add calibrated Gaussian noise.

        Noise with standard deviation ``l2_norm_clip * noise_multiplier`` is
        added to the gradient *sum*, and the result is divided by the number
        of rows actually present. Using the same denominator for signal and
        noise keeps the noise-to-sensitivity ratio correct for short batches.

        Args:
            clipped_gradients: 2-D array of already-clipped per-sample
                gradients.

        Returns:
            Noisy average gradient (1-D for 2-D input).

        Raises:
            ValueError: If the batch contains no samples.
        """
        clipped_gradients = np.asarray(clipped_gradients)
        n_samples = clipped_gradients.shape[0]
        if n_samples == 0:
            raise ValueError("cannot add noise to an empty batch")
        grad_sum = np.sum(clipped_gradients, axis=0)
        noise_std = self.l2_norm_clip * self.noise_multiplier
        noise = np.random.normal(0, noise_std, grad_sum.shape)
        return (grad_sum + noise) / n_samples

    def step(self, model, batch_X, batch_y, epoch: int):
        """Run a single DP-SGD training step.

        Args:
            model: Object providing ``forward``, ``compute_per_sample_gradients``
                and ``update``.
            batch_X: Batch inputs.
            batch_y: Batch targets.
            epoch: Forwarded to ``privacy_spent``.

        Returns:
            The ``(epsilon, delta)`` tuple from ``privacy_spent(epoch)``.
        """
        # Forward pass. The result is not used here; the call is kept in case
        # the model relies on it (e.g. cached activations) -- TODO confirm.
        model.forward(batch_X)

        # Compute per-sample gradients, clip each one, then average with noise.
        per_sample_grads = model.compute_per_sample_gradients(batch_X, batch_y)
        noisy_grad = self.add_noise(self.clip_gradients(per_sample_grads))

        # Update model
        model.update(noisy_grad, self.lr)

        # Track privacy budget
        return self.privacy_spent(epoch)

    def privacy_spent(self, epoch: int) -> Tuple[float, float]:
        """Estimate the epsilon spent after ``epoch`` epochs using RDP.

        The per-step Renyi-DP cost is composed additively over every step
        taken, so the reported epsilon grows as training proceeds.

        NOTE(review): the per-step term ``q**2 * alpha / (2 * sigma**2)`` is a
        simplified approximation evaluated at a single order alpha, not a
        rigorous bound. Use a dedicated privacy accountant for real
        deployments.

        Args:
            epoch: Number of epochs of training to account for. With a
                zero-based loop counter pass ``epoch + 1``.

        Returns:
            ``(epsilon, delta)``. Epsilon is infinite when no noise is added.
        """
        sigma = self.noise_multiplier
        if sigma <= 0:
            # Without noise there is no privacy guarantee at all.
            return float("inf"), self.delta

        q = self.batch_size / self.dataset_size  # Sampling rate

        # RDP at alpha = 1 + ln(1/delta)
        log_inv_delta = np.log(1 / self.delta)
        alpha = 1 + log_inv_delta
        rdp_per_step = (q ** 2 * alpha) / (2 * sigma ** 2)

        # RDP composes additively across steps.
        steps_per_epoch = int(np.ceil(self.dataset_size / self.batch_size))
        total_steps = max(epoch, 0) * steps_per_epoch
        rdp_total = total_steps * rdp_per_step

        # Convert RDP to (epsilon, delta)-DP
        epsilon = rdp_total + log_inv_delta / (alpha - 1)
        return epsilon, self.delta

Federated Learning

import numpy as np
from typing import List, Dict
import copy

class FederatedClient:
    """A participant that trains a model on its own local data.

    Attributes:
        model: Local model; must offer ``set_weights``, ``train`` and
            ``get_weights``.
        X: Local training inputs.
        y: Local training labels.
    """

    def __init__(self, model, data, labels):
        self.model = model
        self.X, self.y = data, labels

    def train(self, global_weights, local_epochs=5):
        """Load the global weights, train locally, and return the new weights.

        Args:
            global_weights: Weights to start from.
            local_epochs: Number of passes of ``model.train`` over the local
                data.

        Returns:
            Whatever ``model.get_weights()`` yields after local training.
        """
        learner = self.model
        learner.set_weights(global_weights)

        for _epoch in range(local_epochs):
            learner.train(self.X, self.y)

        return learner.get_weights()

    def compute_update(self, global_weights, local_weights):
        """Return the per-key difference ``local - global``.

        Only keys present in ``global_weights`` appear in the result.
        """
        delta = {}
        for name in global_weights.keys():
            delta[name] = local_weights[name] - global_weights[name]
        return delta

class FederatedAveraging:
    """Server-side coordinator for federated averaging (FedAvg).

    Attributes:
        global_model: Model whose weights are distributed and updated.
        clients: Participating clients; each exposes its local data as ``X``.
        round_num: Number of training rounds started so far.
    """

    def __init__(self, global_model, clients: List["FederatedClient"]):
        self.global_model = global_model
        self.clients = clients
        self.round_num = 0

    def aggregate(self, client_updates: List[Dict]) -> Dict:
        """FedAvg: average client updates weighted by local sample count.

        Args:
            client_updates: One update dict per client, in the same order as
                ``self.clients``.

        Returns:
            Dict mapping every key of the first update to the weighted
            average of that key across clients.

        Raises:
            ValueError: If there are no updates, if the number of updates
                differs from the number of clients, or if the clients hold no
                samples in total.
        """
        if not client_updates:
            raise ValueError("client_updates must not be empty")
        if len(client_updates) != len(self.clients):
            raise ValueError(
                f"expected {len(self.clients)} client updates, "
                f"got {len(client_updates)}"
            )

        sample_counts = [len(client.X) for client in self.clients]
        total_samples = sum(sample_counts)
        if total_samples == 0:
            raise ValueError("clients hold no training samples")

        averaged = {}
        for key in client_updates[0]:
            first = np.asarray(client_updates[0][key])
            # Accumulate in a floating dtype: an integer accumulator cannot
            # take the fractional weighted terms added below. Existing
            # float/complex dtypes are kept as they are.
            if np.issubdtype(first.dtype, np.inexact):
                acc_dtype = first.dtype
            else:
                acc_dtype = np.float64
            weighted_sum = np.zeros(first.shape, dtype=acc_dtype)
            for update, count in zip(client_updates, sample_counts):
                weighted_sum += np.asarray(update[key]) * (count / total_samples)
            averaged[key] = weighted_sum

        return averaged

    def train_round(self) -> float:
        """Run one round of federated training.

        Every client trains from the current global weights, the resulting
        updates are averaged, and the average is added to the global weights.

        Returns:
            The result of ``evaluate()`` (``None`` until that method is
            implemented).
        """
        self.round_num += 1
        global_weights = self.global_model.get_weights()

        # Distribute the global model and collect each client's update.
        client_updates = []
        for client in self.clients:
            local_weights = client.train(global_weights)
            client_updates.append(
                client.compute_update(global_weights, local_weights)
            )

        # Aggregate
        avg_update = self.aggregate(client_updates)

        # Update global model
        new_weights = {name: global_weights[name] + avg_update[name]
                       for name in global_weights}
        self.global_model.set_weights(new_weights)

        # Evaluate
        return self.evaluate()

    def evaluate(self):
        """Evaluate global model on held-out data.

        Placeholder: currently returns ``None``. In practice, evaluate on a
        server-side validation set.
        """
        return None

# Secure aggregation - mask updates before sending
class SecureAggregator:
    """Illustrative additive masking of client updates.

    NOTE(review): this is a teaching sketch, not a secure-aggregation
    protocol. The masks of different clients do not cancel when summed, the
    mask seed uses only 4 bytes of the key, and the mask generator is not
    cryptographically secure.

    Attributes:
        num_clients: Number of clients keys were generated for.
        keys: One 32-byte random key per client.
    """

    def __init__(self, num_clients: int):
        # Local import: `secrets` is not among the file-level imports.
        import secrets

        self.num_clients = num_clients
        # Keys come from the OS CSPRNG rather than NumPy's global generator,
        # which is predictable once seeded.
        self.keys = [secrets.token_bytes(32) for _ in range(num_clients)]

    def mask_update(self, client_id: int, update: Dict) -> "Tuple[Dict, bytes]":
        """Add a key-derived pseudo-random mask to every array in ``update``.

        The mask is drawn from a private generator seeded from the first four
        bytes of the client's key, so the same key always gives the same mask
        and NumPy's process-wide random state is left untouched.

        Args:
            client_id: Index into ``self.keys``.
            update: Mapping from parameter name to array (each value must
                have a ``shape``).

        Returns:
            ``(masked_update, key)``: the masked copy and the client's key.
        """
        key = self.keys[client_id]
        seed = int.from_bytes(key[:4], 'big')
        # RandomState(seed) yields the same stream as np.random.seed(seed)
        # followed by np.random.* calls, without clobbering global state that
        # other code (e.g. DP noise sampling) relies on.
        rng = np.random.RandomState(seed)
        masked = {
            name: value + rng.normal(0, 1, value.shape)
            for name, value in update.items()
        }
        return masked, key

GDPR Compliance for ML

from dataclasses import dataclass
from typing import Optional
from datetime import datetime

@dataclass
class DataSubjectRequest:
    """A request made by a data subject.

    ``timestamp`` defaults to the creation time when it is omitted or passed
    as ``None``.
    """

    subject_id: str
    request_type: str  # "access", "deletion", "portability"
    # Optional because None is the "not supplied" marker replaced in
    # __post_init__.
    timestamp: Optional[datetime] = None

    def __post_init__(self):
        """Fill in the current local time when no timestamp was given."""
        if self.timestamp is None:
            self.timestamp = datetime.now()

class GDPRCompliantML:
    """Handle data-subject requests against a data store and a model store.

    Attributes:
        model_store: Object providing ``evaluate(data)`` and ``retrain(data)``.
        data_store: Object providing ``delete``, ``get_all`` and
            ``get_similar``.
        memorization_threshold: Memorization score above which a deletion
            request triggers retraining.
    """

    def __init__(self, model_store, data_store,
                 memorization_threshold: float = 0.1):
        self.model_store = model_store
        self.data_store = data_store
        self.memorization_threshold = memorization_threshold

    def handle_deletion_request(self, subject_id: str):
        """Right to erasure - remove the subject's data and retrain if needed.

        Memorization is assessed *before* the data is deleted, because the
        assessment has to read the subject's records. Deletion is still
        attempted if the assessment raises.

        NOTE(review): skipping retraining below the threshold is a policy
        decision; confirm it meets your erasure obligations.

        Args:
            subject_id: Identifier of the data subject.

        Returns:
            ``{"status": "completed", "retrained": <bool>}``.
        """
        try:
            # 1. Check if the model memorizes this individual (needs the data).
            memorization_risk = self._assess_memorization(subject_id)
        finally:
            # 2. Delete from the data store, whatever the assessment did.
            self.data_store.delete(subject_id)

        needs_retrain = memorization_risk > self.memorization_threshold
        if needs_retrain:
            # 3. Retrain without this subject's data
            remaining_data = self.data_store.get_all(exclude=[subject_id])
            self.model_store.retrain(remaining_data)

        return {"status": "completed", "retrained": needs_retrain}

    def handle_access_request(self, subject_id: str):
        """Right to access - return everything the data store holds for the subject."""
        return self.data_store.get_all(subject_id)

    def handle_portability_request(self, subject_id: str):
        """Right to data portability - wrap the subject's data in an export envelope.

        Returns:
            ``{"format": "json", "data": <data_store.get_all(subject_id)>}``.
        """
        return {"format": "json", "data": self.data_store.get_all(subject_id)}

    def _assess_memorization(self, subject_id: str) -> float:
        """Estimate how strongly the model memorizes this individual's data.

        A membership-inference style heuristic: the model's loss on the
        subject's records is compared with its loss on similar records from
        other subjects. A much lower loss on the subject's records suggests
        memorization.

        Returns:
            ``max(0, reference_loss - subject_loss) / reference_loss``, or
            ``0.0`` when the reference loss is not positive, since the
            relative gap is undefined there.
        """
        subject_data = self.data_store.get_all(subject_id)
        subject_loss = self.model_store.evaluate(subject_data)

        similar_data = self.data_store.get_similar(subject_id, exclude=[subject_id])
        non_subject_loss = self.model_store.evaluate(similar_data)

        # Guard the division. Assumes losses are non-negative -- TODO confirm.
        if non_subject_loss <= 0:
            return 0.0

        # Higher loss difference suggests memorization
        return max(0, non_subject_loss - subject_loss) / non_subject_loss

Best Practices

Use differential privacy for aggregate statistics and model training
Implement federated learning when data cannot leave devices
Apply differential privacy to federated learning updates so that raw data stays on the device and the shared model updates do not leak individual records
Conduct privacy impact assessments before deploying ML systems
Provide transparency about what data is collected and how it's used