Docker & Kubernetes for ML

Containers address two recurring problems in ML deployment: reproducibility and scalability. Learn to package ML models with Docker and deploy them on Kubernetes for production-grade inference.

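Why Containers for ML?

A container image bundles the model artifact, the Python dependencies, and the system libraries into a single versioned unit, so the service behaves the same on a laptop, in CI, and in production. Because every replica starts from the same image, an orchestrator such as Kubernetes can add or remove identical copies as traffic changes.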

Docker for ML Models

# Dockerfile for ML inference service
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies.
# --no-install-recommends keeps optional packages out of the image; the apt
# package lists are removed in the same layer so they never reach the image.
# libgomp1 provides the GNU OpenMP runtime (presumably needed by a compiled
# ML dependency - confirm against requirements.txt).
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Create the non-root user up front and hand it the (still empty) /app
# directory. Ownership of the copied files is then set at COPY time, which
# avoids a recursive chown that would duplicate the model files in a new layer.
RUN useradd -m mluser && chown mluser:mluser /app

# Install Python dependencies (copied separately so this layer stays cached
# until requirements.txt changes)
COPY --chown=mluser:mluser requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy model and application
COPY --chown=mluser:mluser model/ ./model/
COPY --chown=mluser:mluser app.py .
COPY --chown=mluser:mluser config.yaml .

# Drop root privileges for the running service
USER mluser

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

# requirements.txt
# Every package imported directly by app.py is pinned to an exact version.
fastapi==0.104.1
uvicorn==0.24.0
scikit-learn==1.3.2
# joblib is imported directly by app.py to load the model, so it is pinned
# explicitly instead of relying on the version scikit-learn happens to pull in.
joblib==1.3.2
pandas==2.1.3
numpy==1.26.2
pydantic==2.5.2
prometheus-client==0.19.0

# app.py
import os
import time

import joblib
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException
from prometheus_client import Counter, Histogram, make_asgi_app
from pydantic import BaseModel

app = FastAPI(title="ML Inference Service")

# Metrics
REQUEST_COUNT = Counter('predictions_total', 'Total predictions')
REQUEST_LATENCY = Histogram('prediction_latency_seconds', 'Prediction latency')

# Expose the default Prometheus registry over HTTP; without this the metrics
# above are recorded in-process but can never be scraped.
app.mount("/metrics", make_asgi_app())

# Directory that holds the serialized model. MODEL_PATH is set by the Compose
# file; the default keeps the original relative location.
MODEL_DIR = os.environ.get("MODEL_PATH", "model")

# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code, so only load model artifacts from a trusted source.
model = joblib.load(os.path.join(MODEL_DIR, "production_model.pkl"))

# Column order used to build the single-row DataFrame passed to the model.
feature_names = ["feature_1", "feature_2", "feature_3", "feature_4"]

class PredictionRequest(BaseModel):
    """Request body accepted by the /predict endpoint."""
    # Feature values for a single prediction, keyed by feature name.
    # Declared as a plain dict, so neither keys nor value types are validated here.
    features: dict

class PredictionResponse(BaseModel):
    """Response body returned by the /predict endpoint."""
    # First element of model.predict(...) converted to float.
    prediction: float
    # Maximum value returned by model.predict_proba(...) for the request.
    confidence: float
    # Version string reported for the model that served the request.
    model_version: str
    # Time spent handling the request, in milliseconds.
    latency_ms: float

@app.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest):
    """Run the model on one set of features and return the prediction.

    Args:
        request: Body whose ``features`` dict must contain every name listed
            in ``feature_names``; extra keys are ignored.

    Returns:
        PredictionResponse with the prediction, the highest predicted
        probability, the model version and the handling time in milliseconds.

    Raises:
        HTTPException: 400 when features are missing or the input values are
            rejected; 500 when inference fails for any other reason.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # if the system clock is adjusted mid-request.
    start = time.perf_counter()

    # Fail fast on absent features: DataFrame(..., columns=feature_names)
    # would otherwise fill them with NaN without any error.
    missing = [name for name in feature_names if name not in request.features]
    if missing:
        raise HTTPException(
            status_code=400,
            detail=f"Missing features: {', '.join(missing)}",
        )

    try:
        X = pd.DataFrame([request.features], columns=feature_names)
        prediction = float(model.predict(X)[0])
        confidence = float(np.max(model.predict_proba(X)))
    except (ValueError, TypeError, KeyError) as e:
        # Bad input values (wrong type, not convertible to a number, ...).
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Anything else is a server-side failure, not a client error.
        raise HTTPException(status_code=500, detail=str(e)) from e

    elapsed = time.perf_counter() - start
    # Only successful predictions are counted and timed.
    REQUEST_COUNT.inc()
    REQUEST_LATENCY.observe(elapsed)

    return PredictionResponse(
        prediction=prediction,
        confidence=confidence,
        # MODEL_VERSION is injected by the Kubernetes Deployment; the default
        # keeps the previously hard-coded value.
        model_version=os.environ.get("MODEL_VERSION", "1.0.0"),
        latency_ms=elapsed * 1000,
    )

@app.get("/health")
def health():
    """Report a fixed healthy status (used as the liveness probe target)."""
    payload = {"status": "healthy"}
    return payload

@app.get("/ready")
def readiness():
    """Report a fixed ready status (used as the readiness probe target)."""
    payload = {"status": "ready"}
    return payload

Docker Compose for Local Development

# docker-compose.yml
# Local development stack: the inference service plus Redis, Prometheus
# and Grafana.
version: '3.8'

services:
  ml-service:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/app/model
      - LOG_LEVEL=info
    volumes:
      # Local model directory, mounted read-only over the copy baked into the image
      - ./model:/app/model:ro
    deploy:
      resources:
        reservations:
          devices:
            # Reserves one NVIDIA GPU for the container; the host needs the
            # NVIDIA driver and container toolkit for this service to start.
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data

  prometheus:
    # Pinned so the stack does not change underneath you on the next pull
    image: prom/prometheus:v2.48.0
    ports:
      - "9090:9090"
    volumes:
      # Configuration is only read by the container, so mount it read-only
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro

  grafana:
    image: grafana/grafana:10.2.2
    ports:
      - "3000:3000"
    environment:
      # Taken from the shell or .env file when set; falls back to "admin",
      # which is acceptable for local development only.
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
    volumes:
      - grafana-data:/var/lib/grafana

# Named volumes with default settings
volumes:
  redis-data: {}
  grafana-data: {}

Kubernetes Deployment

# deployment.yaml
# Runs the inference service; the replica count is managed by the
# ml-inference-hpa HorizontalPodAutoscaler.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference
  labels:
    app: ml-inference
spec:
  # spec.replicas is intentionally omitted: a fixed value here would reset
  # the count chosen by the autoscaler every time this manifest is re-applied.
  selector:
    matchLabels:
      app: ml-inference
  template:
    metadata:
      labels:
        app: ml-inference
    spec:
      containers:
      - name: ml-service
        image: registry.example.com/ml-inference:v1.0
        ports:
        - containerPort: 8000
        securityContext:
          # The service has no need to gain privileges beyond its own user
          allowPrivilegeEscalation: false
        resources:
          # The CPU request is also the baseline for the HPA's CPU utilization target
          requests:
            memory: "512Mi"
            cpu: "500m"
          limits:
            memory: "1Gi"
            cpu: "1000m"
            nvidia.com/gpu: 1
        # Restart the container when /health stops answering
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 30
          periodSeconds: 10
        # Only route traffic to the pod once /ready answers
        readinessProbe:
          httpGet:
            path: /ready
            port: 8000
          initialDelaySeconds: 5
          periodSeconds: 5
        env:
        - name: MODEL_VERSION
          value: "1.0.0"
        # Read from the redis-url key of the ml-secrets Secret, which must
        # exist in the same namespace
        - name: REDIS_URL
          valueFrom:
            secretKeyRef:
              name: ml-secrets
              key: redis-url
---
# service.yaml
# Cluster-internal entry point for the pods labelled app: ml-inference.
apiVersion: v1
kind: Service
metadata:
  name: ml-inference
spec:
  type: ClusterIP
  selector:
    app: ml-inference
  ports:
  # Service port 80 forwards to the container's port 8000
  - targetPort: 8000
    port: 80
---
# hpa.yaml
# Scales the ml-inference Deployment between 2 and 20 replicas.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-inference-hpa
spec:
  minReplicas: 2
  maxReplicas: 20
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-inference
  metrics:
  # Target an average CPU utilization of 70% of the requested CPU
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  # Target an average of 100 for the per-pod predictions_per_second metric.
  # NOTE(review): nothing shown here emits this metric; presumably a metrics
  # adapter derives it from the app's predictions_total counter - confirm.
  - type: Pods
    pods:
      metric:
        name: predictions_per_second
      target:
        type: AverageValue
        averageValue: "100"

GPU Scheduling

# gpu-scheduling.yaml
# Single pod that requests one GPU and is pinned to T4-labelled nodes.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-inference
spec:
  # Only schedule onto nodes labelled accelerator=nvidia-tesla-t4
  nodeSelector:
    accelerator: nvidia-tesla-t4
  # Tolerate a NoSchedule taint keyed nvidia.com/gpu, whatever its value
  tolerations:
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
  containers:
  - name: ml-service
    image: registry.example.com/ml-gpu:v1.0
    resources:
      # GPU request and limit are set to the same value
      requests:
        nvidia.com/gpu: 1
      limits:
        nvidia.com/gpu: 1
---
# For multi-GPU models
# Each replica gets four GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: multi-gpu-inference
spec:
  replicas: 2
  # apps/v1 Deployments require a selector that matches the pod template
  # labels; the API server rejects the manifest without them.
  selector:
    matchLabels:
      app: multi-gpu-inference
  template:
    metadata:
      labels:
        app: multi-gpu-inference
    spec:
      containers:
      - name: ml-service
        # A container image is mandatory; this reuses the GPU image from the
        # single-GPU pod example.
        image: registry.example.com/ml-gpu:v1.0
        resources:
          limits:
            nvidia.com/gpu: 4

Best Practices

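- Use multi-stage builds to keep images small.
- Pin versions in requirements.txt for reproducibility.
- Set resource requests and limits so pods are scheduled onto nodes with enough capacity and a single pod cannot exhaust a node's CPU or memory; size the memory limit above the model's real footprint, because a container that exceeds it is OOM-killed.
- Define liveness and readiness probes so Kubernetes restarts hung containers and routes traffic only to pods that are ready to serve.
- Implement graceful shutdown to drain in-flight requests before termination.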