ML Containerization (Docker, K8s)

Difficulty: Senior Level | Companies: Google, Meta, Netflix, Uber, Stripe

Docker for ML

Containerization ensures consistent environments across development and production.

ℹ️

Uber's Michelangelo uses Kubernetes to serve 1,000+ models across 50+ countries with 99.99% uptime.

Dockerfile and Docker Compose

# Dockerfile
# Multi-stage build for the ML inference server:
#   1. builder - installs the Python dependencies and the project itself
#   2. runtime - copies only the installed packages and app code, then runs
#                the server as a non-root user

# ---- Stage 1: builder ----
FROM python:3.9-slim AS builder

WORKDIR /app

# Copy the requirements file on its own first so the dependency layer is
# reused from cache when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Editable install: the installed package points back at /app, so the runtime
# stage must copy the source tree to the same /app path.
COPY . .
RUN pip install --no-cache-dir -e .

# ---- Stage 2: runtime ----
FROM python:3.9-slim

WORKDIR /app

# Bring over installed packages, console scripts (e.g. uvicorn) and the app.
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /app /app

# Drop root privileges for the running service.
RUN useradd --create-home --shell /bin/bash mluser
USER mluser

EXPOSE 8000

# python:3.9-slim does not ship curl, so a curl-based probe would always fail
# and mark the container unhealthy. Probe with the Python standard library
# instead: urlopen raises on connection errors and on HTTP 4xx/5xx responses,
# which makes the interpreter exit non-zero (= unhealthy).
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
  CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)"]

CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]

# docker-compose.yml
# Local stack: the ML inference server, Redis, an MLflow tracking server and
# an nginx reverse proxy, all attached to the ml-network bridge network.
version: '3.8'

services:
  # Inference API built from the Dockerfile in this directory.
  ml-server:
    build:
      context: .
      dockerfile: Dockerfile
    # NOTE(review): a fixed host port combined with deploy.replicas > 1 may
    # clash on a single Docker host ("port is already allocated"), depending
    # on whether this runs under Compose or Swarm - confirm for your runtime.
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/models/current
      - MLFLOW_TRACKING_URI=http://mlflow:5000
      - REDIS_URL=redis://redis:6379
      - LOG_LEVEL=info
    volumes:
      - model-data:/models
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '2'
          memory: 4G
        reservations:
          cpus: '1'
          memory: 2G
    healthcheck:
      # The image is based on python:3.9-slim, which has no curl binary, so
      # the probe uses the Python standard library. urlopen raises (non-zero
      # exit) on connection errors and on HTTP 4xx/5xx responses.
      test:
        - CMD
        - python
        - -c
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)"
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      - ml-network

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - ml-network

  # MLflow tracking server.
  mlflow:
    image: ghcr.io/mlflow/mlflow:v2.0.0
    ports:
      - "5000:5000"
    # The backend store uses an absolute path (four slashes) inside the
    # mlflow-data volume. A relative sqlite:///mlflow.db would be created in
    # the container's working directory, outside the volume, and be lost
    # whenever the container is recreated.
    command: >-
      mlflow server
      --host 0.0.0.0
      --port 5000
      --backend-store-uri sqlite:////mlflow/mlflow.db
      --default-artifact-root /mlflow/artifacts
    volumes:
      - mlflow-data:/mlflow
    networks:
      - ml-network

  # Reverse proxy; configuration is bind-mounted read-only from ./nginx.conf.
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - ml-server
    networks:
      - ml-network

# Named volumes for model files, Redis data and MLflow data.
volumes:
  model-data:
  redis-data:
  mlflow-data:

networks:
  ml-network:
    driver: bridge

Kubernetes Deployment

# k8s-deployment.yaml
# Deployment for the CPU-based ML inference server in the production
# namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-server
  namespace: production
  labels:
    app: ml-server
    version: v1.0.0
spec:
  # spec.replicas is intentionally omitted: the ml-server-hpa
  # HorizontalPodAutoscaler in this file owns the replica count (2-10).
  # A hard-coded value here would reset the autoscaler's decision on every
  # `kubectl apply`. On first creation the Deployment starts with one pod
  # and the HPA then raises it to its minimum.
  selector:
    matchLabels:
      app: ml-server
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # Add one new pod at a time and never remove a serving pod before its
      # replacement is ready.
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: ml-server
        version: v1.0.0
    spec:
      containers:
        - name: ml-server
          image: ml-server:v1.0.0
          ports:
            - containerPort: 8000
          env:
            - name: MODEL_PATH
              value: "/models/current"
            # Read from the ml-secrets Secret (not defined in this file).
            - name: MLFLOW_TRACKING_URI
              valueFrom:
                secretKeyRef:
                  name: ml-secrets
                  key: mlflow-tracking-uri
            # Read from the ml-config ConfigMap (not defined in this file).
            - name: REDIS_URL
              valueFrom:
                configMapKeyRef:
                  name: ml-config
                  key: redis-url
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          # Liveness: restart the container if /health stops responding.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          # Readiness: only route traffic once /ready responds.
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
          volumeMounts:
            # Model files are mounted read-only from the model-pvc claim.
            - name: model-volume
              mountPath: /models
              readOnly: true
      volumes:
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc

---
# Cluster-internal Service: forwards TCP port 80 to port 8000 on pods
# labelled app=ml-server.
apiVersion: v1
kind: Service
metadata:
  namespace: production
  name: ml-server-service
spec:
  type: ClusterIP
  selector:
    app: ml-server
  ports:
    - port: 80
      targetPort: 8000
      protocol: TCP

---
# Autoscaler for the ml-server Deployment: keeps between 2 and 10 replicas,
# scaling on average CPU (70%) and average memory (80%) utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  namespace: production
  name: ml-server-hpa
spec:
  minReplicas: 2
  maxReplicas: 10
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-server
  metrics:
    # CPU target
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Memory target
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

---
# Ingress (nginx class) that terminates TLS for ml-api.example.com and sends
# every path to ml-server-service on port 80.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: production
  name: ml-server-ingress
  annotations:
    # Redirect plain-HTTP requests to HTTPS.
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    # NOTE(review): the rewrite target equals the rule path ("/"), so this
    # annotation looks redundant - confirm before relying on it.
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  ingressClassName: nginx
  rules:
    - host: ml-api.example.com
      http:
        paths:
          - pathType: Prefix
            path: /
            backend:
              service:
                name: ml-server-service
                port:
                  number: 80
  tls:
    # Certificate and key are read from the ml-tls-secret Secret.
    - secretName: ml-tls-secret
      hosts:
        - ml-api.example.com

GPU Kubernetes Configuration

# k8s-gpu-deployment.yaml
# Deployment for the GPU-backed inference server: two replicas, each
# requesting one NVIDIA GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-ml-server
  namespace: production
spec:
  replicas: 2
  selector:
    matchLabels:
      app: gpu-ml-server
  template:
    metadata:
      labels:
        app: gpu-ml-server
    spec:
      # Use the gpu-priority PriorityClass declared in this file; without
      # this reference the class is defined but never applied to any pod.
      priorityClassName: gpu-priority
      containers:
        - name: ml-server
          image: ml-server-gpu:v1.0.0
          resources:
            limits:
              # GPUs are an extended resource and are specified under limits.
              nvidia.com/gpu: 1
              memory: "8Gi"
              cpu: "4000m"
            requests:
              memory: "4Gi"
              cpu: "2000m"
          env:
            # NOTE(review): "all" may expose every GPU on the node to this
            # container instead of only the one allocated via
            # nvidia.com/gpu - confirm against the NVIDIA device plugin
            # configuration in use.
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"

---
# Cluster-scoped priority class for GPU workloads. It is not the global
# default, so pods must opt in via priorityClassName.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: gpu-priority
description: 'Priority class for GPU workloads'
globalDefault: false
value: 1000000

Follow-Up Questions

How do you implement model caching in Kubernetes?
What are the trade-offs between different container orchestration platforms?
How would you handle GPU sharing across multiple models?
What monitoring is needed for containerized ML workloads?