ML Containerization (Docker, K8s)
Difficulty: Senior Level | Companies: Google, Meta, Netflix, Uber, Stripe
Docker for ML
Containerization ensures consistent environments across development and production.
ℹ️
Uber's Michelangelo uses Kubernetes to serve 1,000+ models across 50+ countries with 99.99% uptime.
Dockerfile
# Dockerfile
# Multi-stage build for the ML inference server: the first stage installs the
# Python dependencies and the project, the second stage copies the results into
# a fresh slim image and runs the server as an unprivileged user.

# ---- Build stage ----
FROM python:3.9-slim AS builder
WORKDIR /app

# Install third-party dependencies before copying the source so this layer is
# reused from cache until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the project and install it in editable mode from /app.
COPY . .
RUN pip install --no-cache-dir -e .

# ---- Runtime stage ----
FROM python:3.9-slim
WORKDIR /app

# Bring over installed packages, console scripts (e.g. uvicorn) and the source.
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /app /app

# Run as a dedicated non-root user.
RUN useradd --create-home --shell /bin/bash mluser
USER mluser

EXPOSE 8000

# python:3.9-slim does not include curl, so probe /health with the Python
# standard library instead. urlopen raises on connection errors and on HTTP
# 4xx/5xx responses, which makes the interpreter exit non-zero.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)" || exit 1

CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
# docker-compose.yml
# Local stack: ML inference server, Redis, MLflow tracking server and an nginx
# front end, all attached to the ml-network bridge network.
version: '3.8'

services:
  ml-server:
    build:
      context: .
      dockerfile: Dockerfile
    # NOTE(review): runtimes that honor `deploy.replicas` cannot bind host port
    # 8000 for more than one container. Consider `expose` behind nginx or a
    # host port range - confirm against the target runtime before changing.
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/models/current
      - MLFLOW_TRACKING_URI=http://mlflow:5000
      - REDIS_URL=redis://redis:6379
      - LOG_LEVEL=info
    volumes:
      - model-data:/models
    # The server is configured (above) to talk to both of these services, so
    # start them first.
    depends_on:
      - redis
      - mlflow
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '2'
          memory: 4G
        reservations:
          cpus: '1'
          memory: 2G
    # The image is built from python:3.9-slim, which has no curl; use the
    # Python standard library to probe the health endpoint instead.
    healthcheck:
      test:
        - "CMD"
        - "python"
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)"
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      - ml-network

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - ml-network

  mlflow:
    image: ghcr.io/mlflow/mlflow:v2.0.0
    ports:
      - "5000:5000"
    # The SQLite backend store uses an absolute path (four slashes) inside the
    # mlflow-data volume so run metadata persists alongside the artifacts; a
    # relative `sqlite:///mlflow.db` would live outside the mounted volume.
    command: >-
      mlflow server --host 0.0.0.0 --port 5000
      --backend-store-uri sqlite:////mlflow/mlflow.db
      --default-artifact-root /mlflow/artifacts
    volumes:
      - mlflow-data:/mlflow
    networks:
      - ml-network

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - ml-server
    networks:
      - ml-network

# Named volumes (default driver).
volumes:
  model-data:
  redis-data:
  mlflow-data:

networks:
  ml-network:
    driver: bridge
Kubernetes Deployment
# k8s-deployment.yaml
# Deployment for the ML inference server in the production namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-server
  namespace: production
  labels:
    app: ml-server
    version: v1.0.0
spec:
  # `replicas` is intentionally not set: the ml-server-hpa
  # HorizontalPodAutoscaler in this file manages the replica count (2-10).
  # A fixed value here would reset the autoscaled count on every
  # `kubectl apply`. Removing the field from an already-applied manifest can
  # cause a one-time drop to the default of 1 until the HPA scales back up.
  selector:
    matchLabels:
      app: ml-server
  # Zero-downtime rollout: bring up one extra pod before taking any old pod down.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: ml-server
        version: v1.0.0
    spec:
      containers:
        - name: ml-server
          image: ml-server:v1.0.0
          ports:
            - containerPort: 8000
          env:
            - name: MODEL_PATH
              value: "/models/current"
            # Tracking URI is read from the ml-secrets Secret.
            - name: MLFLOW_TRACKING_URI
              valueFrom:
                secretKeyRef:
                  name: ml-secrets
                  key: mlflow-tracking-uri
            # Redis URL is read from the ml-config ConfigMap.
            - name: REDIS_URL
              valueFrom:
                configMapKeyRef:
                  name: ml-config
                  key: redis-url
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          # Liveness: restart the container if /health stops responding.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          # Readiness: only route traffic once /ready succeeds.
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
          volumeMounts:
            - name: model-volume
              mountPath: /models
              readOnly: true
      volumes:
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc
---
# Cluster-internal Service: forwards port 80 to container port 8000 on pods
# labelled app=ml-server.
apiVersion: v1
kind: Service
metadata:
  name: ml-server-service
  namespace: production
spec:
  type: ClusterIP
  selector:
    app: ml-server
  ports:
    - port: 80
      targetPort: 8000
      protocol: TCP
---
# Autoscaler for the ml-server Deployment: keeps between 2 and 10 replicas,
# driven by average CPU and memory utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-server-hpa
  namespace: production
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
    # Target 70% average CPU utilization.
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Target 80% average memory utilization.
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
---
# Ingress exposing ml-server-service at https://ml-api.example.com/ through
# the nginx ingress class, with TLS from the ml-tls-secret Secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: ml-server-ingress
  namespace: production
  annotations:
    # No rewrite-target annotation: the rule below matches the `/` prefix, so
    # request paths are passed to the backend unchanged. On ingress-nginx
    # 0.22+ a `rewrite-target: /` without capture groups would rewrite every
    # request URI (e.g. /health, /ready) to `/`.
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - ml-api.example.com
      secretName: ml-tls-secret
  rules:
    - host: ml-api.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: ml-server-service
                port:
                  number: 80
GPU Kubernetes Configuration
# k8s-gpu-deployment.yaml
# Deployment for the GPU-backed ML server: two replicas, one NVIDIA GPU each.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-ml-server
  namespace: production
spec:
  replicas: 2
  selector:
    matchLabels:
      app: gpu-ml-server
  template:
    metadata:
      labels:
        app: gpu-ml-server
    spec:
      # Use the gpu-priority PriorityClass defined in this file; a
      # PriorityClass has no effect unless a pod spec references it by name.
      priorityClassName: gpu-priority
      containers:
        - name: ml-server
          image: ml-server-gpu:v1.0.0
          resources:
            # GPUs are requested through `limits` only; CPU and memory have
            # separate requests below.
            limits:
              nvidia.com/gpu: 1
              memory: "8Gi"
              cpu: "4000m"
            requests:
              memory: "4Gi"
              cpu: "2000m"
          env:
            # NOTE(review): an explicit "all" here may override the per-pod
            # GPU assignment made by the NVIDIA device plugin and expose every
            # GPU on the node - confirm this is intended.
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
---
# Cluster-scoped priority class for GPU workloads. It is not the global
# default, so it applies only to pods that name it in `priorityClassName`.
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: gpu-priority
description: "Priority class for GPU workloads"
globalDefault: false
value: 1000000
Follow-Up Questions
- How do you implement model caching in Kubernetes?
- What are the trade-offs between different container orchestration platforms?
- How would you handle GPU sharing across multiple models?
- What monitoring is needed for containerized ML workloads?