Docker & Kubernetes for ML
Containers address two recurring ML deployment problems: reproducibility and scalability. Learn to package ML models with Docker and deploy them on Kubernetes for production-grade inference.
Why Containers for ML?
Docker for ML Models
# Dockerfile for ML inference service
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies.
# --no-install-recommends keeps optional packages out of the image, and the
# apt lists are removed in the same layer so they never reach the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Create the non-root user before copying files so that COPY --chown can set
# ownership directly. A later `chown -R /app` in its own RUN step would store
# a second copy of every file (including the model) in a new layer.
RUN useradd -m mluser && chown mluser:mluser /app

# Install Python dependencies (copied on their own so this layer stays cached
# until requirements.txt changes)
COPY --chown=mluser:mluser requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy model and application
COPY --chown=mluser:mluser model/ ./model/
COPY --chown=mluser:mluser app.py .
COPY --chown=mluser:mluser config.yaml .

# Drop root privileges for the running service
USER mluser

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
# requirements.txt
# Every package imported directly by app.py is pinned for reproducible builds.
fastapi==0.104.1
uvicorn==0.24.0
scikit-learn==1.3.2
# joblib is imported directly by app.py to load the model, so pin it here
# instead of relying on the version scikit-learn happens to pull in.
joblib==1.3.2
pandas==2.1.3
numpy==1.26.2
pydantic==2.5.2
prometheus-client==0.19.0
# app.py
import os
import time

import joblib
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException
from prometheus_client import Counter, Histogram, make_asgi_app
from pydantic import BaseModel
app = FastAPI(title="ML Inference Service")

# Metrics
REQUEST_COUNT = Counter('predictions_total', 'Total predictions')
REQUEST_LATENCY = Histogram('prediction_latency_seconds', 'Prediction latency')

# Serve the default Prometheus registry (which holds the two metrics above)
# at /metrics. Without this the metrics are recorded but never exported.
app.mount("/metrics", make_asgi_app())

# Directory holding the model artifact. MODEL_PATH overrides it; the default
# keeps the original relative location "model/production_model.pkl".
_MODEL_DIR = os.environ.get("MODEL_PATH", "model")

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model artifacts that come from a trusted source.
model = joblib.load(os.path.join(_MODEL_DIR, "production_model.pkl"))

# Column order used to build the model's input frame.
feature_names = ["feature_1", "feature_2", "feature_3", "feature_4"]
class PredictionRequest(BaseModel):
    """Request body accepted by the /predict endpoint."""

    # Mapping of input values for a single prediction; presumably keyed by
    # feature name (verify against the caller).
    features: dict
class PredictionResponse(BaseModel):
    """Response body returned by the /predict endpoint."""

    # pydantic v2 reserves the "model_" prefix for its own attributes and
    # warns about the `model_version` field below. Clearing the protected
    # namespaces silences the warning without renaming the public field.
    model_config = {"protected_namespaces": ()}

    # Predicted value, converted to float by the endpoint.
    prediction: float
    # Largest value returned by the model's predict_proba for this input.
    confidence: float
    # Version string of the model that produced the prediction.
    model_version: str
    # Handling time for this request, in milliseconds.
    latency_ms: float
@app.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest):
    """Run the loaded model on one set of features.

    Args:
        request: Body whose ``features`` mapping must contain a value for
            every name in ``feature_names``; extra keys are ignored.

    Returns:
        PredictionResponse with the prediction, the highest class
        probability, the model version and the handling time in ms.

    Raises:
        HTTPException: 400 when features are missing or the model rejects
            the input values; 500 for any other failure during inference.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()

    # A missing key would otherwise become a silent NaN column in the frame.
    missing = [name for name in feature_names if name not in request.features]
    if missing:
        raise HTTPException(
            status_code=400,
            detail=f"Missing features: {missing}",
        )

    try:
        X = pd.DataFrame([request.features], columns=feature_names)
        prediction = float(model.predict(X)[0])
        confidence = float(np.max(model.predict_proba(X)))
    except (ValueError, TypeError) as e:
        # Bad input values (wrong type, non-numeric, NaN) are client errors.
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        # Anything else is a server-side fault, not the caller's mistake.
        raise HTTPException(status_code=500, detail=str(e))

    # Only successful predictions are counted and timed.
    REQUEST_COUNT.inc()
    elapsed_s = time.perf_counter() - start
    REQUEST_LATENCY.observe(elapsed_s)

    return PredictionResponse(
        prediction=prediction,
        confidence=confidence,
        # Report the deployed version when MODEL_VERSION is set; otherwise
        # fall back to the previous hard-coded value.
        model_version=os.environ.get("MODEL_VERSION", "1.0.0"),
        latency_ms=elapsed_s * 1000,
    )
@app.get("/health")
def health():
    """Liveness endpoint: always reports a healthy status."""
    payload = {"status": "healthy"}
    return payload
@app.get("/ready")
def readiness():
    """Readiness endpoint: always reports a ready status."""
    payload = {"status": "ready"}
    return payload
Docker Compose for Local Development
# docker-compose.yml
# Local development stack: the inference service plus Redis, Prometheus and
# Grafana.
version: '3.8'

services:
  ml-service:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/app/model
      - LOG_LEVEL=info
    volumes:
      # Mount the model read-only so the container cannot modify it.
      - ./model:/app/model:ro
    deploy:
      resources:
        reservations:
          # Reserves one NVIDIA GPU. This needs the NVIDIA container runtime
          # on the host; remove this section on CPU-only machines.
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data

  prometheus:
    # Pinned so the stack is reproducible instead of tracking `latest`.
    image: prom/prometheus:v2.48.0
    ports:
      - "9090:9090"
    volumes:
      # The config is only read by Prometheus, so mount it read-only.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro

  grafana:
    # Pinned so the stack is reproducible instead of tracking `latest`.
    image: grafana/grafana:10.2.2
    ports:
      - "3000:3000"
    environment:
      # Set GRAFANA_ADMIN_PASSWORD in the shell or .env to override; the
      # fallback "admin" is for local development only.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
    volumes:
      - grafana-data:/var/lib/grafana

volumes:
  redis-data:
  grafana-data:
Kubernetes Deployment
# deployment.yaml
# Runs three replicas of the inference container with probes, resource
# bounds and configuration supplied through environment variables.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ml-inference
  labels:
    app: ml-inference
spec:
  replicas: 3
  selector:
    matchLabels:
      app: ml-inference
  template:
    metadata:
      labels:
        app: ml-inference
    spec:
      containers:
        - name: ml-service
          image: registry.example.com/ml-inference:v1.0
          ports:
            - containerPort: 8000
          env:
            - name: MODEL_VERSION
              value: "1.0.0"
            # The Redis connection string is read from a Secret rather than
            # being written into the manifest.
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  name: ml-secrets
                  key: redis-url
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "1000m"
              memory: "1Gi"
              nvidia.com/gpu: 1
          # Readiness gates traffic to the pod; liveness restarts a container
          # whose /health endpoint stops answering.
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
---
# service.yaml
# Cluster-internal endpoint: forwards port 80 to port 8000 on pods labelled
# app=ml-inference.
apiVersion: v1
kind: Service
metadata:
  name: ml-inference
spec:
  type: ClusterIP
  selector:
    app: ml-inference
  ports:
    - port: 80
      targetPort: 8000
---
# hpa.yaml
# Scales the ml-inference Deployment between 2 and 20 replicas.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ml-inference-hpa
spec:
  minReplicas: 2
  maxReplicas: 20
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ml-inference
  metrics:
    # Target 70% average CPU utilization across pods.
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Target an average of 100 predictions per second per pod.
    # NOTE(review): this is a per-pod custom metric; confirm that something
    # in the cluster publishes predictions_per_second to the custom metrics
    # API.
    - type: Pods
      pods:
        metric:
          name: predictions_per_second
        target:
          type: AverageValue
          averageValue: "100"
GPU Scheduling
# gpu-scheduling.yaml
# Single pod that claims one NVIDIA GPU and is steered onto T4 nodes.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-inference
spec:
  # Only nodes carrying this label are eligible.
  nodeSelector:
    accelerator: nvidia-tesla-t4
  # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: ml-service
      image: registry.example.com/ml-gpu:v1.0
      resources:
        requests:
          nvidia.com/gpu: 1
        limits:
          nvidia.com/gpu: 1
---
# For multi-GPU models
# Two replicas, each container claiming four GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: multi-gpu-inference
  labels:
    app: multi-gpu-inference
spec:
  replicas: 2
  # apps/v1 requires a selector, and it must match the template labels.
  selector:
    matchLabels:
      app: multi-gpu-inference
  template:
    metadata:
      labels:
        app: multi-gpu-inference
    spec:
      containers:
        - name: ml-service
          # A container image is mandatory. This placeholder reuses the GPU
          # image name from the single-GPU pod example; replace it with the
          # real multi-GPU image.
          image: registry.example.com/ml-gpu:v1.0
          resources:
            limits:
              nvidia.com/gpu: 4
Best Practices
- Use multi-stage builds to keep images small
- Pin versions in requirements.txt for reproducibility
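- Set resource requests and limits so the scheduler reserves enough capacity and one pod cannot exhaust a node (a container that exceeds its memory limit is OOM-killed)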
- Expose health-check endpoints and wire them to liveness and readiness probes
- Implement graceful shutdown to drain requests before termination