Interview Question (Hard) — Asked at: Google, NVIDIA, Tesla, Uber, Netflix
"Design a containerized ML serving system optimized for GPU inference. How do you handle model optimization, batching, and resource management while maintaining low latency?"
ML Containerization Architecture
Containerization provides consistent, reproducible environments for ML model deployment. It encapsulates dependencies, models, and serving code into portable units.
Container Architecture Diagram
┌─────────────────────────────────────────────────────────────────┐
│                    ML Containerization Stack                    │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │                   Container Registry                    │    │
│  │                 (ECR, GCR, Docker Hub)                  │    │
│  └─────────────────────────────────────────────────────────┘    │
│                               │                                 │
│          ┌────────────────────┼────────────────────┐            │
│          ▼                    ▼                    ▼            │
│     ┌──────────┐       ┌──────────────┐       ┌──────────┐      │
│     │ Training │       │   Serving    │       │  Batch   │      │
│     │Container │       │  Container   │       │Container │      │
│     │  (GPU)   │       │  (GPU/CPU)   │       │ (CPU/GPU)│      │
│     └──────────┘       └──────────────┘       └──────────┘      │
│          │                    │                    │            │
│          │     ┌──────────────┴───────────────┐    │            │
│          │     │        Runtime Layer         │    │            │
│          │     │   (CUDA, cuDNN, TensorRT)    │    │            │
│          │     └──────────────────────────────┘    │            │
│          │                                         │            │
│          └────────────────────┬────────────────────┘            │
│                               ▼                                 │
│  ┌─────────────────────────────────────────────────────────┐    │
│  │                 Base Images & OS Layer                  │    │
│  │              (Ubuntu, NVIDIA CUDA, Python)              │    │
│  └─────────────────────────────────────────────────────────┘    │
└─────────────────────────────────────────────────────────────────┘
Docker Best Practices for ML
Multi-Stage Build for ML Serving
# Stage 1: Build dependencies
# The -devel CUDA image carries compilers/headers needed to build wheels;
# none of that is copied into the final image.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS builder
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.10

# Install system dependencies.
# --no-install-recommends keeps optional packages out of the layer;
# ca-certificates is listed explicitly so HTTPS downloads keep working.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        python3.10 \
        python3.10-venv \
        python3-pip \
        build-essential \
        cmake \
        git \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Create virtual environment (copied wholesale into the production stage)
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install Python dependencies.
# torch==2.0.1+cu118 is a local-version wheel that is only published on the
# PyTorch index, so that index must be available in addition to PyPI.
COPY requirements.txt .
RUN pip install --no-cache-dir \
        --extra-index-url https://download.pytorch.org/whl/cu118 \
        -r requirements.txt

# Stage 2: Production image
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHON_VERSION=3.10
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Install runtime dependencies only.
# The interpreter must match the builder's (3.10) because the copied venv
# points at the system python3.10 binary.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3.10 \
        python3.10-venv \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Create non-root user
RUN groupadd -r mluser && useradd -r -g mluser -d /home/mluser -s /sbin/nologin mluser
RUN mkdir -p /home/mluser && chown mluser:mluser /home/mluser

# Copy application code
WORKDIR /app
COPY --chown=mluser:mluser src/ ./src/
COPY --chown=mluser:mluser models/ ./models/
COPY --chown=mluser:mluser config/ ./config/

# Health check.
# Uses the Python stdlib because curl is not installed in this image; the
# explicit urlopen timeout keeps a hung server from blocking the probe
# until Docker's own --timeout kills it.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python3.10 -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health', timeout=5)" || exit 1

# Switch to non-root user
USER mluser

# Expose port
EXPOSE 8080

# Run the application
CMD ["python3.10", "-m", "uvicorn", "src.server:app", "--host", "0.0.0.0", "--port", "8080"]
Requirements.txt for ML Serving
# The "+cu118" torch build is not published on PyPI; pip needs the PyTorch
# CUDA 11.8 wheel index in addition to the default index to resolve it.
--extra-index-url https://download.pytorch.org/whl/cu118

# ML Framework
torch==2.0.1+cu118
onnxruntime-gpu==1.15.1

# Web Framework
fastapi==0.104.1
uvicorn[standard]==0.24.0

# Data Processing
numpy==1.24.3
pandas==2.0.3
scikit-learn==1.3.2

# Monitoring
prometheus-client==0.19.0
prometheus-fastapi-instrumentator==6.1.0

# Utilities
pydantic==2.5.2
python-multipart==0.0.6
NVIDIA Triton Deployment
Triton Model Repository
# model_repository/
# ├── fraud_detection/
# │   ├── config.pbtxt
# │   └── 1/
# │       └── model.onnx
# └── preprocess/
#     ├── config.pbtxt
#     └── 1/
#         └── model.py
# fraud_detection/config.pbtxt
# Text of the Triton model configuration for the "fraud_detection" ONNX model,
# held as a Python string. As written below it declares:
#   - one FP32 input named "input" with dims [128] and one FP32 output named
#     "output" with dims [1]; max_batch_size is 64
#   - dynamic batching with preferred batch sizes 16/32/64 and a queue delay
#     of at most 100 microseconds
#   - two GPU model instances, both pinned to GPU 0
#   - graph optimization level 1
TRITON_CONFIG = """
name: "fraud_detection"
platform: "onnxruntime_onnx"
max_batch_size: 64
input [
{
name: "input"
data_type: TYPE_FP32
dims: [ 128 ]
}
]
output [
{
name: "output"
data_type: TYPE_FP32
dims: [ 1 ]
}
]
dynamic_batching {
preferred_batch_size: [ 16, 32, 64 ]
max_queue_delay_microseconds: 100
}
instance_group [
{
count: 2
kind: KIND_GPU
gpus: [ 0 ]
}
]
optimization {
graph {
level: 1
}
}
"""
# preprocess/config.pbtxt
# Text of the Triton model configuration for the Python-backend "preprocess"
# model: one TYPE_STRING input ("raw_input", dims [1]) and one FP32 output
# ("preprocessed_input", dims [128]), served by a single CPU instance.
#
# Python-backend models are selected with `backend: "python"`; "python" is
# not a value of the legacy `platform` field, so the original
# `platform: "python"` line would not map the model to the Python backend.
PREPROCESS_CONFIG = """
name: "preprocess"
backend: "python"
max_batch_size: 64
input [
  {
    name: "raw_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]
output [
  {
    name: "preprocessed_input"
    data_type: TYPE_FP32
    dims: [ 128 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
"""
Triton Inference Server Dockerfile
# Dockerfile.triton
FROM nvcr.io/nvidia/tritonserver:23.10-py3

# Install additional dependencies (available to Python-backend models)
RUN pip install --no-cache-dir \
        onnxruntime-gpu \
        transformers \
        tokenizers

# Create model repository: <model>/<version>/ directories
RUN mkdir -p /models/fraud_detection/1 \
    && mkdir -p /models/preprocess/1

# Copy model files
COPY models/fraud_detection/model.onnx /models/fraud_detection/1/
COPY models/fraud_detection/config.pbtxt /models/fraud_detection/
COPY models/preprocess/model.py /models/preprocess/1/
COPY models/preprocess/config.pbtxt /models/preprocess/
# The preprocess model opens /models/preprocess/params.json in initialize();
# without this file the model fails to load.
COPY models/preprocess/params.json /models/preprocess/

# Copy custom backend code
COPY src/triton_backends/ /opt/triton_backends/

# Health check against Triton's HTTP readiness endpoint
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8000/v2/health/ready || exit 1

# Expose ports: 8000 = HTTP, 8001 = gRPC, 8002 = metrics
EXPOSE 8000 8001 8002

# Run Triton
# NOTE(review): --strict-readiness=false lets the readiness endpoint report
# ready even when some models are not; confirm this is intended, since the
# HEALTHCHECK above relies on that endpoint.
ENTRYPOINT ["tritonserver"]
CMD ["--model-repository=/models", \
     "--log-verbose=1", \
     "--strict-model-config=false", \
     "--strict-readiness=false"]
Triton Python Backend
# model_repository/preprocess/1/model.py
import numpy as np
import json
class TritonPythonModel:
    """Triton Python-backend model that turns raw JSON requests into scaled features.

    Each element of the ``raw_input`` tensor is a JSON document with a
    ``features`` list. The features are standardized with the mean/std stored
    in ``/models/preprocess/params.json`` and returned as the FP32 tensor
    ``preprocessed_input``.
    """

    def initialize(self, args):
        """Load the model config and the scaler parameters.

        Args:
            args: Mapping supplied by Triton; ``args['model_config']`` is the
                model configuration serialized as JSON.

        Raises:
            OSError: If ``params.json`` cannot be opened.
            KeyError: If the scaler keys are missing from ``params.json``.
        """
        self.model_config = json.loads(args['model_config'])
        # Load preprocessing parameters
        with open('/models/preprocess/params.json', 'r', encoding='utf-8') as f:
            self.params = json.load(f)
        self.scaler_mean = np.array(self.params['scaler_mean'])
        self.scaler_std = np.array(self.params['scaler_std'])

    def execute(self, requests):
        """Process a list of inference requests.

        Args:
            requests: Sequence of ``pb_utils.InferenceRequest`` objects.

        Returns:
            One ``pb_utils.InferenceResponse`` per request, in request order.
        """
        # Only importable inside the Triton Python backend, so it is imported
        # lazily; _preprocess stays usable (and testable) outside Triton.
        import triton_python_backend_utils as pb_utils

        responses = []
        for request in requests:
            # Inputs are fetched by name and converted to numpy explicitly;
            # request objects have no as_numpy() method of their own.
            raw = pb_utils.get_input_tensor_by_name(request, 'raw_input').as_numpy()
            features = np.array(self._preprocess(raw), dtype=np.float32)
            # Triton expects InferenceResponse objects, not plain dicts.
            out_tensor = pb_utils.Tensor('preprocessed_input', features)
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[out_tensor])
            )
        return responses

    def _preprocess(self, raw_input):
        """Parse and standardize every JSON document in ``raw_input``.

        Args:
            raw_input: Array-like of JSON documents (``bytes`` or ``str``),
                of any shape; with batching enabled Triton delivers
                ``[batch, 1]``.

        Returns:
            A 2-D array with one standardized feature row per document, so a
            single document still yields shape ``(1, n_features)``.

        Raises:
            ValueError: If ``raw_input`` contains no documents.
        """
        rows = []
        for item in np.asarray(raw_input, dtype=object).ravel():
            text = item.decode('utf-8') if isinstance(item, (bytes, bytearray)) else str(item)
            # Parse JSON input and convert to a feature vector
            rows.append(np.asarray(json.loads(text)['features'], dtype=np.float32))
        if not rows:
            raise ValueError("raw_input contains no documents")
        features = np.stack(rows)
        # A constant feature has std 0; divide by 1 instead so the output
        # stays finite (the centered value is 0 for in-distribution data).
        std = np.where(self.scaler_std == 0, 1.0, self.scaler_std)
        # Normalize
        return (features - self.scaler_mean) / std
ℹ️
NVIDIA Triton provides optimized inference with dynamic batching, model ensembles, and multi-GPU support. Use it for high-throughput serving with sub-millisecond latency requirements.
TensorRT Optimization
Model Optimization Pipeline
import tensorrt as trt
import numpy as np
import onnx
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
class TensorRTCompiler:
    """Compile ONNX models to serialized TensorRT engines.

    Args:
        max_batch_size: Largest batch the engine must support; used as the
            upper bound of the optimization profile for inputs whose batch
            dimension is dynamic.
        fp16: Allow FP16 kernels.
        int8: Allow INT8 kernels (requires a calibrator, see
            ``_create_calibrator``).
    """

    def __init__(self, max_batch_size: int = 64,
                 fp16: bool = True,
                 int8: bool = False):
        self.max_batch_size = max_batch_size
        self.fp16 = fp16
        self.int8 = int8
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.trt_builder = trt.Builder(self.logger)

    def compile_from_onnx(self, onnx_path: str,
                          output_path: str) -> str:
        """Compile the ONNX model at ``onnx_path`` and write the engine.

        Args:
            onnx_path: Path of the ONNX model to read.
            output_path: Path the serialized engine is written to.

        Returns:
            ``output_path``.

        Raises:
            RuntimeError: If the ONNX model cannot be parsed, has dynamic
                dimensions other than the batch dimension, or the engine
                build fails.
            NotImplementedError: If INT8 is requested and no calibrator has
                been implemented.
        """
        logger.info(f"Compiling {onnx_path} to TensorRT")
        # Create network
        network = self.trt_builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        )
        # Parse ONNX
        parser = trt.OnnxParser(network, self.logger)
        with open(onnx_path, 'rb') as f:
            parsed = parser.parse(f.read())
        if not parsed:
            for index in range(parser.num_errors):
                logger.error(f"ONNX Parse Error: {parser.get_error(index)}")
            raise RuntimeError("Failed to parse ONNX model")
        # Create builder config
        config = self.trt_builder.create_builder_config()
        # 4 GiB workspace. set_memory_pool_limit replaces the deprecated
        # max_workspace_size attribute.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)
        if self.fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        if self.int8:
            config.set_flag(trt.BuilderFlag.INT8)
            # Set up INT8 calibration
            config.int8_calibrator = self._create_calibrator()
        # Explicit-batch networks with dynamic inputs cannot be built
        # without an optimization profile.
        self._add_batch_profile(network, config)
        # Build engine
        engine = self.trt_builder.build_serialized_network(network, config)
        if engine is None:
            raise RuntimeError("Failed to build TensorRT engine")
        # Save engine
        with open(output_path, 'wb') as f:
            f.write(engine)
        logger.info(f"TensorRT engine saved to {output_path}")
        return output_path

    def _add_batch_profile(self, network, config):
        """Register min/opt/max shapes for inputs with a dynamic batch axis.

        Only axis 0 may be dynamic; the profile spans batch sizes
        ``1 .. max_batch_size``. Networks with static shapes get no profile.
        """
        profile = None
        opt_batch = max(1, self.max_batch_size // 2)
        for index in range(network.num_inputs):
            tensor = network.get_input(index)
            shape = tuple(tensor.shape)
            if -1 not in shape:
                continue
            if -1 in shape[1:]:
                raise RuntimeError(
                    f"Input '{tensor.name}' has dynamic non-batch dimensions "
                    f"{shape}; export the model with only the batch axis dynamic"
                )
            if profile is None:
                profile = self.trt_builder.create_optimization_profile()
            rest = shape[1:]
            profile.set_shape(
                tensor.name,
                (1, *rest),
                (opt_batch, *rest),
                (self.max_batch_size, *rest),
            )
        if profile is not None:
            config.add_optimization_profile(profile)

    def _create_calibrator(self):
        """Create the INT8 calibrator.

        The implementation depends on the calibration data, so subclasses
        must override this hook. The base class raises instead of returning
        ``None`` so that an INT8 build never proceeds without a calibrator.
        """
        raise NotImplementedError(
            "INT8 compilation requires a calibrator; override "
            "_create_calibrator() to supply one"
        )
class TensorRTInference:
    """Run inference with a serialized TensorRT engine.

    ``self.context`` is the TensorRT execution context. The CUDA driver
    context used by PyCUDA is kept separately in ``self.cuda_context``.
    """

    def __init__(self, engine_path: str):
        """Deserialize the engine at ``engine_path`` and allocate I/O buffers.

        Raises:
            RuntimeError: If the engine cannot be deserialized.
        """
        self.logger = trt.Logger(trt.Logger.WARNING)
        # Load engine
        with open(engine_path, 'rb') as f:
            runtime = trt.Runtime(self.logger)
            self.engine = runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            raise RuntimeError(
                f"Failed to deserialize TensorRT engine from {engine_path}"
            )
        self.context = self.engine.create_execution_context()
        # Allocate buffers
        self._allocate_buffers()

    def _allocate_buffers(self):
        """Allocate page-locked host and device buffers for every I/O tensor."""
        import pycuda.driver as cuda
        cuda.init()
        self.device = cuda.Device(0)
        # Share the device's primary context with TensorRT instead of making
        # a new one, and keep it apart from self.context: overwriting the
        # execution context here made execute_v2 unavailable in infer().
        self.cuda_context = self.device.retain_primary_context()
        self.cuda_context.push()
        self.inputs = []
        self.outputs = []
        self.bindings = []
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            shape = self.engine.get_tensor_shape(name)
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(trt.volume(shape), dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))
            buffer = {
                'host': host_mem,
                'device': device_mem,
                'shape': shape,
                'dtype': dtype
            }
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(buffer)
            else:
                self.outputs.append(buffer)

    @staticmethod
    def _as_array(input_data):
        """Return ``input_data`` as a contiguous array.

        A single-entry mapping (``{name: array}``) is unwrapped so callers
        can pass the same feed dict they use for other servers.

        Raises:
            ValueError: If a mapping does not contain exactly one entry.
        """
        if isinstance(input_data, dict):
            if len(input_data) != 1:
                raise ValueError(
                    f"expected exactly one input tensor, got {len(input_data)}"
                )
            input_data = next(iter(input_data.values()))
        return np.ascontiguousarray(input_data)

    def infer(self, input_data: np.ndarray) -> np.ndarray:
        """Run inference on ``input_data`` using the first input/output tensor.

        Returns:
            A fresh array shaped like the engine's first output tensor.

        Raises:
            ValueError: If the input element count differs from the buffer's.
        """
        import pycuda.driver as cuda
        flat = self._as_array(input_data).ravel()
        host_in = self.inputs[0]['host']
        # np.copyto would silently broadcast a size-1 input over the whole
        # buffer, so the element count is checked explicitly.
        if flat.size != host_in.size:
            raise ValueError(
                f"input has {flat.size} elements, engine expects {host_in.size}"
            )
        # Copy input to host buffer
        np.copyto(host_in, flat)
        # Transfer input to GPU
        cuda.memcpy_htod(self.inputs[0]['device'], host_in)
        # Run inference
        self.context.execute_v2(bindings=self.bindings)
        # Transfer output back to host
        host_out = self.outputs[0]['host']
        cuda.memcpy_dtoh(host_out, self.outputs[0]['device'])
        # Copy: the host buffer is reused by the next call.
        return host_out.reshape(self.outputs[0]['shape']).copy()

    def predict(self, input_data) -> np.ndarray:
        """Alias of ``infer`` that also accepts a single-entry feed dict."""
        return self.infer(self._as_array(input_data))

    def close(self):
        """Pop the CUDA context pushed by ``_allocate_buffers`` (idempotent)."""
        ctx = getattr(self, 'cuda_context', None)
        if ctx is not None:
            ctx.pop()
            self.cuda_context = None
TensorRT Optimization for Transformers
import tensorrt as trt
from transformers import AutoModel, AutoTokenizer
import torch
class TransformerTensorRT:
    """Export a Hugging Face transformer to ONNX and compile it with TensorRT.

    Args:
        model_name: Name or path passed to ``AutoTokenizer``/``AutoModel``.
        max_seq_length: Length the dummy export input is padded/truncated to.
    """

    def __init__(self, model_name: str, max_seq_length: int = 512):
        self.model_name = model_name
        self.max_seq_length = max_seq_length
        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def export_to_onnx(self, output_path: str):
        """Export the transformer to ONNX at ``output_path``.

        Batch and sequence axes of ``input_ids``, ``attention_mask`` and
        ``last_hidden_state`` are exported as dynamic axes.

        Returns:
            ``output_path``, so the call can be chained into compilation.
        """
        # Create dummy input
        dummy_input = self.tokenizer(
            "Sample text",
            return_tensors="pt",
            max_length=self.max_seq_length,
            padding="max_length",
            truncation=True
        )
        batch_and_sequence = {0: 'batch_size', 1: 'sequence'}
        # Export
        torch.onnx.export(
            self.model,
            (dummy_input['input_ids'], dummy_input['attention_mask']),
            output_path,
            opset_version=14,
            do_constant_folding=True,
            input_names=['input_ids', 'attention_mask'],
            output_names=['last_hidden_state'],
            dynamic_axes={
                'input_ids': dict(batch_and_sequence),
                'attention_mask': dict(batch_and_sequence),
                'last_hidden_state': dict(batch_and_sequence)
            }
        )
        logger.info(f"ONNX model exported to {output_path}")
        return output_path

    def optimize_with_tensorrt(self, onnx_path: str,
                               output_path: str,
                               fp16: bool = True,
                               max_batch_size: int = 32):
        """Compile the ONNX model at ``onnx_path`` to a TensorRT engine.

        Args:
            onnx_path: ONNX model to compile.
            output_path: Where the engine is written.
            fp16: Allow FP16 kernels.
            max_batch_size: Forwarded to ``TensorRTCompiler`` (previously
                fixed at 32, which remains the default).

        Returns:
            The value returned by ``TensorRTCompiler.compile_from_onnx``.
        """
        compiler = TensorRTCompiler(
            max_batch_size=max_batch_size,
            fp16=fp16
        )
        return compiler.compile_from_onnx(onnx_path, output_path)
⚠️
TensorRT optimization can significantly improve inference speed but requires careful calibration for INT8 quantization. Always validate optimized models against the original to ensure accuracy is maintained.
ONNX Runtime Serving
ONNX Model Serving
import onnxruntime as ort
import numpy as np
from typing import Dict, List, Optional
import json
import time
class ONNXModelServer:
    """Serve models using ONNX Runtime.

    Args:
        model_path: Path of the ONNX model to load.
        provider: Preferred execution provider; the CPU provider is always
            appended as a fallback.
        num_threads: Value used for both intra-op and inter-op thread counts.
    """

    def __init__(self, model_path: str,
                 provider: str = 'CUDAExecutionProvider',
                 num_threads: int = 4):
        # Configure session options
        sess_options = ort.SessionOptions()
        sess_options.intra_op_num_threads = num_threads
        sess_options.inter_op_num_threads = num_threads
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Avoid listing the CPU provider twice when it is the requested one.
        providers = [provider]
        if provider != 'CPUExecutionProvider':
            providers.append('CPUExecutionProvider')
        # Create session
        self.session = ort.InferenceSession(
            model_path,
            sess_options=sess_options,
            providers=providers
        )
        # Get input/output names
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]

    def predict(self, input_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Run inference on a feed dict and return outputs keyed by name."""
        outputs = self.session.run(self.output_names, input_data)
        return dict(zip(self.output_names, outputs))

    def batch_predict(self, batch_inputs: List[Dict[str, np.ndarray]]) -> List[Dict[str, np.ndarray]]:
        """Run one batched inference over several per-sample feed dicts.

        Each sample's arrays are stacked along a new leading axis, inference
        runs once, and the outputs are split back per sample.

        Args:
            batch_inputs: One feed dict per sample; every dict must contain
                all of ``self.input_names``.

        Returns:
            One output dict per sample, in input order; ``[]`` for an empty
            batch (``np.stack`` cannot stack zero arrays).
        """
        if not batch_inputs:
            return []
        # Stack inputs into batches
        batched_inputs = {
            name: np.stack([sample[name] for sample in batch_inputs])
            for name in self.input_names
        }
        # Run inference
        batched_outputs = self.predict(batched_inputs)
        # Split outputs back into individual predictions
        return [
            {name: values[i] for name, values in batched_outputs.items()}
            for i in range(len(batch_inputs))
        ]
class ONNXOptimizedServer(ONNXModelServer):
    """ONNX server that groups requests into fixed-size batches.

    Args:
        model_path: Path of the ONNX model to load.
        max_batch_size: Maximum number of requests per batch (must be >= 1).
        batch_timeout_ms: Stored on the instance; not used by the code in
            this class.
        provider: Forwarded to ``ONNXModelServer``.
        num_threads: Forwarded to ``ONNXModelServer``.

    Raises:
        ValueError: If ``max_batch_size`` is smaller than 1.
    """

    def __init__(self, model_path: str,
                 max_batch_size: int = 32,
                 batch_timeout_ms: float = 10.0,
                 provider: str = 'CUDAExecutionProvider',
                 num_threads: int = 4):
        # Validate before the (expensive) session is created.
        if max_batch_size < 1:
            raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")
        super().__init__(model_path, provider=provider, num_threads=num_threads)
        self.max_batch_size = max_batch_size
        self.batch_timeout_ms = batch_timeout_ms
        # Request queue for dynamic batching
        self.request_queue = []
        self.batch_results = {}

    def _dynamic_batching(self, requests: List[Dict]) -> List[Dict[str, np.ndarray]]:
        """Split ``requests`` into stacked feed dicts of at most ``max_batch_size``.

        Args:
            requests: Request dicts whose ``'input'`` entry maps each model
                input name to that request's array.

        Returns:
            One feed dict per batch, in request order; ``[]`` when there are
            no requests.
        """
        step = self.max_batch_size
        chunks = (requests[i:i + step] for i in range(0, len(requests), step))
        return [
            {
                # Stack inputs
                name: np.stack([req['input'][name] for req in chunk])
                for name in self.input_names
            }
            for chunk in chunks
        ]
ONNX Model Conversion
import torch
import tensorflow as tf
from pathlib import Path
class ModelConverter:
    """Helpers that convert framework models to ONNX and validate the result."""

    @staticmethod
    def pytorch_to_onnx(model, dummy_input, output_path: str,
                        input_names: List[str] = None,
                        output_names: List[str] = None):
        """Export a PyTorch model to ONNX with a dynamic batch axis.

        When omitted, the tensor names default to ``['input']`` and
        ``['output']``; axis 0 of every named tensor is marked dynamic.
        """
        in_names = input_names if input_names else ['input']
        out_names = output_names if output_names else ['output']
        batch_axes = {}
        for tensor_name in in_names + out_names:
            batch_axes[tensor_name] = {0: 'batch_size'}
        torch.onnx.export(
            model,
            dummy_input,
            output_path,
            opset_version=14,
            do_constant_folding=True,
            input_names=in_names,
            output_names=out_names,
            dynamic_axes=batch_axes
        )
        print(f"ONNX model exported to {output_path}")

    @staticmethod
    def tensorflow_to_onnx(model, output_path: str):
        """Export a Keras model to ONNX (opset 14) via tf2onnx."""
        import tf2onnx
        conversion = tf2onnx.convert.from_keras(
            model,
            output_path=output_path,
            opset=14
        )
        # from_keras returns a pair; only the unpacking itself matters here.
        model_proto, _ = conversion
        print(f"ONNX model exported to {output_path}")

    @staticmethod
    def validate_onnx(onnx_path: str):
        """Run the ONNX checker on a model file and print basic metadata."""
        import onnx
        loaded = onnx.load(onnx_path)
        onnx.checker.check_model(loaded)
        print("ONNX model is valid")
        # Print model info
        first_opset = loaded.opset_import[0]
        print(f"IR version: {loaded.ir_version}")
        print(f"Opset version: {first_opset.version}")
        print(f"Producer: {loaded.producer_name}")
Docker Compose for ML Stack
Multi-Service ML Deployment
# docker-compose.yml
version: '3.8'

services:
  # Model Serving
  model-server:
    build:
      context: .
      dockerfile: Dockerfile.serving
    ports:
      - "8080:8080"
    volumes:
      - ./models:/app/models
      - ./config:/app/config
    environment:
      - MODEL_PATH=/app/models/model.onnx
      - LOG_LEVEL=info
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      # The serving image does not install curl, so probe with the Python
      # interpreter that is guaranteed to be in the container.
      test: ["CMD", "python3.10", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health', timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      - ml-network

  # Feature Store
  feature-store:
    image: feastdev/feature-server:latest
    ports:
      - "6566:6566"
    volumes:
      - ./feature_repo:/feature_repo
    environment:
      - FEATURE_STORE_YAML=/feature_repo/feature_store.yaml
    networks:
      - ml-network

  # Model Registry
  model-registry:
    build:
      context: .
      dockerfile: Dockerfile.registry
    ports:
      - "8081:8081"
    volumes:
      - registry-data:/var/lib/registry
    environment:
      # Four slashes = absolute path in SQLAlchemy-style SQLite URLs. With
      # three, the database is created relative to the working directory
      # and never lands on the registry-data volume.
      # NOTE(review): assumes the registry parses the URL SQLAlchemy-style.
      - DATABASE_URL=sqlite:////var/lib/registry/registry.db
    networks:
      - ml-network

  # Monitoring
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
    networks:
      - ml-network

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
    environment:
      # Overridable from the shell or an .env file; falls back to the
      # previous default so existing local setups keep working.
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:-admin}
    networks:
      - ml-network

  # Redis for caching
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - ml-network

volumes:
  registry-data:
  grafana-data:
  redis-data:

networks:
  ml-network:
    driver: bridge
ℹ️
Use Docker Compose for local development and testing. For production, migrate to Kubernetes with Helm charts for better scalability and management.
Kubernetes Deployment
Kubernetes ML Deployment
# kubernetes/ml-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: model-server
  labels:
    app: ml-model
spec:
  # spec.replicas is intentionally omitted: the model-server-hpa
  # HorizontalPodAutoscaler owns the replica count (2-10). A fixed value
  # here would reset the HPA's scaling decision on every `kubectl apply`.
  selector:
    matchLabels:
      app: ml-model
  template:
    metadata:
      labels:
        app: ml-model
      annotations:
        # Scrape hints for an annotation-based Prometheus discovery config
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      containers:
        - name: model-server
          image: registry.example.com/ml-model:v1.0
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 8081
              name: grpc
          resources:
            requests:
              memory: "4Gi"
              cpu: "2000m"
              # GPU request and limit are set to the same value
              nvidia.com/gpu: "1"
            limits:
              memory: "8Gi"
              cpu: "4000m"
              nvidia.com/gpu: "1"
          env:
            - name: MODEL_PATH
              value: "/models/model.onnx"
            - name: LOG_LEVEL
              value: "info"
            - name: MAX_BATCH_SIZE
              value: "32"
          volumeMounts:
            - name: model-volume
              mountPath: /models
            - name: config-volume
              mountPath: /config
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
      volumes:
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc
        - name: config-volume
          configMap:
            name: model-config
      # Schedule only onto nodes labelled as carrying this accelerator
      nodeSelector:
        accelerator: nvidia-tesla-t4
---
# Service in front of the model-server pods (selected by app=ml-model).
apiVersion: v1
kind: Service
metadata:
  name: model-server
  labels:
    app: ml-model
spec:
  selector:
    app: ml-model
  ports:
    # Service port 80 forwards to container port 8080
    - name: http
      port: 80
      targetPort: 8080
    - name: grpc
      port: 8081
      targetPort: 8081
  # NOTE(review): LoadBalancer exposes the Service externally by itself; if
  # all traffic is meant to enter through the Ingress, confirm whether
  # ClusterIP would be sufficient.
  type: LoadBalancer
---
# Autoscaler for the model-server Deployment: between 2 and 10 replicas.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: model-server-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: model-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
    # Target 70% average CPU utilization
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    # Target 80% average memory utilization
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
    # Per-pod metric with a target average of 100.
    # NOTE(review): presumably served through a custom-metrics adapter;
    # confirm that inference_requests_per_second is actually exported.
    - type: Pods
      pods:
        metric:
          name: inference_requests_per_second
        target:
          type: AverageValue
          averageValue: "100"
  behavior:
    # Scale up by at most 2 pods per 60 s, after a 60 s stabilization window
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 2
          periodSeconds: 60
    # Scale down by at most 10% per 120 s, after a 300 s stabilization window
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 120
---
# Ingress routing https://ml.example.com/ to the model-server Service (port 80).
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: model-server-ingress
  annotations:
    # NOTE(review): these are ingress-nginx annotations, but no
    # ingressClassName is set; confirm the cluster has a default IngressClass.
    nginx.ingress.kubernetes.io/rewrite-target: /
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    # Issuer referenced for the certificate stored in the ml-tls secret
    cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
  tls:
    - hosts:
        - ml.example.com
      secretName: ml-tls
  rules:
    - host: ml.example.com
      http:
        paths:
          # Prefix match on "/" covers every request path for this host
          - path: /
            pathType: Prefix
            backend:
              service:
                name: model-server
                port:
                  number: 80
⚠️
When deploying GPU workloads on Kubernetes, ensure proper GPU resource requests and limits. Use node selectors or taints/tolerations to schedule GPU pods on appropriate nodes.
Model Optimization Pipeline
Complete Optimization Workflow
from pathlib import Path
import json
import time
from typing import Dict
class ModelOptimizationPipeline:
    """End-to-end model optimization pipeline.

    Runs ONNX export, ONNX Runtime graph optimization, TensorRT compilation
    and a latency benchmark of the three resulting artifacts.

    Args:
        config: Optional settings. Recognized keys, each with the previously
            hard-coded value as default: ``model_type`` ('bert'),
            ``num_heads`` (12), ``hidden_size`` (768), ``fp16`` (True),
            ``input_name`` ('input'), ``input_shape`` ((1, 128)).
    """

    def __init__(self, config: Dict):
        self.config = config
        self.optimization_results = {}

    def _setting(self, key, default):
        """Return ``config[key]`` or ``default`` (also when config is None)."""
        return (self.config or {}).get(key, default)

    def optimize(self, model_path: str, output_dir: str) -> Dict:
        """Run the complete optimization pipeline.

        Args:
            model_path: Source model handed to ``_export_to_onnx``.
            output_dir: Directory for all artifacts; created if missing.

        Returns:
            Dict with the three artifact paths and the benchmark results;
            the same dict is written to ``optimization_results.json`` and
            stored in ``self.optimization_results``.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        # Step 1: Export to ONNX
        print("Step 1: Exporting to ONNX...")
        onnx_path = output_path / "model.onnx"
        self._export_to_onnx(model_path, str(onnx_path))
        # Step 2: Optimize with ONNX Runtime
        print("Step 2: Optimizing with ONNX Runtime...")
        optimized_path = output_path / "model_optimized.onnx"
        self._optimize_onnx(str(onnx_path), str(optimized_path))
        # Step 3: Compile to TensorRT
        print("Step 3: Compiling to TensorRT...")
        trt_path = output_path / "model.trt"
        self._compile_tensorrt(str(optimized_path), str(trt_path))
        # Step 4: Benchmark
        print("Step 4: Benchmarking...")
        benchmarks = self._benchmark(
            str(onnx_path),
            str(optimized_path),
            str(trt_path)
        )
        # Save results
        results = {
            'onnx_path': str(onnx_path),
            'optimized_path': str(optimized_path),
            'trt_path': str(trt_path),
            'benchmarks': benchmarks
        }
        with open(output_path / 'optimization_results.json', 'w') as f:
            json.dump(results, f, indent=2)
        self.optimization_results = results
        return results

    def _export_to_onnx(self, model_path: str, output_path: str):
        """Export the model to ONNX.

        No-op hook: the implementation depends on the model framework, so
        subclasses are expected to override it.
        """
        pass

    def _optimize_onnx(self, onnx_path: str, output_path: str):
        """Optimize the ONNX graph with the ONNX Runtime transformer optimizer."""
        from onnxruntime.transformers import optimizer
        optimized_model = optimizer.optimize_model(
            onnx_path,
            model_type=self._setting('model_type', 'bert'),
            num_heads=self._setting('num_heads', 12),
            hidden_size=self._setting('hidden_size', 768),
            optimization_options=None
        )
        optimized_model.save_model_to_file(output_path)

    def _compile_tensorrt(self, onnx_path: str, output_path: str):
        """Compile the ONNX model to a TensorRT engine."""
        compiler = TensorRTCompiler(fp16=self._setting('fp16', True))
        compiler.compile_from_onnx(onnx_path, output_path)

    @staticmethod
    def _summarize(latency_ms: float) -> Dict:
        """Return the latency together with the implied requests per second."""
        return {
            'latency_ms': latency_ms,
            'throughput_rps': 1000 / latency_ms
        }

    def _benchmark(self, onnx_path: str, optimized_path: str,
                   trt_path: str) -> Dict:
        """Benchmark the plain ONNX, optimized ONNX and TensorRT artifacts.

        Returns:
            Per-artifact latency/throughput plus pairwise speedup ratios.
        """
        # Benchmark ONNX
        onnx_latency = self._measure_latency(ONNXModelServer(onnx_path))
        # Benchmark Optimized ONNX
        optimized_latency = self._measure_latency(ONNXModelServer(optimized_path))
        # Benchmark TensorRT
        trt_latency = self._measure_latency(TensorRTInference(trt_path))
        return {
            'onnx': self._summarize(onnx_latency),
            'optimized_onnx': self._summarize(optimized_latency),
            'tensorrt': self._summarize(trt_latency),
            # Calculate speedups
            'speedup': {
                'optimized_vs_onnx': onnx_latency / optimized_latency,
                'trt_vs_onnx': onnx_latency / trt_latency,
                'trt_vs_optimized': optimized_latency / trt_latency
            }
        }

    def _measure_latency(self, server, n_runs: int = 100) -> float:
        """Measure the average latency of one inference call in milliseconds.

        Args:
            server: Object exposing ``predict(feed_dict)``, or ``infer(array)``
                (the TensorRT wrapper's entry point).
            n_runs: Number of timed calls, after 10 untimed warm-up calls.

        Raises:
            ValueError: If ``n_runs`` is smaller than 1.
        """
        if n_runs < 1:
            raise ValueError(f"n_runs must be >= 1, got {n_runs}")
        # Create dummy input
        dummy_input = self._create_dummy_input()
        if hasattr(server, 'predict'):
            def run():
                server.predict(dummy_input)
        else:
            # infer() takes the bare array rather than a feed dict.
            array = next(iter(dummy_input.values()))

            def run():
                server.infer(array)
        # Warmup
        for _ in range(10):
            run()
        # Measure with a monotonic, high-resolution clock.
        latencies = []
        for _ in range(n_runs):
            start = time.perf_counter()
            run()
            latencies.append((time.perf_counter() - start) * 1000)
        return float(np.mean(latencies))

    def _create_dummy_input(self):
        """Create a random float32 feed dict for benchmarking."""
        name = self._setting('input_name', 'input')
        shape = tuple(self._setting('input_shape', (1, 128)))
        return {name: np.random.randn(*shape).astype(np.float32)}
Summary
Containerization is essential for ML deployment:
- Docker Best Practices: Multi-stage builds, non-root users, health checks
- NVIDIA Triton: Optimized GPU inference with dynamic batching
- TensorRT: Model optimization for maximum performance
- ONNX Runtime: Cross-platform inference optimization
- Kubernetes: Scalable deployment with auto-scaling
Implement containerization to ensure consistent, portable, and optimized ML deployments.