Edge AI Deployment
Model Optimization for Edge
Edge deployment requires optimizing models for size, speed, and power consumption while maintaining acceptable accuracy.
ONNX Runtime Inference
import onnxruntime as ort
import numpy as np
from typing import Dict
class EdgeInference:
    """Run and benchmark single-input ONNX models on the CPU.

    The ONNX Runtime session is created once in the constructor and reused
    for every call to :meth:`predict` and :meth:`benchmark`.
    """

    def __init__(self, model_path: str):
        """Create an ONNX Runtime session for the model at ``model_path``.

        Only the CPU execution provider is requested.
        """
        self.session = ort.InferenceSession(
            model_path,
            providers=['CPUExecutionProvider']
        )
        # Only the first graph input is fed by predict(); models with
        # several inputs are not supported by this wrapper.
        self.input_name = self.session.get_inputs()[0].name

    def predict(self, input_data: np.ndarray) -> np.ndarray:
        """Run one inference and return the model's first output.

        Args:
            input_data: Value fed to the model's first input.

        Returns:
            The first element of the session's output list.
        """
        # Passing None as the output-name list asks the session for all
        # outputs; only the first one is handed back to the caller.
        outputs = self.session.run(
            None,
            {self.input_name: input_data}
        )
        return outputs[0]

    def benchmark(self, input_data: np.ndarray, n_runs: int = 100,
                  warmup_runs: int = 0):
        """Measure the latency of :meth:`predict` over repeated calls.

        Args:
            input_data: Input passed unchanged to every ``predict`` call.
            n_runs: Number of timed runs; must be at least 1.
            warmup_runs: Number of untimed runs executed first, so one-off
                start-up costs do not skew the statistics. Defaults to 0.

        Returns:
            A dict with ``mean_latency_ms``, ``p95_latency_ms`` and
            ``throughput`` (inferences per second).

        Raises:
            ValueError: If ``n_runs`` is below 1 or ``warmup_runs`` is
                negative.
        """
        import time

        # Without at least one sample the statistics would silently be NaN.
        if n_runs < 1:
            raise ValueError(f"n_runs must be at least 1, got {n_runs}")
        if warmup_runs < 0:
            raise ValueError(
                f"warmup_runs must be non-negative, got {warmup_runs}"
            )

        for _ in range(warmup_runs):
            self.predict(input_data)

        latencies = []
        for _ in range(n_runs):
            # perf_counter() is monotonic and high-resolution; time.time()
            # can jump when the system clock is adjusted and may be too
            # coarse to resolve short inferences.
            start = time.perf_counter()
            self.predict(input_data)
            latencies.append(time.perf_counter() - start)

        mean_latency = np.mean(latencies)
        # Guard the division in case every run was below timer resolution.
        throughput = 1.0 / mean_latency if mean_latency > 0 else float("inf")
        return {
            "mean_latency_ms": mean_latency * 1000,
            "p95_latency_ms": np.percentile(latencies, 95) * 1000,
            "throughput": throughput
        }
# Example usage: load a model from "model.onnx", run a single prediction,
# then benchmark it and report the mean latency in milliseconds.
# NOTE(review): `test_input` is not defined in this snippet; the caller must
# provide it (presumably an np.ndarray matching the model's input - confirm).
inference = EdgeInference("model.onnx")
result = inference.predict(test_input)
benchmark = inference.benchmark(test_input)
print(f"Latency: {benchmark['mean_latency_ms']:.2f}ms")
Model Export and Conversion
import torch
def export_to_onnx(model, input_shape, output_path, opset_version=13):
    """Export a PyTorch model to an ONNX file with a dynamic batch axis.

    Args:
        model: The PyTorch module to export. It is switched to eval mode
            as a side effect.
        input_shape: Shape of the random example input used for export,
            e.g. ``(1, 3, 224, 224)``.
        output_path: Destination path of the ``.onnx`` file.
        opset_version: ONNX opset to target. Defaults to 13.
    """
    model.eval()

    # Create the example input on the same device as the model's weights;
    # a CPU tensor fed to a model living on another device fails on export.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cpu")
    dummy_input = torch.randn(input_shape, device=device)

    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        opset_version=opset_version,
        input_names=["input"],
        output_names=["output"],
        # Axis 0 of both tensors is marked dynamic so the exported graph
        # accepts any batch size, not just the one in input_shape.
        dynamic_axes={
            "input": {0: "batch_size"},
            "output": {0: "batch_size"}
        }
    )
    print(f"Exported to {output_path}")
def quantize_model(onnx_path, quantized_path):
    """Apply ONNX Runtime dynamic quantization to a model file.

    Reads the model at ``onnx_path``, quantizes its weights to unsigned
    8-bit integers and writes the result to ``quantized_path``.
    """
    import onnxruntime.quantization as ort_quant

    ort_quant.quantize_dynamic(
        model_input=onnx_path,
        model_output=quantized_path,
        weight_type=ort_quant.QuantType.QUInt8,
    )
    print(f"Quantized model saved to {quantized_path}")
# Example usage: export the model to "model.onnx", then write a dynamically
# quantized copy to "model_quantized.onnx".
# NOTE(review): `model` is not defined in this snippet; the caller must
# provide it. The shape (1, 3, 224, 224) presumably means one 3-channel
# 224x224 image - confirm against the actual model.
export_to_onnx(model, (1, 3, 224, 224), "model.onnx")
quantize_model("model.onnx", "model_quantized.onnx")
TensorFlow Lite Conversion
import tensorflow as tf
def convert_to_tflite(saved_model_dir, tflite_path):
    """Convert a TensorFlow SavedModel to a TFLite flatbuffer file.

    The converter runs with default optimizations and float16 as the
    supported type. The serialized model is written to ``tflite_path``.
    """
    fp16_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    fp16_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    fp16_converter.target_spec.supported_types = [tf.float16]

    serialized_model = fp16_converter.convert()

    with open(tflite_path, "wb") as out_file:
        out_file.write(serialized_model)
    print(f"TFLite model saved to {tflite_path}")
def convert_with_quantization(saved_model_dir, tflite_path, calibration_data,
                              num_calibration_samples=100):
    """Convert a SavedModel to a full-integer (int8) TFLite model.

    Args:
        saved_model_dir: Directory containing the TensorFlow SavedModel.
        tflite_path: Destination path of the ``.tflite`` file.
        calibration_data: Iterable of representative input samples. Each
            sample is yielded to the converter as a one-element list. Pass
            something that can be iterated more than once (list, array,
            dataset), since the converter may restart the generator.
        num_calibration_samples: Maximum number of samples taken from the
            start of ``calibration_data``. Defaults to 100.

    Raises:
        ValueError: If ``num_calibration_samples`` is below 1.
    """
    from itertools import islice

    if num_calibration_samples < 1:
        raise ValueError(
            "num_calibration_samples must be at least 1, "
            f"got {num_calibration_samples}"
        )

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]

    def representative_dataset():
        # islice takes the leading samples without requiring slicing
        # support, so any iterable works, not only sequences.
        for data in islice(calibration_data, num_calibration_samples):
            yield [data]

    converter.representative_dataset = representative_dataset
    # Restrict the converter to int8 built-in ops and make the model's own
    # input and output tensors int8 as well.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS_INT8
    ]
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8

    tflite_model = converter.convert()
    with open(tflite_path, "wb") as f:
        f.write(tflite_model)
Mobile Deployment
class MobileModelManager:
    """Registry of named TFLite interpreters with simple inference helpers."""

    def __init__(self):
        """Start with an empty name-to-interpreter mapping."""
        self.models = {}

    def load_model(self, name: str, path: str):
        """Create an interpreter for the file at ``path`` and store it as ``name``.

        Tensors are allocated immediately so the interpreter is ready to use.
        """
        import tensorflow as tf

        loaded = tf.lite.Interpreter(model_path=path)
        # Register before allocating so a failed allocation still leaves
        # the entry in place.
        self.models[name] = loaded
        loaded.allocate_tensors()

    def predict(self, name: str, input_data) -> np.ndarray:
        """Run the model registered as ``name`` on ``input_data``.

        Only the first input and first output tensors are used.
        """
        runner = self.models[name]
        inputs = runner.get_input_details()
        outputs = runner.get_output_details()

        first_input_index = inputs[0]['index']
        runner.set_tensor(first_input_index, input_data)
        runner.invoke()

        first_output_index = outputs[0]['index']
        return runner.get_tensor(first_output_index)

    def get_model_info(self, name: str) -> Dict:
        """Describe the first input and output of the model registered as ``name``."""
        runner = self.models[name]
        inputs = runner.get_input_details()
        outputs = runner.get_output_details()

        first_in = inputs[0]
        first_out = outputs[0]
        info = {}
        info["input_shape"] = first_in['shape']
        info["output_shape"] = first_out['shape']
        info["input_dtype"] = first_in['dtype']
        return info
# Example usage: register the TFLite file "model.tflite" under the name
# "classifier" and run one prediction with it.
# NOTE(review): `test_image` is not defined in this snippet; the caller must
# provide it (presumably an array matching the model's input shape and
# dtype - confirm with get_model_info()).
manager = MobileModelManager()
manager.load_model("classifier", "model.tflite")
prediction = manager.predict("classifier", test_image)
Edge-Cloud Hybrid
class EdgeCloudHybrid:
    """Answer locally when the edge model is confident, else ask the cloud.

    The edge model must provide ``predict(input_data)`` and
    ``get_confidence(input_data)``. If the cloud cannot be reached or
    returns an unusable reply, the edge prediction is used instead.
    """

    def __init__(self, edge_model, cloud_endpoint,
                 confidence_threshold=0.8, cloud_timeout=10.0):
        """Configure the hybrid predictor.

        Args:
            edge_model: Object exposing ``predict`` and ``get_confidence``.
            cloud_endpoint: URL that receives the POST request.
            confidence_threshold: Edge confidence below which the cloud is
                consulted. Defaults to 0.8.
            cloud_timeout: Seconds to wait for the cloud before giving up.
                Defaults to 10.0.
        """
        self.edge_model = edge_model
        self.cloud_endpoint = cloud_endpoint
        self.confidence_threshold = confidence_threshold
        self.cloud_timeout = cloud_timeout

    def predict(self, input_data):
        """Return the edge prediction, or the fused result when confidence is low."""
        edge_result = self.edge_model.predict(input_data)
        confidence = self.edge_model.get_confidence(input_data)
        if confidence < self.confidence_threshold:
            cloud_result = self.query_cloud(input_data)
            return self.fusion_strategy(edge_result, cloud_result)
        return edge_result

    def query_cloud(self, input_data):
        """POST ``input_data`` to the cloud endpoint and return its prediction.

        The payload is ``{"data": input_data.tolist()}`` and the reply is
        expected to be a JSON object with a ``"prediction"`` key.

        Returns:
            The cloud prediction, or ``None`` when the request fails, times
            out, returns an HTTP error status, or the reply is malformed.
        """
        import logging
        import requests

        payload = {"data": input_data.tolist()}
        try:
            # Without a timeout the call can block indefinitely on a dead
            # or stalled connection.
            response = requests.post(
                self.cloud_endpoint,
                json=payload,
                timeout=self.cloud_timeout
            )
            response.raise_for_status()
            return response.json()["prediction"]
        except (requests.RequestException, ValueError,
                KeyError, TypeError) as exc:
            # Returning None lets fusion_strategy fall back to the edge
            # result instead of failing the whole prediction.
            logging.getLogger(__name__).warning(
                "Cloud inference failed, using edge result: %s", exc
            )
            return None

    def fusion_strategy(self, edge_result, cloud_result):
        """Prefer the cloud result; use the edge result when it is ``None``."""
        return cloud_result if cloud_result is not None else edge_result
Best Practices
- Profile models on target hardware before deployment
- Use mixed or reduced precision (e.g., FP16 or INT8) where the accuracy loss is acceptable
- Implement model caching for repeated inferences
- Monitor battery and thermal impact on mobile
- Use hardware accelerators when available
- Implement graceful degradation for failure cases (e.g., fall back to the on-device result when the cloud endpoint is unreachable)