Edge AI Deployment

Model Optimization for Edge

Edge deployment requires optimizing models for size, speed, and power consumption while maintaining acceptable accuracy.

ONNX Runtime Inference

import onnxruntime as ort
import numpy as np
from typing import Dict

class EdgeInference:
    """Thin wrapper around an ONNX Runtime session for single-input models.

    The session is created with the CPU execution provider only, and the
    name of the model's first declared input is cached for ``predict``.
    """

    def __init__(self, model_path: str):
        """Load the ONNX model at *model_path* into a CPU inference session."""
        self.session = ort.InferenceSession(
            model_path,
            providers=['CPUExecutionProvider']
        )
        # Only the first declared input is ever fed by ``predict``.
        self.input_name = self.session.get_inputs()[0].name

    def predict(self, input_data: np.ndarray) -> np.ndarray:
        """Run the model on *input_data* and return its first output."""
        outputs = self.session.run(
            None,
            {self.input_name: input_data}
        )
        return outputs[0]

    def benchmark(
        self,
        input_data: np.ndarray,
        n_runs: int = 100,
        warmup_runs: int = 0,
    ) -> Dict[str, float]:
        """Time repeated calls to ``predict`` and summarise the latencies.

        Args:
            input_data: Input passed unchanged to every ``predict`` call.
            n_runs: Number of timed calls; must be at least 1.
            warmup_runs: Untimed calls made before measuring. Defaults to 0,
                so the number of ``predict`` calls matches ``n_runs``.

        Returns:
            A dict with ``mean_latency_ms``, ``p95_latency_ms`` and
            ``throughput`` (calls per second, ``inf`` if the measured mean
            latency is zero).

        Raises:
            ValueError: If ``n_runs`` is less than 1 or ``warmup_runs`` is
                negative.
        """
        import time

        if n_runs < 1:
            raise ValueError("n_runs must be at least 1")
        if warmup_runs < 0:
            raise ValueError("warmup_runs must be non-negative")

        for _ in range(warmup_runs):
            self.predict(input_data)

        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with system clock adjustments and is too coarse for short runs.
        latencies = []
        for _ in range(n_runs):
            start = time.perf_counter()
            self.predict(input_data)
            latencies.append(time.perf_counter() - start)

        mean_seconds = float(np.mean(latencies))
        return {
            "mean_latency_ms": mean_seconds * 1000,
            "p95_latency_ms": float(np.percentile(latencies, 95)) * 1000,
            # Guard the division so a zero mean does not yield a warning/NaN.
            "throughput": 1.0 / mean_seconds if mean_seconds > 0 else float("inf"),
        }

# Example usage: load "model.onnx", run a single prediction, then benchmark
# with the default number of runs and print the mean latency.
# NOTE(review): `test_input` is not defined in this snippet — it must be
# provided by the surrounding code before these lines run.
inference = EdgeInference("model.onnx")
result = inference.predict(test_input)
benchmark = inference.benchmark(test_input)
print(f"Latency: {benchmark['mean_latency_ms']:.2f}ms")

Model Export and Conversion

import torch

def export_to_onnx(model, input_shape, output_path):
    """Export a PyTorch model to an ONNX file with a dynamic batch axis.

    Switches *model* to eval mode, builds a random example tensor of
    *input_shape*, and writes the exported graph (opset 13) to *output_path*.
    The exported tensors are named "input" and "output", and axis 0 of each
    is declared dynamic under the name "batch_size".
    """
    model.eval()
    example_input = torch.randn(input_shape)

    # Axis 0 of both the input and the output is exported as "batch_size".
    axes_spec = {
        tensor_name: {0: "batch_size"} for tensor_name in ("input", "output")
    }

    torch.onnx.export(
        model,
        example_input,
        output_path,
        opset_version=13,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes=axes_spec,
    )
    print(f"Exported to {output_path}")

def quantize_model(onnx_path, quantized_path):
    """Dynamically quantize an ONNX model and save the result.

    Runs ONNX Runtime's ``quantize_dynamic`` on the model at *onnx_path*
    with ``weight_type=QuantType.QUInt8`` and writes the quantized model to
    *quantized_path*.
    """
    # Imported lazily so the quantization tooling is only needed when used.
    from onnxruntime import quantization as ort_quantization

    quantize_options = {"weight_type": ort_quantization.QuantType.QUInt8}
    ort_quantization.quantize_dynamic(
        onnx_path, quantized_path, **quantize_options
    )
    print(f"Quantized model saved to {quantized_path}")

# Example usage: export with a (1, 3, 224, 224) example input, then quantize
# the exported file into "model_quantized.onnx".
# NOTE(review): `model` is not defined in this snippet — it must be provided
# by the surrounding code before these lines run.
export_to_onnx(model, (1, 3, 224, 224), "model.onnx")
quantize_model("model.onnx", "model_quantized.onnx")

TensorFlow Lite Conversion

import tensorflow as tf

def convert_to_tflite(saved_model_dir, tflite_path):
    """Convert a TensorFlow SavedModel to a TFLite file.

    The converter is configured with the default optimization set and
    ``tf.float16`` as the supported type; the converted model bytes are
    written to *tflite_path*.
    """
    tflite_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.target_spec.supported_types = [tf.float16]

    serialized_model = tflite_converter.convert()
    with open(tflite_path, "wb") as out_file:
        out_file.write(serialized_model)

    print(f"TFLite model saved to {tflite_path}")

def convert_with_quantization(saved_model_dir, tflite_path, calibration_data,
                              num_calibration_samples: int = 100):
    """Convert a SavedModel to an int8 TFLite file using calibration data.

    Args:
        saved_model_dir: Directory of the TensorFlow SavedModel to convert.
        tflite_path: Destination path for the converted model bytes.
        calibration_data: Iterable of calibration samples. It no longer has
            to support slicing; any iterable works. Each sample is yielded
            to the converter wrapped in a single-element list.
            NOTE(review): the converter may invoke the dataset function more
            than once, so prefer a re-iterable (list/array) over a one-shot
            generator — confirm against the TensorFlow version in use.
        num_calibration_samples: Maximum number of samples taken from
            *calibration_data*. Defaults to 100, the previously hard-coded
            limit.

    Raises:
        ValueError: If ``num_calibration_samples`` is less than 1.
    """
    from itertools import islice

    if num_calibration_samples < 1:
        raise ValueError("num_calibration_samples must be at least 1")

    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]

    def representative_dataset():
        # islice takes the first N items without requiring __getitem__ slices.
        for data in islice(calibration_data, num_calibration_samples):
            yield [data]

    converter.representative_dataset = representative_dataset
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS_INT8
    ]
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8

    tflite_model = converter.convert()

    with open(tflite_path, "wb") as f:
        f.write(tflite_model)

Mobile Deployment

class MobileModelManager:
    """Registry of TensorFlow Lite interpreters keyed by model name."""

    def __init__(self):
        """Start with no models loaded."""
        self.models = {}

    def load_model(self, name: str, path: str):
        """Create an interpreter for the model at *path* and store it as *name*."""
        import tensorflow as tf
        loaded = tf.lite.Interpreter(model_path=path)
        # Register first, then allocate, so the entry exists either way.
        self.models[name] = loaded
        loaded.allocate_tensors()

    def predict(self, name: str, input_data) -> np.ndarray:
        """Feed *input_data* to model *name* and return its first output tensor.

        Raises ``KeyError`` if no model was stored under *name*.
        """
        interp = self.models[name]
        inputs = interp.get_input_details()
        outputs = interp.get_output_details()

        # Only the first input and first output tensors are used.
        interp.set_tensor(inputs[0]['index'], input_data)
        interp.invoke()
        result = interp.get_tensor(outputs[0]['index'])
        return result

    def get_model_info(self, name: str) -> Dict:
        """Return the first input's shape/dtype and the first output's shape."""
        interp = self.models[name]
        inputs = interp.get_input_details()
        outputs = interp.get_output_details()

        info = {"input_shape": inputs[0]['shape']}
        info["output_shape"] = outputs[0]['shape']
        info["input_dtype"] = inputs[0]['dtype']
        return info

# Example usage: register "model.tflite" under the name "classifier" and run
# one prediction through it.
# NOTE(review): `test_image` is not defined in this snippet — it must be
# provided by the surrounding code before these lines run.
manager = MobileModelManager()
manager.load_model("classifier", "model.tflite")
prediction = manager.predict("classifier", test_image)

Edge-Cloud Hybrid

class EdgeCloudHybrid:
    """Run inference on the edge first and consult the cloud when unsure.

    The edge model must provide ``predict(input_data)`` and
    ``get_confidence(input_data)``. When the edge confidence is below
    ``confidence_threshold`` the cloud endpoint is queried and the two
    results are combined by ``fusion_strategy``.
    """

    def __init__(self, edge_model, cloud_endpoint, confidence_threshold=0.8,
                 cloud_timeout=5.0):
        """Store the collaborators and tuning knobs.

        Args:
            edge_model: Local model object (see class docstring).
            cloud_endpoint: URL that receives the POSTed input.
            confidence_threshold: Edge confidence below which the cloud is
                queried. Defaults to 0.8, the previously hard-coded value.
            cloud_timeout: Seconds to wait for the cloud request before
                giving up; without a timeout the request could block forever.
        """
        self.edge_model = edge_model
        self.cloud_endpoint = cloud_endpoint
        self.confidence_threshold = confidence_threshold
        self.cloud_timeout = cloud_timeout

    def predict(self, input_data):
        """Return the edge result, or the fused result when confidence is low."""
        edge_result = self.edge_model.predict(input_data)
        confidence = self.edge_model.get_confidence(input_data)

        if confidence < self.confidence_threshold:
            cloud_result = self.query_cloud(input_data)
            return self.fusion_strategy(edge_result, cloud_result)

        return edge_result

    def query_cloud(self, input_data):
        """POST *input_data* to the cloud endpoint and return its prediction.

        Returns ``None`` when the request fails, times out, returns an HTTP
        error status, or the response is not a JSON object with a
        ``"prediction"`` key. ``fusion_strategy`` treats ``None`` as "use the
        edge result", so a cloud outage degrades gracefully instead of
        raising out of ``predict``.
        """
        import logging
        import requests

        try:
            response = requests.post(
                self.cloud_endpoint,
                json={"data": input_data.tolist()},
                timeout=self.cloud_timeout,
            )
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers JSON decode failures on older requests versions.
            logging.getLogger(__name__).warning(
                "Cloud inference failed; falling back to edge result: %s", exc
            )
            return None

        if not isinstance(payload, dict):
            return None
        return payload.get("prediction")

    def fusion_strategy(self, edge_result, cloud_result):
        """Prefer the cloud result; fall back to the edge result if it is None."""
        return cloud_result if cloud_result is not None else edge_result

Best Practices

Profile models on target hardware before deployment
Use mixed precision for optimal performance
Implement model caching for repeated inferences
Monitor battery and thermal impact on mobile
Use hardware accelerators when available
Implement graceful degradation for failure cases (for example, fall back to the edge result when the cloud endpoint is unreachable)

Edge AI Deployment

Edge AI Deployment

Model Optimization for Edge

ONNX Runtime Inference

Model Export and Conversion

TensorFlow Lite Conversion

Mobile Deployment

Edge-Cloud Hybrid

Best Practices

Premium Content

Need Expert Generative AI Help?