🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services ℹ️ About ✉️ Contact View Pricing Plans from $10

Edge AI Deployment

🟢 Free Lesson

Advertisement

Edge AI Deployment

Edge AI Deployment
Cloud Model · Optimization · Compression · Edge Runtime · Inference
Edge Platforms
  • TensorFlow Lite (Mobile)
  • ONNX Runtime (Cross-platform)
  • Core ML (Apple)
  • TensorRT (NVIDIA)
Optimization Methods
  • INT8/INT4 Quantization
  • Knowledge Distillation
  • Structured Pruning
  • Operator Fusion

Model Optimization for Edge

Edge deployment requires optimizing models for size, speed, and power consumption while maintaining acceptable accuracy.

ONNX Runtime Inference

import onnxruntime as ort
import numpy as np
from typing import Dict

class EdgeInference:
    """Thin wrapper around an ONNX Runtime session for CPU inference.

    The session is created once with the ``CPUExecutionProvider`` and the
    name of the model's first input is cached so that ``predict`` can build
    the feed dict without querying the session again.
    """

    def __init__(self, model_path: str):
        """Create the inference session for the model stored at *model_path*."""
        self.session = ort.InferenceSession(
            model_path,
            providers=['CPUExecutionProvider']
        )
        # Only the first declared model input is ever fed.
        self.input_name = self.session.get_inputs()[0].name

    def predict(self, input_data: np.ndarray) -> np.ndarray:
        """Run the model on *input_data* and return its first output."""
        outputs = self.session.run(
            None,  # no explicit output names: let the session return all outputs
            {self.input_name: input_data}
        )
        return outputs[0]

    def benchmark(self, input_data: np.ndarray, n_runs: int = 100):
        """Time ``n_runs`` sequential calls to ``predict``.

        Args:
            input_data: Input passed unchanged to every ``predict`` call.
            n_runs: Number of timed calls; must be at least 1.

        Returns:
            A dict with ``mean_latency_ms``, ``p95_latency_ms`` and
            ``throughput`` (calls per second, derived from the mean latency).

        Raises:
            ValueError: If ``n_runs`` is smaller than 1.
        """
        import time

        if n_runs < 1:
            # An empty sample would only produce NaN statistics.
            raise ValueError("n_runs must be at least 1")

        latencies = []
        for _ in range(n_runs):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump with system clock adjustments and is too coarse for
            # sub-millisecond inference on some platforms.
            start = time.perf_counter()
            self.predict(input_data)
            latencies.append(time.perf_counter() - start)

        mean_latency = float(np.mean(latencies))
        return {
            "mean_latency_ms": mean_latency * 1000,
            "p95_latency_ms": float(np.percentile(latencies, 95)) * 1000,
            # Guard against a zero mean if the clock cannot resolve the call.
            "throughput": 1.0 / mean_latency if mean_latency > 0 else float("inf"),
        }

# Example usage: load "model.onnx", run one prediction, then benchmark the
# model and print its mean latency.
# NOTE(review): `test_input` is not defined anywhere in this snippet; it must
# be provided beforehand (presumably an np.ndarray matching the model's
# input - confirm).
inference = EdgeInference("model.onnx")
result = inference.predict(test_input)
benchmark = inference.benchmark(test_input)
print(f"Latency: {benchmark['mean_latency_ms']:.2f}ms")

Model Export and Conversion

import torch

def export_to_onnx(model, input_shape, output_path):
    """Export *model* to an ONNX file at *output_path*.

    The model is switched to eval mode and exported with a random tensor of
    shape *input_shape* as the example input. The graph uses opset 13, names
    its tensors "input" and "output", and marks axis 0 of both as the
    dynamic "batch_size" axis.
    """
    model.eval()
    example_input = torch.randn(input_shape)

    export_options = {
        "opset_version": 13,
        "input_names": ["input"],
        "output_names": ["output"],
        # Axis 0 of both tensors is declared dynamic under the same label.
        "dynamic_axes": {
            tensor_name: {0: "batch_size"}
            for tensor_name in ("input", "output")
        },
    }
    torch.onnx.export(model, example_input, output_path, **export_options)

    print(f"Exported to {output_path}")

def quantize_model(onnx_path, quantized_path):
    """Dynamically quantize the ONNX model at *onnx_path*.

    Weights are quantized to unsigned 8-bit (``QuantType.QUInt8``) and the
    result is written to *quantized_path*.
    """
    # Imported lazily so onnxruntime's quantization tooling is only needed
    # when this function is actually called.
    import onnxruntime.quantization as ort_quant

    ort_quant.quantize_dynamic(
        onnx_path,
        quantized_path,
        weight_type=ort_quant.QuantType.QUInt8,
    )
    print(f"Quantized model saved to {quantized_path}")

# Example usage: export with a (1, 3, 224, 224) example input, then quantize
# the exported file into "model_quantized.onnx".
# NOTE(review): `model` is not defined anywhere in this snippet; presumably a
# torch.nn.Module created by the caller - confirm.
export_to_onnx(model, (1, 3, 224, 224), "model.onnx")
quantize_model("model.onnx", "model_quantized.onnx")

TensorFlow Lite Conversion

import tensorflow as tf

def convert_to_tflite(saved_model_dir, tflite_path):
    """Convert the SavedModel in *saved_model_dir* to a TFLite file.

    The converter runs with the default optimization set and float16 listed
    as a supported target type; the converted bytes are written to
    *tflite_path*.
    """
    tflite_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)

    # Default optimizations, with float16 allowed as a target type.
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_converter.target_spec.supported_types = [tf.float16]

    flatbuffer = tflite_converter.convert()

    with open(tflite_path, "wb") as out_file:
        out_file.write(flatbuffer)

    print(f"TFLite model saved to {tflite_path}")

def convert_with_quantization(saved_model_dir, tflite_path, calibration_data):
    """Convert a SavedModel to an int8 TFLite file using calibration data.

    The converter is restricted to the ``TFLITE_BUILTINS_INT8`` op set with
    int8 input and output types. At most the first 100 items of
    *calibration_data* are fed to it as the representative dataset. The
    converted bytes are written to *tflite_path*.
    """
    def calibration_samples():
        # Each calibration item is yielded wrapped in a one-element list.
        yield from ([sample] for sample in calibration_data[:100])

    int8_converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    int8_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    int8_converter.representative_dataset = calibration_samples

    # Restrict to int8 builtin ops and int8 model inputs/outputs.
    int8_converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS_INT8
    ]
    int8_converter.inference_input_type = tf.int8
    int8_converter.inference_output_type = tf.int8

    flatbuffer = int8_converter.convert()

    with open(tflite_path, "wb") as out_file:
        out_file.write(flatbuffer)

Mobile Deployment

class MobileModelManager:
    """Registry of named TFLite interpreters with simple inference helpers.

    Interpreters are stored in ``self.models`` keyed by the name given to
    ``load_model``. Looking up an unknown name raises ``KeyError``.
    """

    def __init__(self):
        """Start with an empty model registry."""
        self.models = {}

    def load_model(self, name: str, path: str):
        """Load the TFLite model at *path* and register it under *name*."""
        import tensorflow as tf

        loaded = tf.lite.Interpreter(model_path=path)
        # Register first, then allocate, matching the original ordering.
        self.models[name] = loaded
        self.models[name].allocate_tensors()

    def predict(self, name: str, input_data) -> np.ndarray:
        """Run model *name* on *input_data* and return its first output tensor."""
        model = self.models[name]
        in_specs = model.get_input_details()
        out_specs = model.get_output_details()

        # Feed the first input slot, run, then read the first output slot.
        in_index = in_specs[0]['index']
        model.set_tensor(in_index, input_data)
        model.invoke()

        out_index = out_specs[0]['index']
        return model.get_tensor(out_index)

    def get_model_info(self, name: str) -> Dict:
        """Return shape and dtype details for the first input/output of *name*."""
        model = self.models[name]
        in_specs = model.get_input_details()
        out_specs = model.get_output_details()

        primary_in = in_specs[0]
        return dict(
            input_shape=primary_in['shape'],
            output_shape=out_specs[0]['shape'],
            input_dtype=primary_in['dtype'],
        )

# Example usage: register "model.tflite" under the name "classifier" and run
# a single prediction through it.
# NOTE(review): `test_image` is not defined anywhere in this snippet;
# presumably an array matching the model's input shape and dtype - confirm.
manager = MobileModelManager()
manager.load_model("classifier", "model.tflite")
prediction = manager.predict("classifier", test_image)

Edge-Cloud Hybrid

class EdgeCloudHybrid:
    """Answer from an edge model, consulting a cloud endpoint when unsure.

    The edge model must provide ``predict(input_data)`` and
    ``get_confidence(input_data)``. When the reported confidence is below
    ``confidence_threshold`` the input is also sent to ``cloud_endpoint`` and
    the two results are combined by ``fusion_strategy``.
    """

    def __init__(self, edge_model, cloud_endpoint, *,
                 confidence_threshold=0.8, timeout=10.0):
        """Store the collaborators and tuning knobs.

        Args:
            edge_model: Local model used for every prediction.
            cloud_endpoint: URL that receives low-confidence inputs.
            confidence_threshold: Edge confidence below which the cloud is
                consulted.
            timeout: Seconds to wait for the cloud request before giving up.
        """
        self.edge_model = edge_model
        self.cloud_endpoint = cloud_endpoint
        self.confidence_threshold = confidence_threshold
        self.timeout = timeout

    def predict(self, input_data):
        """Return the edge prediction, fused with the cloud's when unsure."""
        edge_result = self.edge_model.predict(input_data)
        confidence = self.edge_model.get_confidence(input_data)

        if confidence < self.confidence_threshold:
            cloud_result = self.query_cloud(input_data)
            return self.fusion_strategy(edge_result, cloud_result)

        return edge_result

    def query_cloud(self, input_data):
        """POST *input_data* to the cloud endpoint and return its prediction.

        Returns:
            The ``"prediction"`` field of the JSON response, or ``None`` when
            the request fails, times out, returns an error status, or the
            response cannot be parsed. ``None`` lets ``fusion_strategy``
            fall back to the edge result.
        """
        import logging
        import requests

        try:
            response = requests.post(
                self.cloud_endpoint,
                json={"data": input_data.tolist()},
                # Without a timeout, requests may block indefinitely.
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()["prediction"]
        except (requests.RequestException, ValueError, KeyError) as exc:
            # The edge result is already computed, so a cloud failure should
            # degrade to it rather than abort the whole prediction.
            logging.getLogger(__name__).warning(
                "Cloud inference failed; falling back to edge result: %s", exc
            )
            return None

    def fusion_strategy(self, edge_result, cloud_result):
        """Prefer the cloud result; use the edge result when it is ``None``."""
        return cloud_result if cloud_result is not None else edge_result

Best Practices

  • Profile models on target hardware before deployment
  • Use mixed precision for optimal performance
  • Implement model caching for repeated inferences
  • Monitor battery and thermal impact on mobile
  • Use hardware accelerators when available
  • Implement graceful degradation for edge cases
⭐

Premium Content

Edge AI Deployment

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement