Monitoring - Metrics, Health Checks, APM Tools

Introduction

Monitoring provides visibility into application health and performance. This tutorial covers metrics collection, health checks, and Application Performance Monitoring (APM) tools.

Health Check Endpoints

# health.py
from datetime import datetime
from datetime import timezone

import psutil
from fastapi import APIRouter
from fastapi.responses import JSONResponse

# Router on which the health-check endpoints in this snippet are registered.
router = APIRouter()

@router.get("/health")
async def basic_health():
    """Return a minimal health payload.

    Returns:
        A dict with a constant ``"status"`` of ``"healthy"`` and a
        ``"timestamp"`` holding the current UTC time as an ISO 8601 string
        that carries an explicit ``+00:00`` offset.
    """
    # datetime.utcnow() is deprecated (Python 3.12+) and produces a naive
    # value; an aware datetime keeps the UTC offset visible in the output.
    return {
        "status": "healthy",
        "timestamp": datetime.now(timezone.utc).isoformat()
    }

@router.get("/health/detailed")
async def detailed_health():
    """Report overall health together with a per-dependency breakdown.

    Runs ``check_database()`` and ``check_redis()`` (both are always called)
    and reports ``"healthy"`` only when every check is truthy.

    Returns:
        A dict with ``"status"`` (``"healthy"`` or ``"unhealthy"``),
        ``"checks"`` mapping each dependency name to ``"ok"`` or
        ``"failed"``, and ``"timestamp"`` holding the current UTC time as an
        ISO 8601 string with an explicit ``+00:00`` offset.
    """
    # One mapping drives both the overall verdict and the per-check report,
    # so adding a dependency only needs a new entry here.
    results = {
        "database": check_database(),
        "redis": check_redis(),
    }

    status = "healthy" if all(results.values()) else "unhealthy"

    return {
        "status": status,
        "checks": {
            name: "ok" if passed else "failed"
            for name, passed in results.items()
        },
        # datetime.utcnow() is deprecated (Python 3.12+) and naive; use an
        # aware UTC datetime so the offset is part of the serialized value.
        "timestamp": datetime.now(timezone.utc).isoformat()
    }

@router.get("/health/ready")
async def readiness_check():
    """Check if service is ready to accept traffic.

    Returns:
        ``{"status": "ready"}`` with the default 200 status when
        ``is_database_ready()`` is truthy; otherwise a ``JSONResponse`` with
        HTTP 503 and the body ``{"status": "not ready"}``.
    """
    if not is_database_ready():
        # A ``(body, status)`` tuple is a Flask idiom. FastAPI would
        # serialize the tuple as a JSON array and still answer 200, so the
        # 503 has to be set on an explicit response object.
        return JSONResponse(status_code=503, content={"status": "not ready"})

    return {"status": "ready"}

@router.get("/health/live")
async def liveness_check():
    """Liveness probe: answer with a fixed payload, no dependency checks."""
    payload = {"status": "alive"}
    return payload

def check_database() -> bool:
    """Report whether the database connectivity probe succeeds.

    The probe itself is a placeholder in this snippet: the ``try`` body
    contains no real database call yet and simply returns True.

    Returns:
        True when the probe completes, False when it raises an exception.
    """
    try:
        # Check database connection
        return True
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed by the probe.
        return False

Metrics with Prometheus

from prometheus_client import Counter, Histogram, Gauge
import time

# Request metrics: a counter of handled requests and a latency histogram.
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=('method', 'endpoint', 'status'),
)

# Bucket upper bounds are in seconds, matching the metric name.
REQUEST_DURATION = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request duration',
    labelnames=('method', 'endpoint'),
    buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0),
)

# Business metrics: an unlabelled gauge and a per-status task counter.
ACTIVE_USERS = Gauge(
    name='active_users',
    documentation='Number of active users',
)

TASKS_PROCESSED = Counter(
    name='tasks_processed_total',
    documentation='Total tasks processed',
    labelnames=('status',),
)

# Middleware to track requests
@app.middleware("http")
async def metrics_middleware(request, call_next):
    """Record a request count and a latency observation for each request.

    Args:
        request: The incoming request handed to the middleware.
        call_next: Awaitable callable that runs the rest of the stack and
            returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.

    Raises:
        Whatever ``call_next`` raises. The request is still recorded, with
        status 500, before the exception propagates.
    """
    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    # Assume a server error until a response actually comes back, so that
    # requests whose handler raises are not missing from the metrics.
    status_code = 500
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration = time.perf_counter() - start_time

        # Prefer the matched route's path template (e.g. "/users/{user_id}")
        # over the raw URL so that path parameters do not create one time
        # series per distinct value. Falls back to the raw path when no
        # route was matched.
        # NOTE(review): relies on the router storing the matched route under
        # scope["route"] - confirm for the Starlette version in use.
        route = request.scope.get("route")
        endpoint = getattr(route, "path", None) or request.url.path

        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=status_code
        ).inc()

        REQUEST_DURATION.labels(
            method=request.method,
            endpoint=endpoint
        ).observe(duration)

Custom Metrics

# custom_metrics.py
from prometheus_client import Info, Enum

# Static application metadata exposed as an Info metric.
app_info = Info(name='app', documentation='Application info')
app_info.info({'version': '1.0.0', 'environment': 'production'})

# Enum metric restricted to the three states listed below.
task_status = Enum(
    name='task_status',
    documentation='Task execution status',
    states=['success', 'failed', 'pending'],
)

# Using in code
@app.on_event("startup")
async def startup():
    """Publish application metadata on the ``app_info`` metric at startup."""
    # Info.info() replaces the whole key/value set rather than merging, so
    # 'environment' is repeated here; passing only 'version' would drop it
    # from the values set when the metric was created.
    app_info.info({'version': '1.0.0', 'environment': 'production'})

def process_task(task_id: str):
    """Run a task and count the outcome on ``TASKS_PROCESSED``.

    Args:
        task_id: Identifier passed through to ``do_task``.

    Raises:
        Exception: Any exception raised by ``do_task`` is re-raised after
            the ``failed`` counter has been incremented.
    """
    try:
        # Keep only the task itself inside the try block so that an error
        # while recording the success metric is not counted as a task
        # failure.
        do_task(task_id)
    except Exception:
        TASKS_PROCESSED.labels(status='failed').inc()
        raise
    else:
        TASKS_PROCESSED.labels(status='success').inc()

APM Integration

# opentelemetry_example.py
from opentelemetry import trace
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Configure tracing: install an SDK TracerProvider as the global provider and
# obtain a tracer named after this module.
trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer(__name__)

# Exporter pointed at an agent on localhost, port 6831.
# NOTE(review): the Jaeger Thrift exporter appears to be deprecated upstream
# in favour of OTLP - confirm before using this in new code.
jaeger_exporter = JaegerExporter(
    agent_host_name="localhost",
    agent_port=6831,
)

# Attach the exporter to the current global provider through a
# BatchSpanProcessor.
trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(jaeger_exporter)
)

# Usage
@app.get("/users/{user_id}")
async def get_user(user_id: int):
    """Look up a user inside a ``get_user`` span.

    The span is tagged with the requested ``user_id`` and with a
    ``user_found`` flag telling whether the lookup returned something other
    than ``None``.

    Args:
        user_id: Identifier taken from the URL path.

    Returns:
        Whatever ``db.get_user`` returns for ``user_id``.
    """
    with tracer.start_as_current_span("get_user") as active_span:
        active_span.set_attribute("user_id", user_id)

        record = db.get_user(user_id)
        was_found = record is not None
        active_span.set_attribute("user_found", was_found)

        return record

Practice Problems

1. Create a custom exporter for metrics.
2. Implement distributed tracing across services.
3. Add custom metrics for business KPIs.
4. Create alerting rules for critical metrics.
5. Implement log correlation with traces.

Monitoring - Metrics, Health Checks, APM Tools

Introduction

Health Check Endpoints

Metrics with Prometheus

Custom Metrics

APM Integration

Practice Problems

Need Expert Python Help?