Monitoring - Metrics, Health Checks, APM Tools

DevOps · Monitoring · Free Lesson

Advertisement

Introduction

Monitoring provides visibility into application health and performance. This tutorial covers metrics collection, health checks, and Application Performance Monitoring (APM) tools.

Health Check Endpoints

# health.py
from datetime import datetime, timezone

import psutil
from fastapi import APIRouter
from fastapi.responses import JSONResponse

router = APIRouter()

@router.get("/health")
async def basic_health():
    """Return a minimal health payload.

    Returns:
        A dict with a constant ``"healthy"`` status and the current UTC
        time as an ISO-8601 string.
    """
    return {
        "status": "healthy",
        # Timezone-aware "now": ``datetime.utcnow()`` is deprecated since
        # Python 3.12 and produces a naive value whose ISO string carries
        # no UTC marker, leaving consumers to guess the zone.
        "timestamp": datetime.now(timezone.utc).isoformat()
    }

@router.get("/health/detailed")
async def detailed_health():
    """Return per-dependency health information.

    Calls ``check_database()`` and ``check_redis()`` and reports each result
    individually; the overall status is ``"healthy"`` only when both checks
    pass. The outcome is conveyed in the response body.

    Returns:
        A dict with the overall ``status``, a ``checks`` mapping of
        dependency name to ``"ok"``/``"failed"``, and a UTC ISO-8601
        ``timestamp``.
    """
    # Check database
    db_healthy = check_database()

    # Check external services
    redis_healthy = check_redis()

    status = "healthy" if db_healthy and redis_healthy else "unhealthy"

    return {
        "status": status,
        "checks": {
            "database": "ok" if db_healthy else "failed",
            "redis": "ok" if redis_healthy else "failed"
        },
        # Timezone-aware "now": ``datetime.utcnow()`` is deprecated since
        # Python 3.12 and its ISO string has no UTC marker.
        "timestamp": datetime.now(timezone.utc).isoformat()
    }

@router.get("/health/ready")
async def readiness_check():
    """Check if service is ready to accept traffic.

    Returns:
        ``{"status": "ready"}`` with the default 200 status when
        ``is_database_ready()`` is truthy; otherwise a 503 response whose
        body is ``{"status": "not ready"}``.
    """
    if not is_database_ready():
        # FastAPI does not treat a ``(body, status)`` tuple as Flask does:
        # the tuple would be serialised as a JSON array and sent with
        # HTTP 200, so a probe would never see the failure. An explicit
        # response object is required to set the status code.
        return JSONResponse(status_code=503, content={"status": "not ready"})

    return {"status": "ready"}

@router.get("/health/live")
async def liveness_check():
    """Liveness probe: answer with a constant payload.

    Performs no dependency checks; a response of any kind shows the
    process is able to serve requests.
    """
    payload = {"status": "alive"}
    return payload

def check_database() -> bool:
    """Report whether the database connection is usable.

    This is a placeholder: no real probe is performed yet, so the function
    currently always returns ``True``.

    Returns:
        ``True`` when the check succeeds, ``False`` when it raises.
    """
    try:
        # Check database connection
        return True
    except Exception:
        # ``Exception`` rather than a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed and reported
        # as a database failure.
        return False

Metrics with Prometheus

from prometheus_client import Counter, Histogram, Gauge
import time

# HTTP traffic metrics: one counter and one latency histogram, both
# labelled by request method and endpoint.
REQUEST_COUNT = Counter(
    name="http_requests_total",
    documentation="Total HTTP requests",
    labelnames=["method", "endpoint", "status"],
)

REQUEST_DURATION = Histogram(
    name="http_request_duration_seconds",
    documentation="HTTP request duration",
    labelnames=["method", "endpoint"],
    # Upper bounds of the latency buckets, in seconds.
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
)

# Application-level (business) metrics.
ACTIVE_USERS = Gauge(name="active_users", documentation="Number of active users")

TASKS_PROCESSED = Counter(
    name="tasks_processed_total",
    documentation="Total tasks processed",
    labelnames=["status"],
)

# Middleware to track requests
@app.middleware("http")
async def metrics_middleware(request, call_next):
    """Record a count and a duration sample for every HTTP request.

    Args:
        request: The incoming request object.
        call_next: Callable that forwards the request down the stack and
            returns the response.

    Returns:
        The response produced by ``call_next``, unchanged.

    Raises:
        Whatever ``call_next`` raises; the request is still recorded (with
        status 500) before the exception propagates.
    """
    # perf_counter is monotonic, so the measured duration cannot go
    # negative or jump when the wall clock is adjusted.
    start_time = time.perf_counter()
    # Assume a server error until a response is actually produced, so that
    # requests which raise are still counted.
    status_code = 500

    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration = time.perf_counter() - start_time

        # Prefer the matched route's path template (e.g. "/users/{user_id}")
        # over the concrete URL so that each distinct id does not create a
        # new time series. Falls back to the raw path when no route object
        # is present in the ASGI scope (e.g. unmatched requests).
        route = request.scope.get("route")
        endpoint = getattr(route, "path", None) or request.url.path

        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=status_code
        ).inc()

        REQUEST_DURATION.labels(
            method=request.method,
            endpoint=endpoint
        ).observe(duration)

Custom Metrics

# custom_metrics.py
from prometheus_client import Info, Enum

# Static build/deployment metadata exposed as an Info metric.
app_info = Info(name="app", documentation="Application info")
app_info.info({"version": "1.0.0", "environment": "production"})

# State metric restricted to the three listed states.
task_status = Enum(
    name="task_status",
    documentation="Task execution status",
    states=["success", "failed", "pending"],
)

# Using in code
@app.on_event("startup")
async def startup():
    """Publish application metadata when the app starts.

    Sets the same key/value pairs as the module-level ``app_info.info(...)``
    call. Setting only ``version`` here would replace that value and drop
    the ``environment`` entry once the application has started.
    """
    app_info.info({'version': '1.0.0', 'environment': 'production'})

def process_task(task_id: str):
    """Run a task and count the outcome in ``TASKS_PROCESSED``.

    Args:
        task_id: Identifier passed through to ``do_task``.

    Raises:
        Exception: Any exception raised by ``do_task`` is re-raised after
            the ``failed`` counter has been incremented.
    """
    try:
        # Keep the try body to the one call whose failure we want to count.
        do_task(task_id)
    except Exception:
        TASKS_PROCESSED.labels(status='failed').inc()
        raise
    else:
        TASKS_PROCESSED.labels(status='success').inc()

APM Integration

# opentelemetry_example.py
from opentelemetry import trace
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# Tracing setup: install the SDK tracer provider, then attach a batching
# span processor that ships finished spans to a Jaeger agent.
# NOTE(review): the Jaeger Thrift exporter appears to be deprecated
# upstream in favour of OTLP — confirm before using this in new code.
trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer(__name__)

# Agent address: host "localhost", port 6831.
jaeger_exporter = JaegerExporter(agent_host_name="localhost", agent_port=6831)

span_processor = BatchSpanProcessor(jaeger_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)

# Usage
@app.get("/users/{user_id}")
async def get_user(user_id: int):
    """Look up a user inside a ``get_user`` trace span.

    The span is tagged with the requested ``user_id`` and with a
    ``user_found`` flag telling whether the lookup returned something
    other than ``None``.

    Returns:
        Whatever ``db.get_user(user_id)`` returns.
    """
    with tracer.start_as_current_span("get_user") as current_span:
        current_span.set_attribute("user_id", user_id)

        record = db.get_user(user_id)
        was_found = record is not None
        current_span.set_attribute("user_found", was_found)

        return record

Practice Problems

  1. Create a custom exporter for metrics
  2. Implement distributed tracing across services
  3. Add custom metrics for business KPIs
  4. Create alerting rules for critical metrics
  5. Implement log correlation with traces

Advertisement

Need Expert Python Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement