Introduction
Monitoring provides visibility into application health and performance. This tutorial covers metrics collection, health checks, and Application Performance Monitoring (APM) tools.
Health Check Endpoints
# health.py
from fastapi import APIRouter
from datetime import datetime
import psutil
router = APIRouter()
@router.get("/health")
async def basic_health():
    """Return a minimal health payload.

    Returns:
        dict: ``{"status": "healthy", "timestamp": <ISO-8601 string>}``.
        The timestamp is timezone-aware UTC, so it carries a ``+00:00``
        offset.
    """
    # Function-scope import: the module header only imports ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # value; an aware datetime makes the UTC offset explicit for consumers.
    return {
        "status": "healthy",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
@router.get("/health/detailed")
async def detailed_health():
    """Report overall health plus a per-dependency breakdown.

    Calls ``check_database()`` and ``check_redis()``; the overall status is
    ``"healthy"`` only when both return a truthy value.

    Returns:
        dict: ``status`` (``"healthy"`` / ``"unhealthy"``), ``checks`` (an
        ``"ok"`` / ``"failed"`` entry per dependency) and ``timestamp``
        (ISO-8601, timezone-aware UTC).
    """
    # Function-scope import: the module header only imports ``datetime``.
    from datetime import timezone

    # Check database
    db_healthy = check_database()
    # Check external services
    redis_healthy = check_redis()

    status = "healthy" if db_healthy and redis_healthy else "unhealthy"
    return {
        "status": status,
        "checks": {
            "database": "ok" if db_healthy else "failed",
            "redis": "ok" if redis_healthy else "failed",
        },
        # datetime.utcnow() is deprecated since Python 3.12 and is naive;
        # use an aware UTC datetime so the offset is explicit.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
@router.get("/health/ready")
async def readiness_check():
    """Check if service is ready to accept traffic.

    Returns:
        ``{"status": "ready"}`` (HTTP 200) when ``is_database_ready()`` is
        truthy; otherwise a ``JSONResponse`` with HTTP 503 and body
        ``{"status": "not ready"}``.
    """
    # Function-scope import so this block does not depend on the file header.
    from fastapi.responses import JSONResponse

    if not is_database_ready():
        # FastAPI does not treat a ``(body, status)`` tuple as Flask does: the
        # tuple would be serialized as a JSON array and sent with HTTP 200.
        # An explicit response object is needed for the 503 to reach clients.
        return JSONResponse(status_code=503, content={"status": "not ready"})
    return {"status": "ready"}
@router.get("/health/live")
async def liveness_check():
    """Liveness probe: answers with a fixed payload and checks nothing else."""
    payload = {"status": "alive"}
    return payload
def check_database() -> bool:
    """Report whether the database is reachable.

    Placeholder implementation: the actual connection check is not written
    yet, so this currently always returns ``True``.

    Returns:
        bool: ``True`` when the check succeeds, ``False`` when it raises.
    """
    try:
        # Check database connection
        return True
    except Exception:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit;
        # only ordinary errors should be reported as "unhealthy".
        return False
Metrics with Prometheus
from prometheus_client import Counter, Histogram, Gauge
import time
# Request metrics: a per-request counter and a latency histogram, both
# labelled by HTTP method and endpoint (the counter also by status).
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)
REQUEST_DURATION = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request duration',
    labelnames=['method', 'endpoint'],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
)

# Business metrics: current active users and processed tasks by outcome.
ACTIVE_USERS = Gauge(name='active_users', documentation='Number of active users')
TASKS_PROCESSED = Counter(
    name='tasks_processed_total',
    documentation='Total tasks processed',
    labelnames=['status'],
)
def _observe_request(request, status_code, duration):
    """Record one finished request in REQUEST_COUNT and REQUEST_DURATION."""
    # Prefer the matched route's path template (e.g. "/users/{user_id}") so
    # every distinct URL does not become its own time series. Requests that
    # matched no route fall back to the raw path.
    # NOTE(review): relies on the router storing the matched route under
    # scope["route"]; confirm for the framework version in use.
    route = request.scope.get("route")
    endpoint = getattr(route, "path", None) or request.url.path

    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=endpoint,
        status=status_code,
    ).inc()
    REQUEST_DURATION.labels(
        method=request.method,
        endpoint=endpoint,
    ).observe(duration)


# Middleware to track requests
@app.middleware("http")
async def metrics_middleware(request, call_next):
    """Count every HTTP request and time how long it takes.

    Args:
        request: The incoming request object.
        call_next: Awaitable callable that runs the rest of the stack and
            returns the response.

    Returns:
        The response produced by ``call_next``, unmodified.

    Raises:
        Exception: Anything raised by ``call_next`` is re-raised after the
            request has been recorded with status 500.
    """
    # perf_counter() is monotonic; time.time() can jump when the wall clock
    # is adjusted, which would produce negative or inflated durations.
    start_time = time.perf_counter()
    try:
        response = await call_next(request)
    except Exception:
        # Without this branch, failing requests would never be counted.
        _observe_request(request, 500, time.perf_counter() - start_time)
        raise

    _observe_request(request, response.status_code, time.perf_counter() - start_time)
    return response
Custom Metrics
# custom_metrics.py
from prometheus_client import Info, Enum
# Static build/deployment metadata exposed as an info metric.
app_info = Info(name='app', documentation='Application info')
app_info.info({'version': '1.0.0', 'environment': 'production'})

# State metric restricted to the three listed task states.
task_status = Enum(
    name='task_status',
    documentation='Task execution status',
    states=['success', 'failed', 'pending'],
)
# Using in code
@app.on_event("startup")
async def startup():
    """Publish application metadata on the ``app_info`` metric at startup."""
    # Info.info() replaces the whole value, so passing only ``version`` here
    # would drop the ``environment`` key set when ``app_info`` was defined.
    app_info.info({'version': '1.0.0', 'environment': 'production'})
def process_task(task_id: str):
    """Run a task and count its outcome in ``TASKS_PROCESSED``.

    Args:
        task_id: Identifier passed through to ``do_task``.

    Raises:
        Exception: Whatever ``do_task`` raises is re-raised after the
            ``failed`` counter has been incremented.
    """
    try:
        # Keep only the call that can fail inside the try, so a problem while
        # recording the success metric is not miscounted as a task failure.
        do_task(task_id)
    except Exception:
        TASKS_PROCESSED.labels(status='failed').inc()
        raise
    else:
        TASKS_PROCESSED.labels(status='success').inc()
APM Integration
# opentelemetry_example.py
from opentelemetry import trace
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
# Configure tracing
# Install a fresh SDK TracerProvider as the global provider.
# NOTE(review): the statements below act on global tracing state, so their
# order is kept as-is; confirm before rearranging.
trace.set_tracer_provider(TracerProvider())
# Module-level tracer named after this module; used by the handler below.
tracer = trace.get_tracer(__name__)
# Exporter addressed to a Jaeger agent at localhost:6831.
# NOTE(review): host and port are hard-coded; presumably these should come
# from configuration outside local development. Confirm.
jaeger_exporter = JaegerExporter(
agent_host_name="localhost",
agent_port=6831,
)
# Register the exporter on the global provider through a BatchSpanProcessor.
trace.get_tracer_provider().add_span_processor(
BatchSpanProcessor(jaeger_exporter)
)
# Usage
@app.get("/users/{user_id}")
async def get_user(user_id: int):
    """Fetch a user via ``db.get_user`` inside a ``get_user`` span.

    The span is tagged with the requested ``user_id`` and with a
    ``user_found`` flag that is true when the lookup result is not ``None``.

    Returns:
        Whatever ``db.get_user(user_id)`` returned.
    """
    with tracer.start_as_current_span("get_user") as active_span:
        active_span.set_attribute("user_id", user_id)
        record = db.get_user(user_id)
        was_found = record is not None
        active_span.set_attribute("user_found", was_found)
    return record
Practice Problems
- Create a custom exporter for metrics
- Implement distributed tracing across services
- Add custom metrics for business KPIs
- Create alerting rules for critical metrics
- Implement log correlation with traces