Cloud Observability

Difficulty: Senior Level | Companies: AWS, Google, Microsoft, Netflix, Uber

Observability vs Monitoring

Monitoring tells you when something is wrong. Observability helps you understand why. The three pillars are metrics, logs, and traces.

ℹ️

Observability is the ability to understand internal system state from external outputs. Logs, metrics, and traces provide different perspectives on system behavior.

Observability Architecture

Architecture Diagram

┌─────────────────────────────────────────────────────────────┐
│                    Observability Stack                      │
│  ┌──────────┐    ┌──────────┐    ┌──────────┐              │
│  │ Metrics  │    │   Logs   │    │ Traces   │              │
│  │Prometheus│    │  Loki    │    │  Tempo   │              │
│  │  /Cloud  │    │  /Cloud  │    │  /X-Ray  │              │
│  │  Watch   │    │ Watch    │    │          │              │
│  └────┬─────┘    └────┬─────┘    └────┬─────┘              │
│       │               │               │                    │
│       └───────────────┼───────────────┘                    │
│                       │                                    │
│              ┌────────▼────────┐                           │
│              │    Grafana      │                           │
│              │  (Visualization)│                           │
│              └────────┬────────┘                           │
│                       │                                    │
│              ┌────────▼────────┐                           │
│              │    Alerting     │                           │
│              │  (PagerDuty)    │                           │
│              └─────────────────┘                           │
└─────────────────────────────────────────────────────────────┘

Pattern 1: Structured Logging

Emit structured JSON logs for easy querying.

# Structured logging with correlation IDs
import json
import logging
from datetime import datetime
from contextvars import ContextVar

# Per-context correlation ID; empty string until a request handler sets it.
correlation_id: ContextVar[str] = ContextVar('correlation_id', default='')


class StructuredFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    Every entry carries the record's timestamp, level, logger name, rendered
    message, source location and the current ``correlation_id``. Two optional
    keys are added when present:

    * ``data`` - the value passed as ``extra={'extra_data': ...}``.
    * ``exception`` - type, message and formatted traceback of the exception
      attached to the record.
    """

    def format(self, record):
        """Return ``record`` serialized as a JSON string.

        Args:
            record: The ``logging.LogRecord`` to serialize.

        Returns:
            A JSON document describing the record.
        """
        # Function-scope import: only `datetime` itself is imported at file level.
        from datetime import timezone

        # Use the time the record was created (not the time it is formatted)
        # and make the UTC offset explicit so consumers never have to guess.
        created_at = datetime.fromtimestamp(record.created, tz=timezone.utc)

        log_entry = {
            'timestamp': created_at.isoformat(),
            'level': record.levelname,
            'logger': record.name,
            'message': record.getMessage(),
            'module': record.module,
            'function': record.funcName,
            'line': record.lineno,
            'correlation_id': correlation_id.get(),
        }

        # Add extra fields
        if hasattr(record, 'extra_data'):
            log_entry['data'] = record.extra_data

        # Add exception info. `exc_info=True` outside an `except` block yields
        # (None, None, None), which is truthy but carries no exception.
        exc_info = record.exc_info
        if exc_info and exc_info[0] is not None:
            log_entry['exception'] = {
                'type': exc_info[0].__name__,
                'message': str(exc_info[1]),
                'traceback': self.formatException(exc_info),
            }

        # A formatter must not raise on unusual payloads: values JSON cannot
        # encode natively are deliberately downgraded to their str() form.
        return json.dumps(log_entry, default=str)

# Build the handler first: a stream handler that renders records as JSON
handler = logging.StreamHandler()
handler.setFormatter(StructuredFormatter())

# Then attach it to the 'app' logger, which emits INFO and above
logger = logging.getLogger('app')
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Usage with correlation ID
def process_request(request):
    """Handle one request, logging its start, success or failure.

    The correlation ID is read from the ``X-Correlation-ID`` header (empty
    string when absent) and stored in ``correlation_id`` before anything is
    logged.

    Args:
        request: Object exposing ``headers``, ``method``, ``path`` and
            ``user_id``.

    Returns:
        Whatever ``handle_business_logic(request)`` returns.

    Raises:
        Exception: Any error raised while handling the request is logged and
            re-raised unchanged.
    """
    correlation_id.set(request.headers.get('X-Correlation-ID', ''))

    logger.info('Processing request', extra={
        'extra_data': {
            'method': request.method,
            'path': request.path,
            'user_id': request.user_id,
        }
    })

    try:
        result = handle_business_logic(request)
        logger.info('Request completed successfully', extra={
            'extra_data': {'result_id': result.id}
        })
        return result
    except Exception as e:
        # logger.exception logs at ERROR level *and* attaches the active
        # exception, so the traceback reaches the log output.
        logger.exception('Request failed', extra={
            'extra_data': {'error': str(e)}
        })
        raise

Pattern 2: Custom Metrics with CloudWatch

Emit custom metrics for business and application monitoring.

# Custom CloudWatch metrics
import boto3
import time
from functools import wraps

# Module-level CloudWatch client, created once when this module is imported;
# MetricsCollector publishes its metric data through it.
cloudwatch = boto3.client('cloudwatch')

class MetricsCollector:
    """Publish custom CloudWatch metrics under a single namespace."""

    def __init__(self, namespace: str, client=None):
        """Create a collector.

        Args:
            namespace: CloudWatch namespace every metric is published under.
            client: Optional object exposing ``put_metric_data(**kwargs)``.
                When omitted, the module-level ``cloudwatch`` client is used.
        """
        self.namespace = namespace
        self._client = client

    def put_metric(self, name: str, value: float, unit: str = 'Count', dimensions: dict = None):
        """Publish one data point.

        Args:
            name: Metric name.
            value: Metric value.
            unit: CloudWatch unit name.
            dimensions: Optional mapping of dimension name to value. Names and
                values are converted with ``str()`` because CloudWatch only
                accepts string dimensions.
        """
        metric_data = {
            'MetricName': name,
            'Value': value,
            'Unit': unit,
            'Timestamp': int(time.time()),  # epoch seconds
        }

        if dimensions:
            metric_data['Dimensions'] = [
                {'Name': str(k), 'Value': str(v)} for k, v in dimensions.items()
            ]

        # getattr keeps subclasses that override __init__ without calling
        # super().__init__() working; the global is only looked up on demand.
        client = getattr(self, '_client', None)
        if client is None:
            client = cloudwatch

        client.put_metric_data(
            Namespace=self.namespace,
            MetricData=[metric_data],
        )

    def track_latency(self, operation: str):
        """Decorator to track operation latency.

        Emits ``<operation>.Latency`` in milliseconds with a ``Status``
        dimension of ``Success`` or ``Error``; failures additionally emit
        ``<operation>.Errors`` with value 1 before the exception is re-raised.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                # perf_counter is monotonic, so the measured duration cannot go
                # negative or jump when the system clock is adjusted.
                start = time.perf_counter()
                try:
                    result = func(*args, **kwargs)
                except Exception:
                    self.put_metric(
                        f'{operation}.Latency',
                        (time.perf_counter() - start) * 1000,
                        'Milliseconds',
                        {'Status': 'Error'},
                    )
                    self.put_metric(f'{operation}.Errors', 1, 'Count')
                    raise

                # Emitted outside the try block: a failure to publish the
                # metric must not be recorded as a failure of the operation.
                self.put_metric(
                    f'{operation}.Latency',
                    (time.perf_counter() - start) * 1000,
                    'Milliseconds',
                    {'Status': 'Success'},
                )
                return result
            return wrapper
        return decorator

    def track_business_metric(self, name: str, value: float, **dimensions):
        """Track business-specific metrics.

        Publishes ``Business.<name>`` with unit ``'None'``; keyword arguments
        become the metric's dimensions.
        """
        self.put_metric(f'Business.{name}', value, 'None', dimensions)

# Usage
metrics = MetricsCollector('MyApp/Production')

@metrics.track_latency('OrderProcessing')
def process_order(order):
    """Process ``order`` and record its value as a business metric.

    Args:
        order: Object exposing ``total`` and ``region``.

    Returns:
        The processed order.
    """
    # Business logic
    metrics.track_business_metric('OrderValue', order.total,
                                   Region=order.region)
    # Hand the order back to the caller; there is no separate result object.
    return order

ℹ️

Use high-resolution metrics (1-second intervals, enabled per data point by setting `StorageResolution=1` in `put_metric_data`) for latency tracking. Standard resolution (1-minute, the default) is sufficient for most other metrics.

Pattern 3: Distributed Tracing

Trace requests across service boundaries.

// OpenTelemetry tracing setup
import { context, SpanKind, SpanStatusCode, trace } from '@opentelemetry/api';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { Resource } from '@opentelemetry/resources';
import { BatchSpanProcessor, NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';

// Identity attached to the provider: service name, version and environment
const resource = new Resource({
  [SemanticResourceAttributes.SERVICE_NAME]: 'order-service',
  [SemanticResourceAttributes.SERVICE_VERSION]: '2.0.0',
  [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: 'production',
});

// Tracer provider for this Node.js process
const provider = new NodeTracerProvider({ resource });

// Spans are exported over OTLP/HTTP to the collector endpoint below
const exporter = new OTLPTraceExporter({
  url: 'http://otel-collector:4318/v1/traces',
});

// Hand spans to the exporter in batches, then register the provider
provider.addSpanProcessor(new BatchSpanProcessor(exporter));
provider.register();

// Trace function execution
/**
 * Runs `fn` inside a new active span and returns its result.
 *
 * On success the span status is set to OK. On failure the span status is set
 * to ERROR with the failure message, the exception is recorded on the span,
 * and the original error is re-thrown. The span is always ended.
 *
 * @param name - Span name.
 * @param fn - Async operation to execute within the span.
 * @param attributes - Optional attributes set on the span before `fn` runs.
 * @returns The value `fn` resolves to.
 * @throws Whatever `fn` throws, unchanged.
 */
export async function tracedOperation<T>(
  name: string,
  fn: () => Promise<T>,
  attributes?: Record<string, string>,
): Promise<T> {
  const tracer = trace.getTracer('order-service');

  return tracer.startActiveSpan(name, async (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes);
      }

      const result = await fn();
      // Use the enum: the numeric value 0 is UNSET, not OK.
      span.setStatus({ code: SpanStatusCode.OK });
      return result;
    } catch (error) {
      // Anything can be thrown in JavaScript, so narrow before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      span.setStatus({ code: SpanStatusCode.ERROR, message });
      span.recordException(error instanceof Error ? error : message);
      throw error;
    } finally {
      span.end();
    }
  });
}

// Usage
/**
 * Loads an order, reserves its inventory and takes payment, all inside a
 * single 'process-order' span tagged with the order id.
 */
async function processOrder(orderId: string) {
  const runOrderWorkflow = async () => {
    // Add custom attributes
    trace.getActiveSpan()?.setAttribute('order.id', orderId);

    const order = await getOrder(orderId);
    trace.getActiveSpan()?.setAttribute('order.total', order.total);

    // Inventory is reserved before payment is processed
    await reserveInventory(order);
    await processPayment(order);

    return order;
  };

  return tracedOperation('process-order', runOrderWorkflow, { 'order.id': orderId });
}

Pattern 4: Alerting Strategy

Define meaningful alerts with proper thresholds.

# Prometheus alerting rules
# One rule group with four alerts. Each `expr` must stay true for the whole
# `for` duration before the alert fires; `severity` is attached as a label.
groups:
  - name: application-alerts
    rules:
      # Share of requests answered with a 5xx status, measured over 5-minute
      # rates across all series, above 5%.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"
      
      # 99th-percentile request duration, estimated from the histogram buckets
      # aggregated by `le`, above 1 second.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.99, 
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "P99 latency is {{ $value }}s"
      
      # Available bytes below 10% of the filesystem size, evaluated per series.
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"
      
      # Container restart counter increasing over the last 15 minutes.
      - alert: PodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pod is crash looping"
          description: "Pod {{ $labels.pod }} is restarting frequently"
⚠️

Alert fatigue is real. Only alert on conditions that require human action. Suppress alerts during known maintenance windows.

Pattern 5: SLI/SLO Monitoring

Track service level indicators and objectives.

# SLI/SLO calculation
from dataclasses import dataclass
from typing import List
import datetime

@dataclass
class SLIResult:
    """Outcome of one SLI evaluation."""

    name: str  # SLI identifier, e.g. 'availability' or 'latency'
    value: float  # measured SLI value
    target: float  # SLO target the value is compared against
    error_budget_remaining: float  # unspent error budget (minutes for availability)


class SLOMonitor:
    """Compute SLIs by running PromQL queries through a Prometheus client."""

    def __init__(self, prometheus_client):
        """Store the client.

        Args:
            prometheus_client: Object exposing ``query(promql)``; the result is
                used as a number (assumed to be a float - confirm against the
                client in use).
        """
        self.prom = prometheus_client

    @staticmethod
    def _escape_label_value(value: str) -> str:
        """Escape ``value`` for use inside a double-quoted PromQL label matcher."""
        return value.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')

    def calculate_availability_sli(
        self, service: str, window_days: int = 30, target: float = 0.999
    ) -> SLIResult:
        """Calculate availability SLI.

        Availability is ``1 - (5xx request rate / total request rate)`` over
        the window.

        Args:
            service: Value of the ``service`` label to select.
            window_days: Length of the evaluation window in days.
            target: Availability SLO as a fraction (default 99.9%).

        Returns:
            ``SLIResult`` whose ``error_budget_remaining`` is expressed in
            minutes; it is negative once the budget is overspent.
        """
        label = self._escape_label_value(service)
        query = f"""
            1 - (
                sum(rate(http_requests_total{{service="{label}",status=~"5.."}}[{window_days}d]))
                /
                sum(rate(http_requests_total{{service="{label}"}}[{window_days}d]))
            )
        """

        availability = self.prom.query(query)

        window_minutes = window_days * 24 * 60
        error_budget = (1 - target) * window_minutes
        error_budget_used = (1 - availability) * window_minutes

        return SLIResult(
            name='availability',
            value=availability,
            target=target,
            error_budget_remaining=error_budget - error_budget_used,
        )

    def calculate_latency_sli(
        self, service: str, threshold_seconds: float = 1.0, target: float = 0.99
    ) -> SLIResult:
        """Calculate latency SLI (P99).

        The 5-minute P99 latency is compared with ``threshold_seconds``. The
        score is 1.0 while P99 is at or below the threshold and falls linearly
        to 0.0 as P99 approaches twice the threshold.

        NOTE(review): the score is a heuristic derived from P99, not a measured
        share of requests within the threshold, so compare it with ``target``
        with care.

        Args:
            service: Value of the ``service`` label to select.
            threshold_seconds: P99 latency objective in seconds; must be > 0.
            target: Latency SLO as a fraction (default 99%).

        Returns:
            ``SLIResult`` with ``error_budget_remaining`` fixed at 0.0; no
            latency error budget is computed here.

        Raises:
            ValueError: If ``threshold_seconds`` is not positive.
        """
        if threshold_seconds <= 0:
            raise ValueError('threshold_seconds must be positive')

        label = self._escape_label_value(service)
        query = f"""
            histogram_quantile(0.99,
                sum(rate(http_request_duration_seconds_bucket{{service="{label}"}}[5m])) by (le)
            )
        """

        p99_latency = self.prom.query(query)

        if p99_latency > threshold_seconds:
            overshoot = (p99_latency - threshold_seconds) / threshold_seconds
            sli_value = max(0.0, 1 - overshoot)
        else:
            sli_value = 1.0

        return SLIResult(
            name='latency',
            value=sli_value,
            target=target,
            error_budget_remaining=0.0,
        )

Observability Checklist

Structured Logging - JSON format with correlation IDs
Custom Metrics - Business and application metrics
Distributed Tracing - Trace requests across services
Alerting - Actionable alerts with proper thresholds
SLI/SLO - Track reliability objectives
Dashboards - Visualize system health
Runbooks - Document response procedures

Follow-Up Questions

How do you implement observability for serverless applications with short-lived functions?
What strategies would you use to reduce alert noise while maintaining visibility?
How do you correlate metrics, logs, and traces for a single request?