Real-World Data Pipeline Patterns

Architecture Diagram

Formal Definitions

Detailed Explanation

ETL vs ELT: Which Pattern to Choose?

Factor	ETL	ELT
Data Volume	Small (<1GB)	Large (>1GB)
Transformation	In-memory	In warehouse
Latency	Minutes	Seconds to minutes
Cost	Compute-intensive	Storage-intensive
Flexibility	Lower	Higher

Rule of Thumb: Use ETL for small datasets where you need complex transformations. Use ELT when your warehouse can handle the transformation load efficiently.

Pipeline Design Principles

Idempotency: Every pipeline run should produce the same result given the same input
Atomicity: Each task should do one thing well and either complete fully or leave no partial results behind
Observability: Include logging, metrics, and alerting at every stage
Resilience: Design for failure with retries, checkpoints, and rollback capabilities
Data Quality: Validate data at every stage boundary

ETL Pipeline Pattern

from airflow.decorators import task, dag
from datetime import datetime, timedelta
from typing import Dict, Any

@dag(
    schedule_interval="0 6 * * *",
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['etl', 'production'],
    default_args={
        'retries': 3,
        'retry_delay': timedelta(minutes=5),
    },
)
def etl_pipeline():
    """Daily ETL DAG: extract orders, transform, validate, then load.

    Task flow: extract_orders -> transform_orders -> validate_data ->
    load_to_warehouse. Intermediate results are passed between tasks as
    task return values (lists of dicts).
    """

    def categorize_order(amount: float) -> str:
        """Return the order category for ``amount``.

        ``>= 1000`` is 'enterprise', ``>= 100`` is 'business', anything
        lower is 'consumer'.
        """
        if amount >= 1000:
            return 'enterprise'
        if amount >= 100:
            return 'business'
        return 'consumer'

    @task
    def extract_orders() -> list:
        """Extract the orders of the run's data interval from the source DB.

        Returns:
            One dict per order row with the keys order_id, customer_id,
            order_date and total_amount.
        """
        from airflow.operators.python import get_current_context
        from airflow.providers.postgres.hooks.postgres import PostgresHook

        # Jinja placeholders such as '{{ ds }}' are NOT rendered inside the
        # body of a @task function, so the window bounds are read from the
        # runtime context instead and bound as query parameters.
        context = get_current_context()
        window_start = context['data_interval_start'].strftime('%Y-%m-%d')
        window_end = context['data_interval_end'].strftime('%Y-%m-%d')

        hook = PostgresHook(postgres_conn_id='source_db')

        query = """
            SELECT order_id, customer_id, order_date, total_amount
            FROM orders
            WHERE order_date >= %(window_start)s
            AND order_date < %(window_end)s
        """

        df = hook.get_pandas_df(
            query,
            parameters={'window_start': window_start, 'window_end': window_end},
        )
        # NOTE(review): order_date / total_amount keep whatever types pandas
        # produced; confirm they serialize with the configured XCom backend.
        return df.to_dict('records')

    @task
    def transform_orders(orders: list) -> list:
        """Apply business rules to the extracted orders.

        Casts total_amount to float, derives order_category from it and
        stamps each record with the processing time.
        """
        transformed = []

        for order in orders:
            amount = float(order['total_amount'])
            transformed.append({
                'order_id': order['order_id'],
                'customer_id': order['customer_id'],
                'order_date': order['order_date'],
                'total_amount': amount,
                'order_category': categorize_order(amount),
                'processed_at': datetime.now().isoformat(),
            })

        return transformed

    @task
    def validate_data(data: list) -> dict:
        """Run data-quality checks on the transformed records.

        Returns:
            A summary dict with 'valid', 'total_records', 'error_count'
            and the first 10 error messages under 'errors'.
        """
        errors = []

        for record in data:
            if not record.get('order_id'):
                errors.append(f"Missing order_id: {record}")
            # `or 0` keeps a missing/None amount from raising TypeError in
            # the comparison; such a record is simply not flagged as negative.
            if (record.get('total_amount') or 0) < 0:
                errors.append(f"Negative amount: {record}")

        return {
            'valid': not errors,
            'total_records': len(data),
            'error_count': len(errors),
            'errors': errors[:10],
        }

    @task
    def load_to_warehouse(data: list, validation: dict) -> int:
        """Load validated records into the fact_orders table.

        Returns:
            The number of rows inserted.

        Raises:
            ValueError: if the validation summary is not marked valid.
        """
        if not validation['valid']:
            raise ValueError(f"Data validation failed: {validation['errors']}")

        if not data:
            # Nothing to load; skip opening a warehouse connection.
            return 0

        from airflow.providers.postgres.hooks.postgres import PostgresHook

        hook = PostgresHook(postgres_conn_id='warehouse_db')

        rows = [
            (d['order_id'], d['customer_id'], d['order_date'],
             d['total_amount'], d['order_category'], d['processed_at'])
            for d in data
        ]

        # commit_every=0 inserts all rows in one transaction, so a failure
        # part-way through leaves nothing behind for the retry to duplicate.
        # NOTE(review): a manual re-run of an already loaded day still
        # inserts the same orders again; consider an upsert if fact_orders
        # has a unique key on order_id.
        hook.insert_rows(
            table='fact_orders',
            rows=rows,
            target_fields=['order_id', 'customer_id', 'order_date',
                          'total_amount', 'order_category', 'processed_at'],
            commit_every=0,
        )

        return len(rows)

    # Define pipeline: passing return values wires the task dependencies.
    orders = extract_orders()
    transformed = transform_orders(orders)
    validation = validate_data(transformed)
    load_to_warehouse(transformed, validation)


etl_pipeline()

ELT Pipeline Pattern

from airflow.decorators import task, dag
from datetime import datetime

@dag(
    schedule_interval="0 2 * * *",
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['elt', 'dbt', 'production'],
)
def elt_pipeline():
    """Daily ELT DAG: stage raw events in S3, then transform and test with dbt.

    Task flow: extract_to_staging -> run_dbt_models -> run_dbt_tests ->
    refresh_dashboards.
    """

    def _run_dbt(args: list, failure_label: str) -> str:
        """Run ``dbt <args>`` in the dbt project directory and return stdout.

        Raises:
            RuntimeError: if dbt exits with a non-zero return code. The
                message carries stderr, falling back to stdout when stderr
                is empty.
        """
        import subprocess

        result = subprocess.run(
            ['dbt', *args],
            capture_output=True,
            text=True,
            cwd='/opt/dbt/project',
        )

        if result.returncode != 0:
            # NOTE(review): dbt appears to log most errors to stdout, so
            # stderr alone can yield an empty failure message.
            raise RuntimeError(
                f"{failure_label}: {result.stderr or result.stdout}"
            )

        return result.stdout

    @task
    def extract_to_staging():
        """Copy the raw_events table to the S3 staging area as Parquet.

        Returns:
            A dict with the number of staged rows and a 'staged' status.
        """
        import os
        import tempfile

        from airflow.providers.amazon.aws.hooks.s3 import S3Hook
        from airflow.providers.postgres.hooks.postgres import PostgresHook

        # Extract from source
        source_hook = PostgresHook(postgres_conn_id='source_db')
        df = source_hook.get_pandas_df("SELECT * FROM raw_events")

        # Load to S3 (staging)
        s3_hook = S3Hook(aws_conn_id='aws_default')

        # A per-run temporary directory avoids collisions between concurrent
        # runs on the same worker and removes the file after the upload.
        with tempfile.TemporaryDirectory() as tmp_dir:
            local_path = os.path.join(tmp_dir, 'staging_events.parquet')
            df.to_parquet(local_path, index=False)

            # NOTE(review): the key is derived from wall-clock time, so a
            # late or re-run task writes under the current date rather than
            # the run's logical date.
            s3_hook.load_file(
                filename=local_path,
                key=f'staging/events/{datetime.now().strftime("%Y/%m/%d")}/events.parquet',
                bucket_name='data-lake',
                replace=True,
            )

        return {'rows': len(df), 'status': 'staged'}

    @task
    def run_dbt_models():
        """Run the dbt staging and marts models in the warehouse."""
        output = _run_dbt(
            ['run', '--models', 'staging.*', 'marts.*'],
            "dbt failed",
        )
        return {'status': 'transformed', 'output': output}

    @task
    def run_dbt_tests():
        """Run the dbt data quality tests."""
        _run_dbt(['test'], "dbt tests failed")
        return {'status': 'tests_passed'}

    @task
    def refresh_dashboards():
        """Refresh BI dashboards after transformation (placeholder)."""
        # Trigger dashboard refresh
        print("Refreshing dashboards...")
        return {'status': 'refreshed'}

    # Define pipeline: no data is passed between tasks, so the order is
    # declared explicitly.
    staged = extract_to_staging()
    transformed = run_dbt_models()
    tested = run_dbt_tests()
    refreshed = refresh_dashboards()

    staged >> transformed >> tested >> refreshed


elt_pipeline()

Incremental Loading Pattern

from airflow.decorators import task, dag
from datetime import datetime, timedelta
from typing import Optional

@dag(
    schedule_interval="@hourly",
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=['incremental', 'production'],
)
def incremental_pipeline():
    """Hourly incremental load of the events table, driven by a checkpoint.

    Task flow: get_last_checkpoint -> extract_incremental ->
    transform_incremental -> upsert_to_warehouse -> update_checkpoint.
    The checkpoint is stored in the Airflow Variable
    'incremental_checkpoint' and only advances after a successful load.
    """

    @task
    def get_last_checkpoint() -> Optional[str]:
        """Return the stored checkpoint, or None if none has been set."""
        from airflow.models import Variable

        return Variable.get('incremental_checkpoint', default_var=None)

    @task
    def extract_incremental(last_checkpoint: Optional[str]) -> list:
        """Extract up to 10000 events updated after ``last_checkpoint``.

        With no checkpoint, the first 10000 events by updated_at are read.

        Returns:
            One dict per source row (all columns of events).
        """
        from airflow.providers.postgres.hooks.postgres import PostgresHook

        hook = PostgresHook(postgres_conn_id='source_db')

        if last_checkpoint:
            # Bind the checkpoint as a parameter rather than formatting it
            # into the SQL text.
            # NOTE(review): with a strict '>' and LIMIT, rows sharing the
            # last updated_at of a full batch may be cut off and then
            # skipped by the next run; verify against the source data.
            query = """
                SELECT * FROM events
                WHERE updated_at > %(checkpoint)s
                ORDER BY updated_at
                LIMIT 10000
            """
            parameters = {'checkpoint': last_checkpoint}
        else:
            query = "SELECT * FROM events ORDER BY updated_at LIMIT 10000"
            parameters = None

        df = hook.get_pandas_df(query, parameters=parameters)
        return df.to_dict('records')

    @task
    def transform_incremental(records: list) -> list:
        """Normalize extracted events for the warehouse.

        Lower-cases event_type, keeps id, payload and event_timestamp, and
        stamps each record with the processing time.
        """
        return [
            {
                'id': record['id'],
                'event_type': record['event_type'].lower(),
                'payload': record['payload'],
                'event_timestamp': record['event_timestamp'],
                'processed_at': datetime.now().isoformat(),
            }
            for record in records
        ]

    @task
    def upsert_to_warehouse(records: list) -> int:
        """Upsert records into the warehouse events table, keyed on id.

        Returns:
            The number of rows sent to the warehouse.
        """
        if not records:
            # Nothing to load; skip opening a warehouse connection.
            return 0

        from airflow.providers.postgres.hooks.postgres import PostgresHook

        hook = PostgresHook(postgres_conn_id='warehouse_db')

        rows = [
            (r['id'], r['event_type'], r['payload'],
             r['event_timestamp'], r['processed_at'])
            for r in records
        ]

        # replace=True makes the hook emit INSERT ... ON CONFLICT (id)
        # DO UPDATE, so re-extracted rows update in place instead of being
        # inserted twice.
        # NOTE(review): assumes events has a primary key or unique
        # constraint on id - confirm against the warehouse schema.
        hook.insert_rows(
            table='events',
            rows=rows,
            target_fields=['id', 'event_type', 'payload',
                          'event_timestamp', 'processed_at'],
            replace=True,
            replace_index='id',
        )

        return len(rows)

    @task
    def update_checkpoint(records: list):
        """Advance the checkpoint to the latest updated_at in ``records``.

        Does nothing when ``records`` is empty.
        """
        from airflow.models import Variable

        if records:
            # Extraction filters on updated_at, so the checkpoint must
            # track the same column (not event_timestamp).
            latest_updated_at = max(r['updated_at'] for r in records)
            Variable.set('incremental_checkpoint', str(latest_updated_at))

    # Define pipeline
    last_checkpoint = get_last_checkpoint()
    records = extract_incremental(last_checkpoint)
    transformed = transform_incremental(records)
    loaded = upsert_to_warehouse(transformed)
    checkpoint_updated = update_checkpoint(records)

    # The checkpoint may only move once the load has succeeded; otherwise a
    # failed load would cause those records to be skipped on the next run.
    loaded >> checkpoint_updated


incremental_pipeline()

Real-World Data Pipeline Patterns in Apache Airflow

Real-World Data Pipeline Patterns

Architecture Diagram

Formal Definitions

Detailed Explanation

ETL vs ELT: Which Pattern to Choose?

Pipeline Design Principles

ETL Pipeline Pattern

ELT Pipeline Pattern

Incremental Loading Pattern

Need Expert Airflow Help?