Connection Management and Hooks

Architecture Diagram

Formal Definitions

Detailed Explanation

Creating Connections

Connections can be created through the Airflow UI, CLI, REST API, or environment variables.

Creation Methods:

Method	Command/Location
CLI	`airflow connections add 'my_conn' --conn-type ...`
Environment Variable	`AIRFLOW_CONN_MY_CONN='...'`
UI	Admin → Connections → Add
Programmatic	`Connection(conn_id=..., conn_type=...)`

Using Hooks

Hooks provide a consistent interface for external system interaction.

from airflow.providers.postgres.hooks.postgres import PostgresHook

# Bind the hook to the Airflow connection registered under conn_id 'postgres_default'.
hook = PostgresHook(postgres_conn_id='postgres_default')
# Run a read-only query; the result is kept in `df` (presumably a pandas
# DataFrame, going by the method name — confirm against the provider docs).
df = hook.get_pandas_df("SELECT * FROM orders LIMIT 100")
# Insert a single row, passed as one tuple in the `rows` list, into 'staging_orders'.
hook.insert_rows(table='staging_orders', rows=[(1, 'order_1', 100.00)])

Tip: Use connection pools to reduce overhead by 60-80%.

Key Concepts Table

Connection Type	Conn Type	Hook Class	Key Parameters
PostgreSQL	`postgres`	`PostgresHook`	host, login, password, schema, port
MySQL	`mysql`	`MySqlHook`	host, login, password, schema, port
S3	`aws`	`S3Hook`	login (key), password (secret), extra (region)
GCS	`google_cloud_platform`	`GCSHook`	extra (keyfile, project)
HTTP	`http`	`HttpHook`	host, schema, extra (headers)

Key Concepts Table

Connection Type	Conn Type	Hook Class	Key Parameters
PostgreSQL	`postgres`	`PostgresHook`	host, login, password, schema, port
MySQL	`mysql`	`MySqlHook`	host, login, password, schema, port
S3	`aws`	`S3Hook`	login (key), password (secret), extra (region)
GCS	`google_cloud_platform`	`GCSHook`	extra (keyfile, project)
HTTP	`http`	`HttpHook`	host, schema, extra (headers)
Spark	`spark`	`SparkSubmitHook`	host, extra (master, deploy-mode)
Redis	`redis`	`RedisHook`	host, login, password, port, db

Code Examples

Custom Hook Implementation

# custom_hook.py
from airflow.hooks.base import BaseHook
from airflow.exceptions import AirflowException
import requests
import json

class CustomAPIHook(BaseHook):
    """Custom hook for internal API services.

    Resolves an Airflow connection into a base URL plus a set of default
    request headers, and exposes thin ``get``/``post`` helpers that return the
    decoded JSON body of the response.

    Connection fields used:
        * ``schema`` -- URL scheme; falls back to ``https`` when empty.
        * ``host`` / ``port`` -- target host and optional port.
        * ``login`` / ``password`` -- when both are set, ``password`` is sent
          as a bearer token in the ``Authorization`` header.
        * ``extra`` -- JSON string (or dict); its ``headers`` mapping, if any,
          is merged into the default headers.

    Args:
        custom_api_conn_id: Connection id to use; ``default_conn_name`` is
            used when omitted.
        timeout: Per-request timeout in seconds passed to ``requests``.
    """

    conn_name_attr = 'custom_api_conn_id'
    default_conn_name = 'custom_api_default'
    conn_type = 'custom_api'
    hook_name = 'Custom API'

    def __init__(self, custom_api_conn_id=None, timeout=30):
        # Let the base class run its own initialisation before adding state.
        super().__init__()
        self.conn_id = custom_api_conn_id or self.default_conn_name
        # Also expose the id under the attribute named by ``conn_name_attr`` so
        # the class-level declaration and the instance stay consistent.
        self.custom_api_conn_id = self.conn_id
        self.timeout = timeout
        self.base_url = None
        self.headers = {}
        self._get_connection()

    def _get_connection(self):
        """Populate ``base_url`` and ``headers`` from the Airflow connection.

        Raises:
            json.JSONDecodeError: If ``extra`` is a string that is not valid JSON.
        """
        conn = self.get_connection(self.conn_id)
        schema = conn.schema or 'https'
        self.base_url = f"{schema}://{conn.host}"
        if conn.port:
            self.base_url += f":{conn.port}"

        # Require the password as well as the login: the token itself comes
        # from ``password``, and gating on ``login`` alone would send the
        # literal header "Bearer None" when no password is stored.
        if conn.login and conn.password:
            self.headers['Authorization'] = f"Bearer {conn.password}"

        extra = conn.extra
        if extra:
            if isinstance(extra, str):
                extra = json.loads(extra)
            # ``or {}`` tolerates an explicit ``"headers": null`` in the extra.
            self.headers.update(extra.get('headers') or {})

    def _build_url(self, endpoint):
        """Join ``base_url`` and ``endpoint``, adding the separating slash if missing."""
        if not endpoint or endpoint.startswith('/'):
            return f"{self.base_url}{endpoint}"
        return f"{self.base_url}/{endpoint}"

    def _request(self, method, endpoint, **kwargs):
        """Send one HTTP request and return the decoded JSON body.

        Args:
            method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
            endpoint: Path appended to ``base_url``.
            **kwargs: Extra keyword arguments forwarded to ``requests.request``
                (``params``, ``json``, ...).

        Raises:
            AirflowException: On any ``requests`` failure, including non-2xx
                status codes; the original exception is chained as the cause.
        """
        url = self._build_url(endpoint)
        try:
            response = requests.request(
                method,
                url,
                headers=self.headers,
                timeout=self.timeout,
                **kwargs,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            # Chain the cause so the underlying HTTP/network error stays visible.
            raise AirflowException(f"API request failed: {e}") from e

    def get(self, endpoint, params=None):
        """Make GET request and return the decoded JSON response."""
        return self._request("GET", endpoint, params=params)

    def post(self, endpoint, data=None):
        """Make POST request with ``data`` sent as a JSON body; return the decoded JSON response."""
        return self._request("POST", endpoint, json=data)

# Usage in DAG
from airflow.decorators import task, dag
from datetime import datetime

# Daily DAG with two tasks: fetch a payload through CustomAPIHook, then report
# how many records it contains.
# NOTE(review): `schedule_interval` is the older spelling of this argument; newer
# Airflow releases prefer `schedule` — confirm against the deployed version.
@dag(schedule_interval="@daily", start_date=datetime(2024, 1, 1))
def custom_api_dag():
    
    @task
    def fetch_data():
        # Uses the 'my_api' connection; the return value of hook.get() is what
        # gets handed to the downstream task.
        hook = CustomAPIHook(custom_api_conn_id='my_api')
        return hook.get('/api/v1/data', params={'limit': 100})
    
    @task
    def process_data(data: dict):
        # Counts the entries under the 'results' key; a missing key counts as 0.
        print(f"Processing {len(data.get('results', []))} records")
    
    # Passing fetch_data's output into process_data also orders the two tasks.
    data = fetch_data()
    process_data(data)

# Calling the decorated function is what instantiates the DAG at import time.
custom_api_dag()

Connection Pool Configuration

# connection_pool_config.py
from airflow import settings
from sqlalchemy.pool import QueuePool
from sqlalchemy import create_engine

def configure_connection_pool():
    """Report the current state of the connection pool behind ``settings.engine``.

    Despite its name, this function changes no configuration: it reads
    ``settings.engine.pool``, prints the pool type and its counters, and
    returns the counters.

    Each counter is read exactly once, so the printed values and the returned
    values always agree even while the pool is in use. Pool classes that do
    not expose a given counter (for example SQLAlchemy's ``NullPool``) report
    ``None`` for it instead of raising ``AttributeError``.

    Returns:
        dict: ``pool_size``, ``checked_in``, ``checked_out`` and ``overflow``,
        each an int or ``None`` when the pool class has no such counter.
    """
    engine = settings.engine
    pool = engine.pool

    def _read(counter_name):
        # Not every pool implementation provides size()/checkedin()/etc.
        reader = getattr(pool, counter_name, None)
        return reader() if callable(reader) else None

    # Snapshot all counters up front so print output and return value match.
    stats = {
        'pool_size': _read('size'),
        'checked_in': _read('checkedin'),
        'checked_out': _read('checkedout'),
        'overflow': _read('overflow'),
    }

    # Check current pool status
    print(f"Pool type: {type(pool).__name__}")
    print(f"Pool size: {stats['pool_size']}")
    print(f"Checked in: {stats['checked_in']}")
    print(f"Checked out: {stats['checked_out']}")
    print(f"Overflow: {stats['overflow']}")

    return stats

def optimize_pool_for_workload(
    avg_concurrent_tasks: int,
    peak_concurrent_tasks: int,
    task_duration_seconds: int,
):
    """Derive connection-pool settings from a workload profile.

    Args:
        avg_concurrent_tasks: Typical number of tasks running at once.
        peak_concurrent_tasks: Highest number of tasks running at once.
        task_duration_seconds: Typical duration of one task, in seconds.

    Returns:
        dict with the keys ``pool_size``, ``max_overflow``, ``pool_timeout``,
        ``pool_recycle`` and ``pool_pre_ping``.
    """
    # Steady-state connections cover the average load, never fewer than 5.
    recommendation = {'pool_size': max(5, avg_concurrent_tasks)}

    # Burst connections cover the gap up to the peak, never fewer than 10.
    burst_gap = peak_concurrent_tasks - recommendation['pool_size']
    recommendation['max_overflow'] = max(10, burst_gap)

    # Wait up to twice a typical task's duration for a free connection.
    recommendation['pool_timeout'] = task_duration_seconds * 2

    # Recycle connections after 30 minutes so stale ones are not reused,
    # and ping each connection before handing it out.
    recommendation['pool_recycle'] = 1800
    recommendation['pool_pre_ping'] = True

    return recommendation

# Example: 50 avg tasks, 100 peak, 60s duration.
# With optimize_pool_for_workload as defined in this file, this yields
# pool_size=50, max_overflow=50 and pool_timeout=120, which are the values
# shown in the airflow.cfg excerpt below.
config = optimize_pool_for_workload(
    avg_concurrent_tasks=50,
    peak_concurrent_tasks=100,
    task_duration_seconds=60,
)
print(f"Recommended pool config: {config}")

# Apply to airflow.cfg
# [database]
# sql_alchemy_pool_size = 50
# sql_alchemy_max_overflow = 50
# sql_alchemy_pool_timeout = 120
# sql_alchemy_pool_recycle = 1800
# sql_alchemy_pool_pre_ping = True

Secrets Backend Integration

# secrets_backend.py
from airflow.providers.amazon.aws.secrets.secrets_manager import SecretsManagerBackend
from airflow.providers.hashicorp.secrets.vault import VaultBackend
from airflow.providers.google.cloud.secrets.secret_manager import CloudSecretManagerBackend

# AWS Secrets Manager configuration.
# NOTE(review): the keyword names used here (conn_id, region_name, sep) should be
# checked against the installed provider's SecretsManagerBackend signature —
# TODO confirm. Backends are normally enabled through the [secrets] section
# of airflow.cfg rather than instantiated by hand.
aws_secrets_backend = SecretsManagerBackend(
    conn_id='aws_default',
    region_name='us-east-1',
    sep='/',
)

# HashiCorp Vault configuration.
# NOTE(review): confirm that conn_id and secret_path are accepted by the installed
# VaultBackend; mount_point='secret' presumably names the Vault secrets-engine
# mount — verify against the provider docs.
vault_backend = VaultBackend(
    conn_id='vault_default',
    secret_path='airflow',
    mount_point='secret',
)

# Google Secret Manager configuration.
# NOTE(review): confirm that conn_id and secret_path are accepted by the installed
# CloudSecretManagerBackend; sep='-' presumably joins prefix and secret name —
# verify against the provider docs.
gcp_backend = CloudSecretManagerBackend(
    conn_id='google_cloud_default',
    project_id='my-project',
    secret_path='airflow',
    sep='-',
)

# Configure in airflow.cfg
# [secrets]
# backend = airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend
# backend_kwargs = {"connections_prefix": "airflow/connections/", "variables_prefix": "airflow/variables/"}

# Usage in DAG - hooks automatically resolve from secrets backend
from airflow.decorators import task, dag
from datetime import datetime

# Daily DAG with two tasks run in sequence: one resolves a connection, the
# other a variable. Neither task names a secrets backend explicitly.
# NOTE(review): `schedule_interval` is the older spelling of this argument; newer
# Airflow releases prefer `schedule` — confirm against the deployed version.
@dag(schedule_interval="@daily", start_date=datetime(2024, 1, 1))
def secrets_backend_dag():
    
    @task
    def use_secret_connection():
        """Connection is automatically retrieved from secrets backend."""
        # Imported inside the task so the provider is only loaded when the task runs.
        from airflow.providers.postgres.hooks.postgres import PostgresHook
        
        # This will look up 'production_db' in the configured secrets backend
        hook = PostgresHook(postgres_conn_id='production_db')
        result = hook.get_first("SELECT NOW()")
        # Return only the first column of the fetched row.
        return result[0]
    
    @task
    def use_secret_variable():
        """Variable is automatically retrieved from secrets backend."""
        from airflow.models import Variable
        
        api_key = Variable.get("api_key")
        # NOTE(review): this prints the first four characters of the secret to
        # the task log — confirm that exposing a prefix is acceptable.
        print(f"Using API key: {api_key[:4]}...")
    
    # `>>` makes the variable task run after the connection task.
    use_secret_connection() >> use_secret_variable()

# Calling the decorated function is what instantiates the DAG at import time.
secrets_backend_dag()

Performance Metrics

Connection Types Comparison

Aspect	Metadata DB	Environment Variables	Secrets Manager
Security	Medium	High	Very High
Performance	Fast (DB lookup)	Fast (OS lookup)	Medium (API call)
Audit Trail	Yes	No	Yes
Rotation	Manual	Manual	Automated
Cost	Free	Free	Low-Medium
Complexity	Low	Low	Medium
Multi-environment	Difficult	Easy	Easy

Connection Pool Metrics

Metric	Recommended	Warning	Critical
Pool Utilization	< 70%	70-90%	> 90%
Connection Wait Time	< 1s	1-5s	> 5s
Connection Errors	< 1%	1-5%	> 5%
Pool Overflow	< 50%	50-80%	> 80%

Connection Management and Hooks in Apache Airflow

Connection Management and Hooks

Architecture Diagram

Formal Definitions

Detailed Explanation

Creating Connections

Using Hooks

Key Concepts Table

Key Concepts Table

Code Examples

Custom Hook Implementation

Connection Pool Configuration

Secrets Backend Integration

Performance Metrics

Connection Types Comparison

Connection Pool Metrics

See Also

Need Expert Airflow Help?