Snowflake Advanced · Interview Prep

ETL/ELT Patterns & Transformations

Difficulty: Medium-Hard · Commonly asked at Amazon, Apple, Netflix

Interview Question

"Compare ETL vs ELT approaches in Snowflake. When would you choose one over the other? Walk me through implementing a slowly changing dimension (SCD Type 2) using Snowflake's native features."

ℹ️

Companies Asking This: Amazon (L6 Data Engineer), Apple (Senior Data Engineer), Netflix (Staff Data Engineer), Meta (Data Platform Engineer)

ETL vs ELT

When to Use Each

-- ELT: Transform in Snowflake (recommended for most cases)
-- Step 1: Load raw data into staging
COPY INTO staging.raw_orders
FROM @s3_stage/orders/
FILE_FORMAT = (TYPE = 'PARQUET');

-- Step 2: Transform in Snowflake using SQL
CREATE TABLE prod.orders_transformed AS
SELECT 
    order_id,
    customer_id,
    order_date,
    total_amount,
    status,
    -- Data quality transformations
    TRIM(UPPER(status)) AS status_cleaned,
    COALESCE(discount, 0) AS discount_amount,
    total_amount - COALESCE(discount, 0) AS net_amount,
    -- Derived columns
    DATEDIFF('day', order_date, CURRENT_DATE()) AS days_since_order,
    CASE 
        WHEN total_amount > 1000 THEN 'HIGH_VALUE'
        WHEN total_amount > 100 THEN 'MEDIUM_VALUE'
        ELSE 'LOW_VALUE'
    END AS value_tier,
    -- Timestamp
    CURRENT_TIMESTAMP() AS transformed_at
FROM staging.raw_orders
WHERE order_date >= DATEADD(year, -1, CURRENT_DATE());

-- ETL: Transform outside Snowflake (use for complex transformations)
-- Example: Using dbt, Spark, or other tools
-- These tools connect to Snowflake, extract data, transform externally, 
-- and load back

SCD Type 2 Implementation

-- 1. Create target SCD table
CREATE TABLE prod.customers_scd (
    customer_key NUMBER AUTOINCREMENT,
    customer_id VARCHAR(100),
    customer_name VARCHAR(200),
    email VARCHAR(200),
    segment VARCHAR(50),
    effective_start_date TIMESTAMP_NTZ,
    effective_end_date TIMESTAMP_NTZ DEFAULT TO_TIMESTAMP_NTZ('9999-12-31'),
    is_current BOOLEAN DEFAULT TRUE,
    _ingestion_time TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),
    _source_system VARCHAR(50)
);

-- 2. Create staging table for new/changed records
CREATE TABLE staging.customer_changes (
    customer_id VARCHAR(100),
    customer_name VARCHAR(200),
    email VARCHAR(200),
    segment VARCHAR(50),
    _change_type VARCHAR(10),  -- INSERT, UPDATE, DELETE
    _change_timestamp TIMESTAMP_NTZ
);

-- 3. SCD Type 2 merge procedure
CREATE OR REPLACE PROCEDURE merge_scd_type2()
RETURNS VARCHAR
LANGUAGE SQL
AS
$$
DECLARE
    merge_count NUMBER DEFAULT 0;
    insert_count NUMBER DEFAULT 0;
BEGIN
    -- Close existing records for changed customers
    MERGE INTO prod.customers_scd t
    USING staging.customer_changes s
    ON t.customer_id = s.customer_id 
       AND t.is_current = TRUE
       AND s._change_type = 'UPDATE'
    WHEN MATCHED AND (
        t.customer_name != s.customer_name OR
        t.email != s.email OR
        t.segment != s.segment
    ) THEN
        UPDATE SET 
            effective_end_date = s._change_timestamp,
            is_current = FALSE;
    
    merge_count := SQLROWCOUNT;
    
    -- Insert new current records
    INSERT INTO prod.customers_scd (
        customer_id, customer_name, email, segment,
        effective_start_date, is_current, _source_system
    )
    SELECT 
        s.customer_id,
        s.customer_name,
        s.email,
        s.segment,
        s._change_timestamp,
        TRUE,
        'SOURCE_SYSTEM'
    FROM staging.customer_changes s
    WHERE s._change_type IN ('INSERT', 'UPDATE')
      AND NOT EXISTS (
          SELECT 1 FROM prod.customers_scd t
          WHERE t.customer_id = s.customer_id
            AND t.is_current = TRUE
            AND t.customer_name = s.customer_name
            AND t.email = s.email
            AND t.segment = s.segment
      );
    
    insert_count := SQLROWCOUNT;
    
    -- Handle soft deletes
    UPDATE prod.customers_scd
    SET is_current = FALSE,
        effective_end_date = CURRENT_TIMESTAMP()
    WHERE customer_id IN (
        SELECT customer_id FROM staging.customer_changes
        WHERE _change_type = 'DELETE'
    )
    AND is_current = TRUE;
    
    RETURN 'SCD2 merge complete. Closed: ' || merge_count || 
           ', Inserted: ' || insert_count;
END;
$$;

Real-World Scenario: Amazon

Question: "Design a data pipeline that ingests data from multiple sources (APIs, databases, files), applies data quality rules, and loads into a star schema."

Solution: Multi-Source Pipeline

-- 1. Create raw ingestion layer
CREATE SCHEMA raw;
CREATE SCHEMA staging;
CREATE SCHEMA prod;

-- 2. Raw tables for each source
CREATE TABLE raw.api_orders (
    payload VARIANT,
    _source VARCHAR(50),
    _ingestion_time TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);

CREATE TABLE raw.db_inventory (
    payload VARIANT,
    _source VARCHAR(50),
    _ingestion_time TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);

-- 3. Data quality UDFs
CREATE OR REPLACE FUNCTION validate_order(order_data VARIANT)
RETURNS BOOLEAN
LANGUAGE SQL
AS
    'order_data:order_id::VARCHAR IS NOT NULL 
     AND order_data:amount::NUMBER > 0 
     AND order_data:order_date::DATE IS NOT NULL';

-- 4. Staging with quality checks
CREATE TABLE staging.orders_staged AS
SELECT 
    payload:order_id::VARCHAR AS order_id,
    payload:customer_id::VARCHAR AS customer_id,
    payload:order_date::DATE AS order_date,
    payload:amount::NUMBER AS amount,
    payload:status::VARCHAR AS status,
    validate_order(payload) AS is_valid,
    CASE 
        WHEN NOT validate_order(payload) THEN 'INVALID_DATA'
        ELSE 'VALID'
    END AS quality_flag,
    _source,
    _ingestion_time
FROM raw.api_orders;

-- 5. Quality check procedure
CREATE OR REPLACE PROCEDURE run_quality_checks()
RETURNS VARCHAR
LANGUAGE SQL
AS
$$
DECLARE
    invalid_count NUMBER;
    total_count NUMBER;
BEGIN
    SELECT COUNT(*) INTO invalid_count
    FROM staging.orders_staged
    WHERE quality_flag = 'INVALID_DATA';
    
    SELECT COUNT(*) INTO total_count
    FROM staging.orders_staged;
    
    IF invalid_count > total_count * 0.1 THEN
        RETURN 'ERROR: ' || invalid_count || ' invalid records (' || 
               ROUND(invalid_count * 100.0 / total_count, 2) || '%). Pipeline halted.';
    ELSE
        RETURN 'Quality check passed. Invalid: ' || invalid_count || 
               ' (' || ROUND(invalid_count * 100.0 / total_count, 2) || '%)';
    END IF;
END;
$$;

-- 6. Star schema loading
-- Fact table
INSERT INTO prod.fact_orders
SELECT 
    s.order_id,
    s.customer_id,
    s.order_date,
    s.amount,
    s.status,
    d.date_key,
    c.customer_key
FROM staging.orders_staged s
JOIN prod.dim_date d ON s.order_date = d.full_date
JOIN prod.dim_customer c ON s.customer_id = c.customer_id
WHERE s.quality_flag = 'VALID';

Best Practices

Practice	Recommendation
ELT over ETL	Use Snowflake's compute for transformations
Staging layer	Always stage raw data before transformation
Data quality	Implement quality checks in staging
Incremental loads	Use Streams for change detection
Idempotency	Design pipelines to be re-runnable
Documentation	Document transformation logic

⚠️

Common Pitfalls:

Loading directly to prod — Always use staging layer
No error handling — Implement TRY/CATCH and rollback
Full reloads — Use incremental loads for large tables
No data quality checks — Validate before loading to prod
Ignoring dependencies — Schedule tasks based on dependencies

Topic: ETL/ELT Patterns & Transformations