CW

Snowflake Slowly Changing Dimensions (SCD)

Free Lesson

Advertisement

Snowflake Slowly Changing Dimensions (SCD)

Slowly Changing Dimensions (SCD) techniques manage how historical data is tracked when dimension attributes change over time.

SCD Type 1: Overwrite

SCD Type 1 overwrites the existing record with new data, with no history maintained.

-- Create dimension table for SCD Type 1
CREATE OR REPLACE TABLE dim_customer_scd1 (
  customer_id INTEGER PRIMARY KEY,
  customer_name VARCHAR(100),
  email VARCHAR(255),
  phone VARCHAR(50),
  address VARCHAR(500),
  last_updated TIMESTAMP_NTZ
);

-- Load SCD Type 1 using MERGE
MERGE INTO dim_customer_scd1 AS target
USING (
  SELECT
    customer_id,
    customer_name,
    email,
    phone,
    address,
    CURRENT_TIMESTAMP() as last_updated
  FROM staging.customer_updates
) AS source
ON target.customer_id = source.customer_id
WHEN MATCHED AND (
  target.customer_name != source.customer_name OR
  target.email != source.email OR
  target.phone != source.phone OR
  target.address != source.address
) THEN
  UPDATE SET
    customer_name = source.customer_name,
    email = source.email,
    phone = source.phone,
    address = source.address,
    last_updated = source.last_updated
WHEN NOT MATCHED THEN
  INSERT (customer_id, customer_name, email, phone, address, last_updated)
  VALUES (source.customer_id, source.customer_name, source.email, source.phone, source.address, source.last_updated);

SCD Type 2: Versioning

SCD Type 2 creates new records for changes, maintaining a complete history with effective dates.

-- Create dimension table for SCD Type 2
CREATE OR REPLACE TABLE dim_customer_scd2 (
  customer_sk INTEGER PRIMARY KEY,        -- Surrogate key
  customer_id INTEGER,                     -- Business key
  customer_name VARCHAR(100),
  email VARCHAR(255),
  phone VARCHAR(50),
  address VARCHAR(500),
  effective_start_date TIMESTAMP_NTZ,
  effective_end_date TIMESTAMP_NTZ,
  is_current BOOLEAN,
  record_source VARCHAR(100)
);

-- Create sequence for surrogate key
CREATE SEQUENCE customer_sk_seq START = 1;

-- Load SCD Type 2
MERGE INTO dim_customer_scd2 AS target
USING (
  SELECT
    s.customer_id,
    s.customer_name,
    s.email,
    s.phone,
    s.address,
    CURRENT_TIMESTAMP() as effective_start_date,
    NULL::TIMESTAMP_NTZ as effective_end_date,
    TRUE as is_current,
    'SOURCE_SYSTEM' as record_source
  FROM staging.customer_updates s
) AS source
ON target.customer_id = source.customer_id AND target.is_current = TRUE
WHEN MATCHED AND (
  target.customer_name != source.customer_name OR
  target.email != source.email OR
  target.phone != source.phone OR
  target.address != source.address
) THEN
  -- Close existing record
  UPDATE SET
    effective_end_date = source.effective_start_date,
    is_current = FALSE
WHEN NOT MATCHED THEN
  INSERT (customer_sk, customer_id, customer_name, email, phone, address,
          effective_start_date, effective_end_date, is_current, record_source)
  VALUES (NEXTVAL(customer_sk_seq), source.customer_id, source.customer_name,
          source.email, source.phone, source.address,
          source.effective_start_date, source.effective_end_date,
          source.is_current, source.record_source);

-- Insert new version for changed records
INSERT INTO dim_customer_scd2 (
  customer_sk, customer_id, customer_name, email, phone, address,
  effective_start_date, effective_end_date, is_current, record_source
)
SELECT
  NEXTVAL(customer_sk_seq),
  source.customer_id,
  source.customer_name,
  source.email,
  source.phone,
  source.address,
  source.effective_start_date,
  NULL::TIMESTAMP_NTZ,
  TRUE,
  source.record_source
FROM (
  SELECT
    s.customer_id,
    s.customer_name,
    s.email,
    s.phone,
    s.address,
    CURRENT_TIMESTAMP() as effective_start_date,
    'SOURCE_SYSTEM' as record_source
  FROM staging.customer_updates s
  JOIN dim_customer_scd2 t
    ON s.customer_id = t.customer_id AND t.is_current = TRUE
  WHERE s.customer_name != t.customer_name OR
        s.email != t.email OR
        s.phone != t.phone OR
        s.address != t.address
) source;

Querying SCD Type 2

-- Query current state
SELECT * FROM dim_customer_scd2 WHERE is_current = TRUE;

-- Query as of specific date
SELECT * FROM dim_customer_scd2
WHERE customer_id = 123
  AND effective_start_date <= '2024-01-15'::TIMESTAMP
  AND (effective_end_date IS NULL OR effective_end_date > '2024-01-15'::TIMESTAMP);

-- Query history for customer
SELECT
  customer_name,
  email,
  effective_start_date,
  effective_end_date,
  is_current
FROM dim_customer_scd2
WHERE customer_id = 123
ORDER BY effective_start_date;

SCD Type 3: Limited History

SCD Type 3 stores limited history by adding columns for previous values.

-- Create dimension table for SCD Type 3
CREATE OR REPLACE TABLE dim_customer_scd3 (
  customer_id INTEGER PRIMARY KEY,
  customer_name VARCHAR(100),
  email VARCHAR(255),
  phone VARCHAR(50),
  address VARCHAR(500),
  previous_email VARCHAR(255),
  previous_phone VARCHAR(50),
  previous_address VARCHAR(500),
  effective_date TIMESTAMP_NTZ
);

-- Load SCD Type 3
MERGE INTO dim_customer_scd3 AS target
USING (
  SELECT
    customer_id,
    customer_name,
    email,
    phone,
    address,
    CURRENT_TIMESTAMP() as effective_date
  FROM staging.customer_updates
) AS source
ON target.customer_id = source.customer_id
WHEN MATCHED AND (
  target.email != source.email OR
  target.phone != source.phone OR
  target.address != source.address
) THEN
  UPDATE SET
    customer_name = source.customer_name,
    previous_email = target.email,
    previous_phone = target.phone,
    previous_address = target.address,
    email = source.email,
    phone = source.phone,
    address = source.address,
    effective_date = source.effective_date
WHEN NOT MATCHED THEN
  INSERT (customer_id, customer_name, email, phone, address,
          previous_email, previous_phone, previous_address, effective_date)
  VALUES (source.customer_id, source.customer_name, source.email, source.phone,
          source.address, NULL, NULL, NULL, source.effective_date);

SCD Comparison

AspectSCD Type 1SCD Type 2SCD Type 3
HistoryNoneCompleteLimited
StorageMinimalHighMedium
ComplexityLowHighMedium
QuerySimpleComplexModerate
Use CaseReference dataAudit trailsRecent changes
-- Compare SCD approaches
SELECT
  'SCD Type 1' as scd_type,
  COUNT(*) as record_count,
  SUM(CASE WHEN last_updated >= DATEADD('day', -30, CURRENT_DATE()) THEN 1 ELSE 0 END) as recent_changes
FROM dim_customer_scd1

UNION ALL

SELECT
  'SCD Type 2' as scd_type,
  COUNT(*) as record_count,
  SUM(CASE WHEN effective_start_date >= DATEADD('day', -30, CURRENT_DATE()) THEN 1 ELSE 0 END) as recent_changes
FROM dim_customer_scd2

UNION ALL

SELECT
  'SCD Type 3' as scd_type,
  COUNT(*) as record_count,
  SUM(CASE WHEN effective_date >= DATEADD('day', -30, CURRENT_DATE()) THEN 1 ELSE 0 END) as recent_changes
FROM dim_customer_scd3;

Automated SCD Processing

-- Create SCD Type 2 automation procedure
CREATE OR REPLACE PROCEDURE process_scd_type2(
  source_table STRING,
  target_table STRING,
  business_key STRING,
  change_columns ARRAY
)
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
  merge_sql STRING;
  insert_sql STRING;
BEGIN
  -- Build MERGE statement dynamically
  merge_sql := 'MERGE INTO ' || target_table || ' AS target
    USING (SELECT * FROM ' || source_table || ') AS source
    ON target.' || business_key || ' = source.' || business_key || ' AND target.is_current = TRUE';

  -- Add change detection
  merge_sql := merge_sql || ' WHEN MATCHED AND (';

  -- Build change detection conditions
  FOR i IN 1 TO ARRAY_SIZE(change_columns) DO
    IF i > 1 THEN
      merge_sql := merge_sql || ' OR';
    END IF;
    merge_sql := merge_sql || ' target.' || change_columns[i] || ' != source.' || change_columns[i];
  END FOR;

  merge_sql := merge_sql || ') THEN UPDATE SET effective_end_date = CURRENT_TIMESTAMP(), is_current = FALSE';

  EXECUTE IMMEDIATE merge_sql;

  RETURN 'SUCCESS: SCD Type 2 processing completed';
END;
$$;

Choose SCD Type based on business requirements: Type 1 for simple overwrites, Type 2 for complete audit trails, and Type 3 for limited historical tracking. Consider storage and query complexity implications.

Best Practices

SCD TypeBest Practice
Type 1Use for reference data that doesn't need history
Type 2Implement surrogate keys for efficient joins
Type 2Use effective dates for point-in-time queries
Type 3Limit to 1-2 previous versions
AllIndex on business keys for performance

Key Takeaways:

  • SCD Type 1 overwrites data (no history)
  • SCD Type 2 creates new records (complete history)
  • SCD Type 3 stores limited previous values
  • SCD Type 2 is most common for audit requirements
  • Surrogate keys enable efficient joins in SCD Type 2
  • Choose SCD type based on business requirements

Advertisement

Need Expert Snowflake Help?

Get personalized warehouse optimization, data modeling, or Snowflake platform consulting.

Advertisement