Snowflake Slowly Changing Dimensions (SCD)
Slowly Changing Dimensions (SCD) techniques manage how historical data is tracked when dimension attributes change over time.
SCD Type 1: Overwrite
SCD Type 1 overwrites the existing record with new data, with no history maintained.
-- Create dimension table for SCD Type 1
CREATE OR REPLACE TABLE dim_customer_scd1 (
customer_id INTEGER PRIMARY KEY,
customer_name VARCHAR(100),
email VARCHAR(255),
phone VARCHAR(50),
address VARCHAR(500),
last_updated TIMESTAMP_NTZ
);
-- Load SCD Type 1 using MERGE
MERGE INTO dim_customer_scd1 AS target
USING (
SELECT
customer_id,
customer_name,
email,
phone,
address,
CURRENT_TIMESTAMP() as last_updated
FROM staging.customer_updates
) AS source
ON target.customer_id = source.customer_id
WHEN MATCHED AND (
target.customer_name != source.customer_name OR
target.email != source.email OR
target.phone != source.phone OR
target.address != source.address
) THEN
UPDATE SET
customer_name = source.customer_name,
email = source.email,
phone = source.phone,
address = source.address,
last_updated = source.last_updated
WHEN NOT MATCHED THEN
INSERT (customer_id, customer_name, email, phone, address, last_updated)
VALUES (source.customer_id, source.customer_name, source.email, source.phone, source.address, source.last_updated);
SCD Type 2: Versioning
SCD Type 2 creates new records for changes, maintaining a complete history with effective dates.
-- Create dimension table for SCD Type 2
CREATE OR REPLACE TABLE dim_customer_scd2 (
customer_sk INTEGER PRIMARY KEY, -- Surrogate key
customer_id INTEGER, -- Business key
customer_name VARCHAR(100),
email VARCHAR(255),
phone VARCHAR(50),
address VARCHAR(500),
effective_start_date TIMESTAMP_NTZ,
effective_end_date TIMESTAMP_NTZ,
is_current BOOLEAN,
record_source VARCHAR(100)
);
-- Create sequence for surrogate key
CREATE SEQUENCE customer_sk_seq START = 1;
-- Load SCD Type 2
MERGE INTO dim_customer_scd2 AS target
USING (
SELECT
s.customer_id,
s.customer_name,
s.email,
s.phone,
s.address,
CURRENT_TIMESTAMP() as effective_start_date,
NULL::TIMESTAMP_NTZ as effective_end_date,
TRUE as is_current,
'SOURCE_SYSTEM' as record_source
FROM staging.customer_updates s
) AS source
ON target.customer_id = source.customer_id AND target.is_current = TRUE
WHEN MATCHED AND (
target.customer_name != source.customer_name OR
target.email != source.email OR
target.phone != source.phone OR
target.address != source.address
) THEN
-- Close existing record
UPDATE SET
effective_end_date = source.effective_start_date,
is_current = FALSE
WHEN NOT MATCHED THEN
INSERT (customer_sk, customer_id, customer_name, email, phone, address,
effective_start_date, effective_end_date, is_current, record_source)
VALUES (NEXTVAL(customer_sk_seq), source.customer_id, source.customer_name,
source.email, source.phone, source.address,
source.effective_start_date, source.effective_end_date,
source.is_current, source.record_source);
-- Insert new version for changed records
INSERT INTO dim_customer_scd2 (
customer_sk, customer_id, customer_name, email, phone, address,
effective_start_date, effective_end_date, is_current, record_source
)
SELECT
NEXTVAL(customer_sk_seq),
source.customer_id,
source.customer_name,
source.email,
source.phone,
source.address,
source.effective_start_date,
NULL::TIMESTAMP_NTZ,
TRUE,
source.record_source
FROM (
SELECT
s.customer_id,
s.customer_name,
s.email,
s.phone,
s.address,
CURRENT_TIMESTAMP() as effective_start_date,
'SOURCE_SYSTEM' as record_source
FROM staging.customer_updates s
JOIN dim_customer_scd2 t
ON s.customer_id = t.customer_id AND t.is_current = TRUE
WHERE s.customer_name != t.customer_name OR
s.email != t.email OR
s.phone != t.phone OR
s.address != t.address
) source;
Querying SCD Type 2
-- Query current state
SELECT * FROM dim_customer_scd2 WHERE is_current = TRUE;
-- Query as of specific date
SELECT * FROM dim_customer_scd2
WHERE customer_id = 123
AND effective_start_date <= '2024-01-15'::TIMESTAMP
AND (effective_end_date IS NULL OR effective_end_date > '2024-01-15'::TIMESTAMP);
-- Query history for customer
SELECT
customer_name,
email,
effective_start_date,
effective_end_date,
is_current
FROM dim_customer_scd2
WHERE customer_id = 123
ORDER BY effective_start_date;
SCD Type 3: Limited History
SCD Type 3 stores limited history by adding columns for previous values.
-- Create dimension table for SCD Type 3
CREATE OR REPLACE TABLE dim_customer_scd3 (
customer_id INTEGER PRIMARY KEY,
customer_name VARCHAR(100),
email VARCHAR(255),
phone VARCHAR(50),
address VARCHAR(500),
previous_email VARCHAR(255),
previous_phone VARCHAR(50),
previous_address VARCHAR(500),
effective_date TIMESTAMP_NTZ
);
-- Load SCD Type 3
MERGE INTO dim_customer_scd3 AS target
USING (
SELECT
customer_id,
customer_name,
email,
phone,
address,
CURRENT_TIMESTAMP() as effective_date
FROM staging.customer_updates
) AS source
ON target.customer_id = source.customer_id
WHEN MATCHED AND (
target.email != source.email OR
target.phone != source.phone OR
target.address != source.address
) THEN
UPDATE SET
customer_name = source.customer_name,
previous_email = target.email,
previous_phone = target.phone,
previous_address = target.address,
email = source.email,
phone = source.phone,
address = source.address,
effective_date = source.effective_date
WHEN NOT MATCHED THEN
INSERT (customer_id, customer_name, email, phone, address,
previous_email, previous_phone, previous_address, effective_date)
VALUES (source.customer_id, source.customer_name, source.email, source.phone,
source.address, NULL, NULL, NULL, source.effective_date);
SCD Comparison
| Aspect | SCD Type 1 | SCD Type 2 | SCD Type 3 |
|---|---|---|---|
| History | None | Complete | Limited |
| Storage | Minimal | High | Medium |
| Complexity | Low | High | Medium |
| Query | Simple | Complex | Moderate |
| Use Case | Reference data | Audit trails | Recent changes |
-- Compare SCD approaches
SELECT
'SCD Type 1' as scd_type,
COUNT(*) as record_count,
SUM(CASE WHEN last_updated >= DATEADD('day', -30, CURRENT_DATE()) THEN 1 ELSE 0 END) as recent_changes
FROM dim_customer_scd1
UNION ALL
SELECT
'SCD Type 2' as scd_type,
COUNT(*) as record_count,
SUM(CASE WHEN effective_start_date >= DATEADD('day', -30, CURRENT_DATE()) THEN 1 ELSE 0 END) as recent_changes
FROM dim_customer_scd2
UNION ALL
SELECT
'SCD Type 3' as scd_type,
COUNT(*) as record_count,
SUM(CASE WHEN effective_date >= DATEADD('day', -30, CURRENT_DATE()) THEN 1 ELSE 0 END) as recent_changes
FROM dim_customer_scd3;
Automated SCD Processing
-- Create SCD Type 2 automation procedure
CREATE OR REPLACE PROCEDURE process_scd_type2(
source_table STRING,
target_table STRING,
business_key STRING,
change_columns ARRAY
)
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
merge_sql STRING;
insert_sql STRING;
BEGIN
-- Build MERGE statement dynamically
merge_sql := 'MERGE INTO ' || target_table || ' AS target
USING (SELECT * FROM ' || source_table || ') AS source
ON target.' || business_key || ' = source.' || business_key || ' AND target.is_current = TRUE';
-- Add change detection
merge_sql := merge_sql || ' WHEN MATCHED AND (';
-- Build change detection conditions
FOR i IN 1 TO ARRAY_SIZE(change_columns) DO
IF i > 1 THEN
merge_sql := merge_sql || ' OR';
END IF;
merge_sql := merge_sql || ' target.' || change_columns[i] || ' != source.' || change_columns[i];
END FOR;
merge_sql := merge_sql || ') THEN UPDATE SET effective_end_date = CURRENT_TIMESTAMP(), is_current = FALSE';
EXECUTE IMMEDIATE merge_sql;
RETURN 'SUCCESS: SCD Type 2 processing completed';
END;
$$;
Choose SCD Type based on business requirements: Type 1 for simple overwrites, Type 2 for complete audit trails, and Type 3 for limited historical tracking. Consider storage and query complexity implications.
Best Practices
| SCD Type | Best Practice |
|---|---|
| Type 1 | Use for reference data that doesn't need history |
| Type 2 | Implement surrogate keys for efficient joins |
| Type 2 | Use effective dates for point-in-time queries |
| Type 3 | Limit to 1-2 previous versions |
| All | Index on business keys for performance |
Key Takeaways:
- SCD Type 1 overwrites data (no history)
- SCD Type 2 creates new records (complete history)
- SCD Type 3 stores limited previous values
- SCD Type 2 is most common for audit requirements
- Surrogate keys enable efficient joins in SCD Type 2
- Choose SCD type based on business requirements