Snowflake Data Vault Modeling
Data Vault 2.0 is a methodology for designing scalable, auditable, and flexible data warehouses that support historical tracking and parallel loading.
Data Vault Components
Hubs (Business Keys)
-- Create Hub table
CREATE OR REPLACE TABLE dv.hub_customer (
hub_customer_hk BINARY(16) PRIMARY KEY, -- Hash key
customer_bk VARCHAR(50), -- Business key
load_date TIMESTAMP_NTZ, -- Load timestamp
record_source VARCHAR(100) -- Source system
);
-- Load Hub table
INSERT INTO dv.hub_customer (
hub_customer_hk,
customer_bk,
load_date,
record_source
)
SELECT
SHA2(customer_id::STRING, 256) as hub_customer_hk,
customer_id::STRING as customer_bk,
CURRENT_TIMESTAMP() as load_date,
'ERP_SYSTEM' as record_source
FROM staging.customers
WHERE customer_id IS NOT NULL
AND customer_id NOT IN (SELECT customer_bk FROM dv.hub_customer);
Links (Relationships)
-- Create Link table
CREATE OR REPLACE TABLE dv.link_order_customer (
link_order_customer_hk BINARY(16) PRIMARY KEY,
hub_order_hk BINARY(16),
hub_customer_hk BINARY(16),
load_date TIMESTAMP_NTZ,
record_source VARCHAR(100)
);
-- Load Link table
INSERT INTO dv.link_order_customer (
link_order_customer_hk,
hub_order_hk,
hub_customer_hk,
load_date,
record_source
)
SELECT
SHA2(CONCAT(o.hub_order_hk, c.hub_customer_hk), 256) as link_order_customer_hk,
o.hub_order_hk,
c.hub_customer_hk,
CURRENT_TIMESTAMP() as load_date,
'ERP_SYSTEM' as record_source
FROM dv.hub_order o
JOIN staging.order_details od ON o.order_bk = od.order_id::STRING
JOIN dv.hub_customer c ON od.customer_id::STRING = c.customer_bk;
Satellites (Attributes)
-- Create Satellite table
CREATE OR REPLACE TABLE dv.sat_customer_details (
hub_customer_hk BINARY(16),
load_date TIMESTAMP_NTZ,
load_end_date TIMESTAMP_NTZ,
hash_diff BINARY(16), -- Hash of attributes
first_name VARCHAR(100),
last_name VARCHAR(100),
email VARCHAR(255),
phone VARCHAR(50),
address VARCHAR(500),
record_source VARCHAR(100),
PRIMARY KEY (hub_customer_hk, load_date)
);
-- Load Satellite table
INSERT INTO dv.sat_customer_details (
hub_customer_hk,
load_date,
load_end_date,
hash_diff,
first_name,
last_name,
email,
phone,
address,
record_source
)
SELECT
c.hub_customer_hk,
CURRENT_TIMESTAMP() as load_date,
NULL as load_end_date,
SHA2(CONCAT(
COALESCE(s.first_name, ''),
COALESCE(s.last_name, ''),
COALESCE(s.email, ''),
COALESCE(s.phone, ''),
COALESCE(s.address, '')
), 256) as hash_diff,
s.first_name,
s.last_name,
s.email,
s.phone,
s.address,
'ERP_SYSTEM' as record_source
FROM dv.hub_customer c
JOIN staging.customers s ON c.customer_bk = s.customer_id::STRING;
Hash Key Generation
-- Create hash key generation functions
CREATE OR REPLACE FUNCTION dv.hash_business_key(business_key VARCHAR)
RETURNS BINARY(16)
LANGUAGE SQL
AS
$$
SHA2(business_key, 256)
$$;
-- Create hash diff function
CREATE OR REPLACE FUNCTION dv.hash_diff(attributes ARRAY)
RETURNS BINARY(16)
LANGUAGE SQL
AS
$$
SHA2(CONCAT_WS(',', attributes), 256)
$$;
-- Use in loading process
SELECT
dv.hash_business_key(customer_id::STRING) as hub_customer_hk,
dv.hash_diff(ARRAY_CONSTRUCT(first_name, last_name, email)) as hash_diff
FROM staging.customers;
Loading Patterns
Full Load
-- Full load procedure
CREATE OR REPLACE PROCEDURE dv.load_full_customer()
RETURNS STRING
LANGUAGE SQL
AS
$$
BEGIN
-- Truncate staging
TRUNCATE TABLE staging.customers;
-- Load from source
INSERT INTO staging.customers
SELECT * FROM source_system.customers;
-- Load Hub
INSERT INTO dv.hub_customer (
hub_customer_hk, customer_bk, load_date, record_source
)
SELECT
SHA2(customer_id::STRING, 256),
customer_id::STRING,
CURRENT_TIMESTAMP(),
'SOURCE_SYSTEM'
FROM staging.customers
WHERE customer_id IS NOT NULL;
-- Load Satellite
INSERT INTO dv.sat_customer_details (
hub_customer_hk, load_date, load_end_date, hash_diff,
first_name, last_name, email, phone, address, record_source
)
SELECT
SHA2(c.customer_id::STRING, 256),
CURRENT_TIMESTAMP(),
NULL,
SHA2(CONCAT(c.first_name, c.last_name, c.email), 256),
c.first_name, c.last_name, c.email, c.phone, c.address,
'SOURCE_SYSTEM'
FROM staging.customers c;
RETURN 'SUCCESS: Customer data loaded';
END;
$$;
Incremental Load
-- Incremental load using Streams
CREATE STREAM customer_stream
ON TABLE staging.customers
APPEND_ONLY = FALSE;
-- Load Hub incrementally
INSERT INTO dv.hub_customer (
hub_customer_hk, customer_bk, load_date, record_source
)
SELECT
SHA2(customer_id::STRING, 256),
customer_id::STRING,
CURRENT_TIMESTAMP(),
'SOURCE_SYSTEM'
FROM customer_stream
WHERE METADATA$ACTION = 'INSERT'
AND customer_id NOT IN (SELECT customer_bk FROM dv.hub_customer);
-- Load Satellite incrementally
INSERT INTO dv.sat_customer_details (
hub_customer_hk, load_date, load_end_date, hash_diff,
first_name, last_name, email, phone, address, record_source
)
SELECT
SHA2(c.customer_id::STRING, 256),
CURRENT_TIMESTAMP(),
NULL,
SHA2(CONCAT(c.first_name, c.last_name, c.email), 256),
c.first_name, c.last_name, c.email, c.phone, c.address,
'SOURCE_SYSTEM'
FROM customer_stream c
WHERE METADATA$ACTION = 'INSERT';
Data Vault loading is designed for parallel processing. Hubs, Links, and Satellites can be loaded independently, enabling high-throughput data ingestion.
Querying Data Vault
Point-in-Time Queries
-- Query as of specific timestamp
SELECT
c.customer_bk,
s.first_name,
s.last_name,
s.email
FROM dv.hub_customer c
JOIN dv.sat_customer_details s ON c.hub_customer_hk = s.hub_customer_hk
WHERE s.load_date <= '2024-01-15'::TIMESTAMP
AND (s.load_end_date IS NULL OR s.load_end_date > '2024-01-15'::TIMESTAMP);
-- Query current state
SELECT
c.customer_bk,
s.first_name,
s.last_name,
s.email
FROM dv.hub_customer c
JOIN dv.sat_customer_details s ON c.hub_customer_hk = s.hub_customer_hk
WHERE s.load_end_date IS NULL;
Building Business Marts
-- Create business mart
CREATE OR REPLACE VIEW mart.customer_orders AS
SELECT
c.customer_bk,
c.first_name,
c.last_name,
c.email,
COUNT(DISTINCT o.order_bk) as total_orders,
SUM(o.order_amount) as total_spend,
MIN(o.order_date) as first_order_date,
MAX(o.order_date) as last_order_date
FROM dv.hub_customer c
JOIN dv.link_order_customer loc ON c.hub_customer_hk = loc.hub_customer_hk
JOIN dv.hub_order o ON loc.hub_order_hk = o.hub_order_hk
JOIN dv.sat_order_details s ON o.hub_order_hk = s.hub_order_hk
GROUP BY c.customer_bk, c.first_name, c.last_name, c.email;
Data Vault Benefits
| Benefit | Description |
|---|---|
| Auditability | Complete history of all changes |
| Parallel Loading | Independent Hub/Link/Satellite loading |
| Flexibility | Easy to add new sources |
| Scalability | Supports massive data volumes |
| Point-in-Time | Historical query capability |
Key Takeaways:
- Data Vault 2.0 uses Hubs, Links, and Satellites for modeling
- Hubs store business keys, Links store relationships, Satellites store attributes
- Hash keys enable efficient joins and parallel loading
- Incremental loading supports real-time data integration
- Point-in-time queries enable historical analysis
- Business Marts provide analytics-ready datasets