CW

Snowflake Data Vault Modeling

Free Lesson

Advertisement

Snowflake Data Vault Modeling

Data Vault 2.0 is a methodology for designing scalable, auditable, and flexible data warehouses that support historical tracking and parallel loading.

Data Vault Components

Hubs (Business Keys)

-- Create Hub table
CREATE OR REPLACE TABLE dv.hub_customer (
  hub_customer_hk BINARY(16) PRIMARY KEY,  -- Hash key
  customer_bk VARCHAR(50),                 -- Business key
  load_date TIMESTAMP_NTZ,                -- Load timestamp
  record_source VARCHAR(100)              -- Source system
);

-- Load Hub table
INSERT INTO dv.hub_customer (
  hub_customer_hk,
  customer_bk,
  load_date,
  record_source
)
SELECT
  SHA2(customer_id::STRING, 256) as hub_customer_hk,
  customer_id::STRING as customer_bk,
  CURRENT_TIMESTAMP() as load_date,
  'ERP_SYSTEM' as record_source
FROM staging.customers
WHERE customer_id IS NOT NULL
  AND customer_id NOT IN (SELECT customer_bk FROM dv.hub_customer);

Links (Relationships)

-- Create Link table
CREATE OR REPLACE TABLE dv.link_order_customer (
  link_order_customer_hk BINARY(16) PRIMARY KEY,
  hub_order_hk BINARY(16),
  hub_customer_hk BINARY(16),
  load_date TIMESTAMP_NTZ,
  record_source VARCHAR(100)
);

-- Load Link table
INSERT INTO dv.link_order_customer (
  link_order_customer_hk,
  hub_order_hk,
  hub_customer_hk,
  load_date,
  record_source
)
SELECT
  SHA2(CONCAT(o.hub_order_hk, c.hub_customer_hk), 256) as link_order_customer_hk,
  o.hub_order_hk,
  c.hub_customer_hk,
  CURRENT_TIMESTAMP() as load_date,
  'ERP_SYSTEM' as record_source
FROM dv.hub_order o
JOIN staging.order_details od ON o.order_bk = od.order_id::STRING
JOIN dv.hub_customer c ON od.customer_id::STRING = c.customer_bk;

Satellites (Attributes)

-- Create Satellite table
CREATE OR REPLACE TABLE dv.sat_customer_details (
  hub_customer_hk BINARY(16),
  load_date TIMESTAMP_NTZ,
  load_end_date TIMESTAMP_NTZ,
  hash_diff BINARY(16),                    -- Hash of attributes
  first_name VARCHAR(100),
  last_name VARCHAR(100),
  email VARCHAR(255),
  phone VARCHAR(50),
  address VARCHAR(500),
  record_source VARCHAR(100),
  PRIMARY KEY (hub_customer_hk, load_date)
);

-- Load Satellite table
INSERT INTO dv.sat_customer_details (
  hub_customer_hk,
  load_date,
  load_end_date,
  hash_diff,
  first_name,
  last_name,
  email,
  phone,
  address,
  record_source
)
SELECT
  c.hub_customer_hk,
  CURRENT_TIMESTAMP() as load_date,
  NULL as load_end_date,
  SHA2(CONCAT(
    COALESCE(s.first_name, ''),
    COALESCE(s.last_name, ''),
    COALESCE(s.email, ''),
    COALESCE(s.phone, ''),
    COALESCE(s.address, '')
  ), 256) as hash_diff,
  s.first_name,
  s.last_name,
  s.email,
  s.phone,
  s.address,
  'ERP_SYSTEM' as record_source
FROM dv.hub_customer c
JOIN staging.customers s ON c.customer_bk = s.customer_id::STRING;

Hash Key Generation

-- Create hash key generation functions
CREATE OR REPLACE FUNCTION dv.hash_business_key(business_key VARCHAR)
RETURNS BINARY(16)
LANGUAGE SQL
AS
$$
  SHA2(business_key, 256)
$$;

-- Create hash diff function
CREATE OR REPLACE FUNCTION dv.hash_diff(attributes ARRAY)
RETURNS BINARY(16)
LANGUAGE SQL
AS
$$
  SHA2(CONCAT_WS(',', attributes), 256)
$$;

-- Use in loading process
SELECT
  dv.hash_business_key(customer_id::STRING) as hub_customer_hk,
  dv.hash_diff(ARRAY_CONSTRUCT(first_name, last_name, email)) as hash_diff
FROM staging.customers;

Loading Patterns

Full Load

-- Full load procedure
CREATE OR REPLACE PROCEDURE dv.load_full_customer()
RETURNS STRING
LANGUAGE SQL
AS
$$
BEGIN
  -- Truncate staging
  TRUNCATE TABLE staging.customers;

  -- Load from source
  INSERT INTO staging.customers
  SELECT * FROM source_system.customers;

  -- Load Hub
  INSERT INTO dv.hub_customer (
    hub_customer_hk, customer_bk, load_date, record_source
  )
  SELECT
    SHA2(customer_id::STRING, 256),
    customer_id::STRING,
    CURRENT_TIMESTAMP(),
    'SOURCE_SYSTEM'
  FROM staging.customers
  WHERE customer_id IS NOT NULL;

  -- Load Satellite
  INSERT INTO dv.sat_customer_details (
    hub_customer_hk, load_date, load_end_date, hash_diff,
    first_name, last_name, email, phone, address, record_source
  )
  SELECT
    SHA2(c.customer_id::STRING, 256),
    CURRENT_TIMESTAMP(),
    NULL,
    SHA2(CONCAT(c.first_name, c.last_name, c.email), 256),
    c.first_name, c.last_name, c.email, c.phone, c.address,
    'SOURCE_SYSTEM'
  FROM staging.customers c;

  RETURN 'SUCCESS: Customer data loaded';
END;
$$;

Incremental Load

-- Incremental load using Streams
CREATE STREAM customer_stream
  ON TABLE staging.customers
  APPEND_ONLY = FALSE;

-- Load Hub incrementally
INSERT INTO dv.hub_customer (
  hub_customer_hk, customer_bk, load_date, record_source
)
SELECT
  SHA2(customer_id::STRING, 256),
  customer_id::STRING,
  CURRENT_TIMESTAMP(),
  'SOURCE_SYSTEM'
FROM customer_stream
WHERE METADATA$ACTION = 'INSERT'
  AND customer_id NOT IN (SELECT customer_bk FROM dv.hub_customer);

-- Load Satellite incrementally
INSERT INTO dv.sat_customer_details (
  hub_customer_hk, load_date, load_end_date, hash_diff,
  first_name, last_name, email, phone, address, record_source
)
SELECT
  SHA2(c.customer_id::STRING, 256),
  CURRENT_TIMESTAMP(),
  NULL,
  SHA2(CONCAT(c.first_name, c.last_name, c.email), 256),
  c.first_name, c.last_name, c.email, c.phone, c.address,
  'SOURCE_SYSTEM'
FROM customer_stream c
WHERE METADATA$ACTION = 'INSERT';

Data Vault loading is designed for parallel processing. Hubs, Links, and Satellites can be loaded independently, enabling high-throughput data ingestion.

Querying Data Vault

Point-in-Time Queries

-- Query as of specific timestamp
SELECT
  c.customer_bk,
  s.first_name,
  s.last_name,
  s.email
FROM dv.hub_customer c
JOIN dv.sat_customer_details s ON c.hub_customer_hk = s.hub_customer_hk
WHERE s.load_date <= '2024-01-15'::TIMESTAMP
  AND (s.load_end_date IS NULL OR s.load_end_date > '2024-01-15'::TIMESTAMP);

-- Query current state
SELECT
  c.customer_bk,
  s.first_name,
  s.last_name,
  s.email
FROM dv.hub_customer c
JOIN dv.sat_customer_details s ON c.hub_customer_hk = s.hub_customer_hk
WHERE s.load_end_date IS NULL;

Building Business Marts

-- Create business mart
CREATE OR REPLACE VIEW mart.customer_orders AS
SELECT
  c.customer_bk,
  c.first_name,
  c.last_name,
  c.email,
  COUNT(DISTINCT o.order_bk) as total_orders,
  SUM(o.order_amount) as total_spend,
  MIN(o.order_date) as first_order_date,
  MAX(o.order_date) as last_order_date
FROM dv.hub_customer c
JOIN dv.link_order_customer loc ON c.hub_customer_hk = loc.hub_customer_hk
JOIN dv.hub_order o ON loc.hub_order_hk = o.hub_order_hk
JOIN dv.sat_order_details s ON o.hub_order_hk = s.hub_order_hk
GROUP BY c.customer_bk, c.first_name, c.last_name, c.email;

Data Vault Benefits

BenefitDescription
AuditabilityComplete history of all changes
Parallel LoadingIndependent Hub/Link/Satellite loading
FlexibilityEasy to add new sources
ScalabilitySupports massive data volumes
Point-in-TimeHistorical query capability

Key Takeaways:

  • Data Vault 2.0 uses Hubs, Links, and Satellites for modeling
  • Hubs store business keys, Links store relationships, Satellites store attributes
  • Hash keys enable efficient joins and parallel loading
  • Incremental loading supports real-time data integration
  • Point-in-time queries enable historical analysis
  • Business Marts provide analytics-ready datasets

Advertisement

Need Expert Snowflake Help?

Get personalized warehouse optimization, data modeling, or Snowflake platform consulting.

Advertisement