Advanced Data Modeling

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

Star Schema Design

-- Fact table
CREATE TABLE fact_sales (
  sale_id BIGINT PRIMARY KEY,
  date_key INT REFERENCES dim_date(date_key),
  product_key INT REFERENCES dim_product(product_key),
  customer_key INT REFERENCES dim_customer(customer_key),
  store_key INT REFERENCES dim_store(store_key),
  quantity INT,
  unit_price DECIMAL(10,2),
  discount_amount DECIMAL(10,2),
  total_amount DECIMAL(10,2)
);

-- Dimension tables
CREATE TABLE dim_date (
  date_key INT PRIMARY KEY,
  full_date DATE,
  year INT,
  quarter INT,
  month INT,
  day_of_week INT,
  week_number INT
);

CREATE TABLE dim_product (
  product_key INT PRIMARY KEY,
  product_id INT,
  product_name VARCHAR(255),
  category VARCHAR(100),
  subcategory VARCHAR(100),
  brand VARCHAR(100)
);

ℹ️

Key Insight: Star schema denormalizes dimension tables for query performance. Fact tables contain foreign keys to dimensions and measurable metrics. This structure is optimal for OLAP queries.

Snowflake Schema

-- Normalized dimensions
CREATE TABLE dim_product (
  product_key INT PRIMARY KEY,
  product_id INT,
  product_name VARCHAR(255),
  category_key INT REFERENCES dim_category(category_key),
  brand_key INT REFERENCES dim_brand(brand_key)
);

CREATE TABLE dim_category (
  category_key INT PRIMARY KEY,
  category_name VARCHAR(100),
  department_key INT REFERENCES dim_department(department_key)
);

CREATE TABLE dim_department (
  department_key INT PRIMARY KEY,
  department_name VARCHAR(100)
);

Slowly Changing Dimensions (SCD)

-- SCD Type 1: Overwrite
UPDATE dim_customer
SET city = 'San Francisco',
    last_modified = CURRENT_TIMESTAMP
WHERE customer_id = 101;

-- SCD Type 2: Add new row
INSERT INTO dim_customer (
  customer_id, customer_name, city,
  effective_date, expiry_date, is_current
)
SELECT
  customer_id, customer_name, 'San Francisco',
  CURRENT_DATE, '9999-12-31', true
FROM dim_customer
WHERE customer_id = 101 AND is_current = true;

UPDATE dim_customer
SET expiry_date = CURRENT_DATE - 1,
    is_current = false
WHERE customer_id = 101 AND is_current = true;

-- SCD Type 3: Add new column
ALTER TABLE dim_customer ADD COLUMN previous_city VARCHAR(100);
UPDATE dim_customer
SET previous_city = city,
    city = 'San Francisco'
WHERE customer_id = 101;

Bridge Table for Many-to-Many

-- Many-to-many relationship
CREATE TABLE bridge_order_products (
  order_id INT,
  product_id INT,
  quantity INT,
  unit_price DECIMAL(10,2),
  PRIMARY KEY (order_id, product_id)
);

-- Query with bridge table
SELECT
  o.order_id,
  p.product_name,
  b.quantity,
  b.unit_price,
  b.quantity * b.unit_price AS line_total
FROM orders o
INNER JOIN bridge_order_products b ON o.order_id = b.order_id
INNER JOIN products p ON b.product_id = p.product_id;

Temporal Data Model

-- Bi-temporal table
CREATE TABLE employee_history (
  employee_id INT,
  effective_date DATE,
  expiry_date DATE,
  as_of_date TIMESTAMP,
  name VARCHAR(255),
  department_id INT,
  salary DECIMAL(10,2),
  PRIMARY KEY (employee_id, effective_date, as_of_date)
);

-- Query point in time
SELECT *
FROM employee_history
WHERE employee_id = 101
  AND effective_date <= '2024-06-15'
  AND expiry_date > '2024-06-15'
  AND as_of_date = (
    SELECT MAX(as_of_date)
    FROM employee_history
    WHERE employee_id = 101
      AND effective_date <= '2024-06-15'
  );

BigQuery Denormalized Model

-- BigQuery nested and repeated fields
CREATE TABLE `project.dataset.orders` (
  order_id INT64,
  customer STRUCT<
    customer_id INT64,
    name STRING,
    email STRING
  >,
  items ARRAY<STRUCT<
    product_id INT64,
    product_name STRING,
    quantity INT64,
    price NUMERIC
  >>,
  order_date TIMESTAMP
);

-- Query nested fields
SELECT
  order_id,
  customer.name,
  item.product_name,
  item.quantity
FROM `project.dataset.orders`,
UNNEST(items) AS item
WHERE customer.name LIKE '%Smith%';

Data Vault Modeling

-- Hub: Business keys
CREATE TABLE hub_customer (
  customer_hk CHAR(32) PRIMARY KEY,
  customer_bk VARCHAR(100),
  load_date TIMESTAMP,
  record_source VARCHAR(100)
);

-- Link: Relationships
CREATE TABLE link_order_customer (
  order_customer_hk CHAR(32) PRIMARY KEY,
  order_hk CHAR(32),
  customer_hk CHAR(32),
  load_date TIMESTAMP,
  record_source VARCHAR(100)
);

-- Satellite: Descriptive attributes
CREATE TABLE sat_customer_details (
  customer_hk CHAR(32),
  load_date TIMESTAMP,
  load_end_date TIMESTAMP,
  name VARCHAR(255),
  email VARCHAR(255),
  phone VARCHAR(50),
  record_source VARCHAR(100)
);

Graph Data Model

-- Nodes and edges for graph data
CREATE TABLE graph_nodes (
  node_id VARCHAR(100) PRIMARY KEY,
  node_type VARCHAR(50),
  properties JSONB
);

CREATE TABLE graph_edges (
  edge_id SERIAL PRIMARY KEY,
  source_node VARCHAR(100),
  target_node VARCHAR(100),
  edge_type VARCHAR(50),
  weight DECIMAL(10,4),
  properties JSONB,
  FOREIGN KEY (source_node) REFERENCES graph_nodes(node_id),
  FOREIGN KEY (target_node) REFERENCES graph_nodes(node_id)
);

-- Query graph relationships
SELECT
  n1.node_id AS source,
  n2.node_id AS target,
  e.edge_type,
  e.weight
FROM graph_nodes n1
INNER JOIN graph_edges e ON n1.node_id = e.source_node
INNER JOIN graph_nodes n2 ON e.target_node = n2.node_id
WHERE n1.node_type = 'user'
  AND n2.node_type = 'product';

Materialized Path Pattern

-- Store hierarchy as path
CREATE TABLE categories (
  id INT PRIMARY KEY,
  name VARCHAR(100),
  path TEXT,  -- '/1/5/12/'
  depth INT
);

-- Query all descendants
SELECT *
FROM categories
WHERE path LIKE '/1/5/%'
ORDER BY path;

-- Query ancestors
SELECT *
FROM categories
WHERE id IN (
  SELECT UNNEST(STRING_TO_ARRAY('/1/5/12/', '/')::INT[])
  WHERE UNNEST != ''
);

Adjacency List with Closure Table

-- Closure table for hierarchy
CREATE TABLE category_closure (
  ancestor_id INT,
  descendant_id INT,
  depth INT,
  PRIMARY KEY (ancestor_id, descendant_id)
);

-- Query descendants
SELECT c.*
FROM categories c
INNER JOIN category_closure cc ON c.id = cc.descendant_id
WHERE cc.ancestor_id = 5;

-- Query ancestors
SELECT c.*
FROM categories c
INNER JOIN category_closure cc ON c.id = cc.ancestor_id
WHERE cc.descendant_id = 12;

Follow-Up Questions

When would you choose star schema over snowflake schema?
How do you handle many-to-many relationships in a star schema?
Explain the trade-offs between SCD Type 1, 2, and 3.
How would you design a data model for real-time analytics?
What's the best approach for modeling hierarchical data?
How do you handle schema evolution in a data warehouse?