Advanced Data Modeling
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
Star Schema Design
-- Fact table
CREATE TABLE fact_sales (
sale_id BIGINT PRIMARY KEY,
date_key INT REFERENCES dim_date(date_key),
product_key INT REFERENCES dim_product(product_key),
customer_key INT REFERENCES dim_customer(customer_key),
store_key INT REFERENCES dim_store(store_key),
quantity INT,
unit_price DECIMAL(10,2),
discount_amount DECIMAL(10,2),
total_amount DECIMAL(10,2)
);
-- Dimension tables
CREATE TABLE dim_date (
date_key INT PRIMARY KEY,
full_date DATE,
year INT,
quarter INT,
month INT,
day_of_week INT,
week_number INT
);
CREATE TABLE dim_product (
product_key INT PRIMARY KEY,
product_id INT,
product_name VARCHAR(255),
category VARCHAR(100),
subcategory VARCHAR(100),
brand VARCHAR(100)
);
βΉοΈ
Key Insight: Star schema denormalizes dimension tables for query performance. Fact tables contain foreign keys to dimensions and measurable metrics. This structure is optimal for OLAP queries.
Snowflake Schema
-- Normalized dimensions
CREATE TABLE dim_product (
product_key INT PRIMARY KEY,
product_id INT,
product_name VARCHAR(255),
category_key INT REFERENCES dim_category(category_key),
brand_key INT REFERENCES dim_brand(brand_key)
);
CREATE TABLE dim_category (
category_key INT PRIMARY KEY,
category_name VARCHAR(100),
department_key INT REFERENCES dim_department(department_key)
);
CREATE TABLE dim_department (
department_key INT PRIMARY KEY,
department_name VARCHAR(100)
);
Slowly Changing Dimensions (SCD)
-- SCD Type 1: Overwrite
UPDATE dim_customer
SET city = 'San Francisco',
last_modified = CURRENT_TIMESTAMP
WHERE customer_id = 101;
-- SCD Type 2: Add new row
INSERT INTO dim_customer (
customer_id, customer_name, city,
effective_date, expiry_date, is_current
)
SELECT
customer_id, customer_name, 'San Francisco',
CURRENT_DATE, '9999-12-31', true
FROM dim_customer
WHERE customer_id = 101 AND is_current = true;
UPDATE dim_customer
SET expiry_date = CURRENT_DATE - 1,
is_current = false
WHERE customer_id = 101 AND is_current = true;
-- SCD Type 3: Add new column
ALTER TABLE dim_customer ADD COLUMN previous_city VARCHAR(100);
UPDATE dim_customer
SET previous_city = city,
city = 'San Francisco'
WHERE customer_id = 101;
Bridge Table for Many-to-Many
-- Many-to-many relationship
CREATE TABLE bridge_order_products (
order_id INT,
product_id INT,
quantity INT,
unit_price DECIMAL(10,2),
PRIMARY KEY (order_id, product_id)
);
-- Query with bridge table
SELECT
o.order_id,
p.product_name,
b.quantity,
b.unit_price,
b.quantity * b.unit_price AS line_total
FROM orders o
INNER JOIN bridge_order_products b ON o.order_id = b.order_id
INNER JOIN products p ON b.product_id = p.product_id;
Temporal Data Model
-- Bi-temporal table
CREATE TABLE employee_history (
employee_id INT,
effective_date DATE,
expiry_date DATE,
as_of_date TIMESTAMP,
name VARCHAR(255),
department_id INT,
salary DECIMAL(10,2),
PRIMARY KEY (employee_id, effective_date, as_of_date)
);
-- Query point in time
SELECT *
FROM employee_history
WHERE employee_id = 101
AND effective_date <= '2024-06-15'
AND expiry_date > '2024-06-15'
AND as_of_date = (
SELECT MAX(as_of_date)
FROM employee_history
WHERE employee_id = 101
AND effective_date <= '2024-06-15'
);
BigQuery Denormalized Model
-- BigQuery nested and repeated fields
CREATE TABLE `project.dataset.orders` (
order_id INT64,
customer STRUCT<
customer_id INT64,
name STRING,
email STRING
>,
items ARRAY<STRUCT<
product_id INT64,
product_name STRING,
quantity INT64,
price NUMERIC
>>,
order_date TIMESTAMP
);
-- Query nested fields
SELECT
order_id,
customer.name,
item.product_name,
item.quantity
FROM `project.dataset.orders`,
UNNEST(items) AS item
WHERE customer.name LIKE '%Smith%';
Data Vault Modeling
-- Hub: Business keys
CREATE TABLE hub_customer (
customer_hk CHAR(32) PRIMARY KEY,
customer_bk VARCHAR(100),
load_date TIMESTAMP,
record_source VARCHAR(100)
);
-- Link: Relationships
CREATE TABLE link_order_customer (
order_customer_hk CHAR(32) PRIMARY KEY,
order_hk CHAR(32),
customer_hk CHAR(32),
load_date TIMESTAMP,
record_source VARCHAR(100)
);
-- Satellite: Descriptive attributes
CREATE TABLE sat_customer_details (
customer_hk CHAR(32),
load_date TIMESTAMP,
load_end_date TIMESTAMP,
name VARCHAR(255),
email VARCHAR(255),
phone VARCHAR(50),
record_source VARCHAR(100)
);
Graph Data Model
-- Nodes and edges for graph data
CREATE TABLE graph_nodes (
node_id VARCHAR(100) PRIMARY KEY,
node_type VARCHAR(50),
properties JSONB
);
CREATE TABLE graph_edges (
edge_id SERIAL PRIMARY KEY,
source_node VARCHAR(100),
target_node VARCHAR(100),
edge_type VARCHAR(50),
weight DECIMAL(10,4),
properties JSONB,
FOREIGN KEY (source_node) REFERENCES graph_nodes(node_id),
FOREIGN KEY (target_node) REFERENCES graph_nodes(node_id)
);
-- Query graph relationships
SELECT
n1.node_id AS source,
n2.node_id AS target,
e.edge_type,
e.weight
FROM graph_nodes n1
INNER JOIN graph_edges e ON n1.node_id = e.source_node
INNER JOIN graph_nodes n2 ON e.target_node = n2.node_id
WHERE n1.node_type = 'user'
AND n2.node_type = 'product';
Materialized Path Pattern
-- Store hierarchy as path
CREATE TABLE categories (
id INT PRIMARY KEY,
name VARCHAR(100),
path TEXT, -- '/1/5/12/'
depth INT
);
-- Query all descendants
SELECT *
FROM categories
WHERE path LIKE '/1/5/%'
ORDER BY path;
-- Query ancestors
SELECT *
FROM categories
WHERE id IN (
SELECT UNNEST(STRING_TO_ARRAY('/1/5/12/', '/')::INT[])
WHERE UNNEST != ''
);
Adjacency List with Closure Table
-- Closure table for hierarchy
CREATE TABLE category_closure (
ancestor_id INT,
descendant_id INT,
depth INT,
PRIMARY KEY (ancestor_id, descendant_id)
);
-- Query descendants
SELECT c.*
FROM categories c
INNER JOIN category_closure cc ON c.id = cc.descendant_id
WHERE cc.ancestor_id = 5;
-- Query ancestors
SELECT c.*
FROM categories c
INNER JOIN category_closure cc ON c.id = cc.ancestor_id
WHERE cc.descendant_id = 12;
Follow-Up Questions
- When would you choose star schema over snowflake schema?
- How do you handle many-to-many relationships in a star schema?
- Explain the trade-offs between SCD Type 1, 2, and 3.
- How would you design a data model for real-time analytics?
- What's the best approach for modeling hierarchical data?
- How do you handle schema evolution in a data warehouse?