πΎ Materialized Views
Google & Amazon Interview Deep Dive
π’ Googleπ’ Amazonβ‘ Difficulty: Mediumβ±οΈ 30 min
π Interview Question
βΉοΈπ΄ Google/Amazon Interview Question
"Design a caching strategy using materialized views for an e-commerce analytics dashboard. Consider: 1) When to use materialized views vs regular views, 2) Refresh strategies (full vs incremental), 3) Handling stale data, 4) Performance trade-offs."
Companies: Google, Amazon | Difficulty: Medium | Time: 30 minutes
π Setup: E-Commerce Schema
CREATE TABLE products (
product_id SERIAL PRIMARY KEY,
product_name VARCHAR(200),
category VARCHAR(100),
price DECIMAL(10, 2),
cost DECIMAL(10, 2),
stock_quantity INT
);
CREATE TABLE orders (
order_id SERIAL PRIMARY KEY,
customer_id INT,
order_date TIMESTAMP,
status VARCHAR(20)
);
CREATE TABLE order_items (
item_id SERIAL PRIMARY KEY,
order_id INT REFERENCES orders(order_id),
product_id INT REFERENCES products(product_id),
quantity INT,
unit_price DECIMAL(10, 2),
discount DECIMAL(10, 2) DEFAULT 0
);
CREATE TABLE customers (
customer_id SERIAL PRIMARY KEY,
name VARCHAR(200),
email VARCHAR(200),
segment VARCHAR(50),
created_at TIMESTAMP DEFAULT NOW()
);
π‘ Part 1: Materialized Views vs Regular Views
βΉοΈπ View vs Materialized View
| Feature | Regular View | Materialized View |
|---|---|---|
| Storage | No storage, query re-run | Physical storage, cached result |
| Performance | Depends on underlying query | Fast reads, slow refresh |
| Data freshness | Always current | Stale until refreshed |
| Indexes | Cannot create | Can create indexes |
| Updates | Automatic | Manual refresh needed |
-- Regular View (no storage)
CREATE VIEW v_product_sales AS
SELECT
p.product_id,
p.product_name,
p.category,
SUM(oi.quantity) AS total_quantity_sold,
SUM(oi.quantity * oi.unit_price) AS total_revenue,
SUM(oi.quantity * (oi.unit_price - p.cost)) AS total_profit
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
WHERE o.status = 'completed'
GROUP BY p.product_id, p.product_name, p.category;
-- Materialized View (stores result)
CREATE MATERIALIZED VIEW mv_product_sales AS
SELECT
p.product_id,
p.product_name,
p.category,
SUM(oi.quantity) AS total_quantity_sold,
SUM(oi.quantity * oi.unit_price) AS total_revenue,
SUM(oi.quantity * (oi.unit_price - p.cost)) AS total_profit,
NOW() AS last_refreshed
FROM products p
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
WHERE o.status = 'completed'
GROUP BY p.product_id, p.product_name, p.category;
π Part 2: Refresh Strategies
Full Refresh
-- Full refresh: Rebuild entire materialized view
REFRESH MATERIALIZED VIEW mv_product_sales;
-- Concurrent refresh (PostgreSQL): Allows reads during refresh
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_product_sales;
-- Requires unique index on materialized view
CREATE UNIQUE INDEX idx_mv_product_sales_id ON mv_product_sales(product_id);
Incremental Refresh Pattern
-- Create materialized view with refresh tracking
CREATE MATERIALIZED VIEW mv_orders_daily AS
SELECT
DATE_TRUNC('day', order_date)::DATE AS order_date,
COUNT(*) AS order_count,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) AS completed_orders,
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END) AS cancelled_orders
FROM orders
GROUP BY DATE_TRUNC('day', order_date)
WITH DATA;
-- Refresh only recent data (incremental pattern)
CREATE OR REPLACE PROCEDURE refresh_incremental_mv()
LANGUAGE plpgsql
AS $$
DECLARE
v_last_refresh TIMESTAMP;
v_cutoff TIMESTAMP;
BEGIN
-- Get last refresh time
SELECT MAX(order_date) INTO v_last_refresh
FROM mv_orders_daily;
v_cutoff := COALESCE(v_last_refresh, '2020-01-01'::timestamp);
-- Delete affected rows
DELETE FROM mv_orders_daily
WHERE order_date >= v_cutoff::date;
-- Insert updated rows
INSERT INTO mv_orders_daily
SELECT
DATE_TRUNC('day', order_date)::DATE,
COUNT(*),
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END),
SUM(CASE WHEN status = 'cancelled' THEN 1 ELSE 0 END)
FROM orders
WHERE order_date >= v_cutoff
GROUP BY DATE_TRUNC('day', order_date);
RAISE NOTICE 'Incremental refresh completed from %', v_cutoff;
END;
$$;
π Part 3: Materialized View Patterns
Dashboard Aggregates
-- Sales dashboard materialized view
CREATE MATERIALIZED VIEW mv_sales_dashboard AS
WITH
daily_metrics AS (
SELECT
DATE_TRUNC('day', o.order_date)::DATE AS sale_date,
p.category,
COUNT(DISTINCT o.order_id) AS orders,
COUNT(DISTINCT o.customer_id) AS unique_customers,
SUM(oi.quantity) AS units_sold,
SUM(oi.quantity * oi.unit_price) AS revenue,
SUM(oi.quantity * (oi.unit_price - p.cost)) AS profit
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.status = 'completed'
GROUP BY 1, 2
),
category_totals AS (
SELECT
category,
SUM(revenue) AS total_revenue,
SUM(profit) AS total_profit,
AVG(revenue) AS avg_daily_revenue
FROM daily_metrics
GROUP BY category
)
SELECT
dm.*,
ct.total_revenue AS category_total_revenue,
ROUND(dm.revenue * 100.0 / ct.total_revenue, 2) AS category_share_pct,
ROUND(dm.profit * 100.0 / NULLIF(dm.revenue, 0), 2) AS profit_margin_pct
FROM daily_metrics dm
JOIN category_totals ct ON dm.category = ct.category;
-- Add indexes for fast dashboard queries
CREATE INDEX idx_mv_sales_date ON mv_sales_dashboard(sale_date);
CREATE INDEX idx_mv_sales_category ON mv_sales_dashboard(category);
CREATE INDEX idx_mv_sales_date_category ON mv_sales_dashboard(sale_date, category);
Customer Segmentation
-- Customer segmentation materialized view
CREATE MATERIALIZED VIEW mv_customer_segmentation AS
WITH customer_metrics AS (
SELECT
c.customer_id,
c.name,
c.segment,
COUNT(DISTINCT o.order_id) AS total_orders,
SUM(oi.quantity * oi.unit_price) AS total_spent,
AVG(oi.quantity * oi.unit_price) AS avg_order_value,
MAX(o.order_date) AS last_order_date,
MIN(o.order_date) AS first_order_date,
COUNT(DISTINCT DATE_TRUNC('month', o.order_date)) AS active_months
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
LEFT JOIN order_items oi ON o.order_id = oi.order_id
WHERE o.status = 'completed'
GROUP BY c.customer_id, c.name, c.segment
)
SELECT
*,
CASE
WHEN total_spent > 10000 THEN 'VIP'
WHEN total_spent > 5000 THEN 'Premium'
WHEN total_spent > 1000 THEN 'Regular'
ELSE 'New'
END AS value_tier,
CASE
WHEN last_order_date > NOW() - INTERVAL '30 days' THEN 'Active'
WHEN last_order_date > NOW() - INTERVAL '90 days' THEN 'At Risk'
WHEN last_order_date > NOW() - INTERVAL '180 days' THEN 'Lapsing'
ELSE 'Churned'
END AS activity_status,
NOW() AS last_refreshed
FROM customer_metrics;
π Part 4: Refresh Scheduling
-- Schedule refresh using pg_cron (if available)
-- Refresh daily at 2 AM
SELECT cron.schedule(
'refresh-sales-dashboard',
'0 2 * * *',
'REFRESH MATERIALIZED VIEW CONCURRENTLY mv_sales_dashboard'
);
-- Refresh every hour
SELECT cron.schedule(
'refresh-customer-segmentation',
'0 * * * *',
'REFRESH MATERIALIZED VIEW CONCURRENTLY mv_customer_segmentation'
);
-- Manual refresh with timing
CREATE OR REPLACE PROCEDURE refresh_with_timing(p_view_name TEXT)
LANGUAGE plpgsql
AS $$
DECLARE
v_start TIMESTAMP;
v_end TIMESTAMP;
v_duration INTERVAL;
BEGIN
v_start := clock_timestamp();
EXECUTE FORMAT('REFRESH MATERIALIZED VIEW CONCURRENTLY %I', p_view_name);
v_end := clock_timestamp();
v_duration := v_end - v_start;
RAISE NOTICE 'Refreshed % in %', p_view_name, v_duration;
END;
$$;
CALL refresh_with_timing('mv_sales_dashboard');
π Part 5: Monitoring Materialized Views
-- Check materialized view sizes
SELECT
schemaname,
matviewname,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||matviewname)) AS total_size
FROM pg_matviews
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||matviewname) DESC;
-- Check staleness (if tracking refresh time)
SELECT
matviewname,
CASE
WHEN pg_total_relation_size(schemaname||'.'||matviewname) > 1073741824
THEN 'Consider refresh strategy'
ELSE 'OK'
END AS size_status
FROM pg_matviews;
-- View refresh history (custom tracking table)
CREATE TABLE mv_refresh_log (
log_id SERIAL PRIMARY KEY,
view_name VARCHAR(100),
refresh_start TIMESTAMP,
refresh_end TIMESTAMP,
duration INTERVAL,
rows_affected INT
);
-- Track refresh performance
CREATE OR REPLACE PROCEDURE tracked_refresh(p_view_name TEXT)
LANGUAGE plpgsql
AS $$
DECLARE
v_start TIMESTAMP;
v_end TIMESTAMP;
v_rows INT;
BEGIN
v_start := clock_timestamp();
EXECUTE FORMAT('REFRESH MATERIALIZED VIEW CONCURRENTLY %I', p_view_name);
GET DIAGNOSTICS v_rows = ROW_COUNT;
v_end := clock_timestamp();
INSERT INTO mv_refresh_log (view_name, refresh_start, refresh_end, duration, rows_affected)
VALUES (p_view_name, v_start, v_end, v_end - v_start, v_rows);
END;
$$;
π― Quiz Section
π Best Practices for Interviews
π‘β Materialized View Best Practices
1. Add Indexes for Query Patterns:
-- Create indexes based on how the MV will be queried
CREATE INDEX idx_mv_date ON mv_sales(sale_date);
CREATE INDEX idx_mv_category ON mv_sales(category);
2. Use CONCURRENTLY for Production:
-- Avoid blocking reads during refresh
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_name;
-- Requires unique index
3. Monitor Staleness:
-- Add refresh timestamp
SELECT *, NOW() AS last_refreshed FROM mv_sales;
-- Or use tracking table
INSERT INTO mv_refresh_log (view_name, refreshed_at)
VALUES ('mv_sales', NOW());
4. Consider Refresh Frequency:
-- Daily for historical reports
-- Hourly for near-real-time dashboards
-- On-demand for ad-hoc analysis
5. Plan for Growth:
-- Partition materialized views if they grow large
-- Consider archiving old data
-- Monitor size and refresh times
β οΈβ οΈ Common Pitfalls
- Forgetting to refresh: Data becomes permanently stale
- Refreshing too frequently: Wastes resources
- No unique index: Can't use CONCURRENTLY
- Large MVs: Long refresh times, consider partitioning
- Complex MVs: May not support all operations