Snowflake Advanced Β· Interview Prep
Schema Evolution & Semi-Structured Data
Difficulty: Hard Β· Commonly asked at Amazon, Netflix, Meta
Interview Question
"How does Snowflake handle schema evolution for semi-structured data? Walk me through using VARIANT columns, LATERAL FLATTEN, and schema detection for JSON/Parquet files. What are the performance trade-offs?"
βΉοΈ
Companies Asking This: Amazon (L5/L6 Data Engineer), Netflix (Senior Data Engineer), Meta (Data Platform Engineer), Apple (Staff Data Engineer)
Semi-Structured Data in Snowflake
Snowflake natively supports semi-structured data types: VARIANT, OBJECT, and ARRAY. These types can hold JSON, Avro, Parquet, ORC, and XML data.
VARIANT Data Type
-- VARIANT can store any semi-structured data
CREATE TABLE raw_json_data (
id NUMBER AUTOINCREMENT,
data VARIANT,
_file_name VARCHAR(500),
_load_time TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);
-- Insert JSON directly
INSERT INTO raw_json_data (data)
SELECT PARSE_JSON('{
"event_id": "evt_123",
"event_type": "page_view",
"timestamp": "2024-01-15T10:30:00Z",
"user": {
"id": "user_456",
"name": "John Doe",
"email": "john@example.com"
},
"properties": {
"page_url": "/home",
"referrer": "google.com",
"device": "mobile"
},
"tags": ["analytics", "web", "production"]
}');
-- Query using dot notation
SELECT
data:event_id::VARCHAR AS event_id,
data:event_type::VARCHAR AS event_type,
data:user:id::VARCHAR AS user_id,
data:user:name::VARCHAR AS user_name,
data:properties:page_url::VARCHAR AS page_url,
data:tags[0]::VARCHAR AS first_tag
FROM raw_json_data;
-- Check data type of VARIANT column
SELECT
VARIANT_TYPE(data:event_id) AS event_id_type,
VARIANT_TYPE(data:user:name) AS user_name_type,
VARIANT_TYPE(data:properties) AS properties_type
FROM raw_json_data;
LATERAL FLATTEN
FLATTEN is used to unnest VARIANT arrays and objects into rows.
-- Create table with nested JSON
CREATE TABLE user_events (
user_id VARCHAR(100),
events VARIANT
);
INSERT INTO user_events VALUES
('user_1', PARSE_JSON('{"events": [{"type": "click", "ts": "2024-01-15T10:00:00"}, {"type": "view", "ts": "2024-01-15T10:01:00"}]}')),
('user_2', PARSE_JSON('{"events": [{"type": "purchase", "ts": "2024-01-15T10:02:00"}]}'));
-- Flatten the events array
SELECT
u.user_id,
f.value:type::VARCHAR AS event_type,
f.value:ts::TIMESTAMP_NTZ AS event_time,
f.index AS event_index
FROM user_events u,
LATERAL FLATTEN(input => u.events:events) f;
-- Flatten with path
SELECT
u.user_id,
f.key AS property_key,
f.value AS property_value
FROM user_events u,
LATERAL FLATTEN(input => u.events, path => 'events') f;
-- Recursive flatten for deeply nested data
SELECT
u.user_id,
f1.value:type::VARCHAR AS event_type,
f2.value::VARCHAR AS tag
FROM user_events u,
LATERAL FLATTEN(input => u.events:events) f1,
LATERAL FLATTEN(input => f1.value:tags) f2;
-- FLATTEN modes: OBJECT, ARRAY, BOTH
SELECT * FROM LATERAL FLATTEN(input => '{"a": 1, "b": 2}'::VARIANT, mode => 'OBJECT');
SELECT * FROM LATERAL FLATTEN(input => '[1, 2, 3]'::VARIANT, mode => 'ARRAY');
Real-World Scenario: Amazon
Question: "Design a schema for ingesting 500GB/day of Parquet files with evolving schemas. Some files have new columns, some have renamed columns. How do you handle this?"
Solution: Schema Evolution Strategy
-- 1. Create target table with VARIANT for flexibility
CREATE TABLE events_evolution (
id NUMBER AUTOINCREMENT,
event_id VARCHAR(100),
event_type VARCHAR(50),
event_timestamp TIMESTAMP_NTZ,
raw_data VARIANT,
_schema_version VARCHAR(20),
_file_name VARCHAR(500),
_load_time TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);
-- 2. Create external stage for Parquet files
CREATE OR REPLACE STAGE parquet_stage
URL = 's3://data-lake/events/'
STORAGE_INTEGRATION = s3_integration
FILE_FORMAT = (TYPE = 'PARQUET');
-- 3. Load Parquet with schema detection
COPY INTO events_evolution (event_id, event_type, event_timestamp, raw_data, _schema_version, _file_name)
FROM (
SELECT
$1:event_id::VARCHAR,
$1:event_type::VARCHAR,
$1:timestamp::TIMESTAMP_NTZ,
$1, -- Store entire record
COALESCE($1:schema_version::VARCHAR, 'v1'),
METADATA$FILENAME
FROM @parquet_stage
)
FILE_FORMAT = (TYPE = 'PARQUET')
ON_ERROR = 'CONTINUE';
-- 4. Detect schema changes over time
WITH schema_snapshot AS (
SELECT
_schema_version,
OBJECT_KEYS(raw_data) AS columns,
COUNT(*) AS record_count
FROM events_evolution
GROUP BY 1, 2
)
SELECT
_schema_version,
columns,
record_count,
LAG(columns) OVER (ORDER BY _schema_version) AS prev_columns,
ARRAY_DIFF(columns, LAG(columns) OVER (ORDER BY _schema_version)) AS new_columns
FROM schema_snapshot
ORDER BY _schema_version;
-- 5. Create views for different schema versions
CREATE OR REPLACE VIEW events_v1_view AS
SELECT
id,
event_id,
event_type,
event_timestamp,
raw_data:user_id::VARCHAR AS user_id,
raw_data:page_url::VARCHAR AS page_url,
NULL::VARCHAR AS device_type,
NULL::VARCHAR AS app_version,
_schema_version
FROM events_evolution
WHERE _schema_version = 'v1';
CREATE OR REPLACE VIEW events_v2_view AS
SELECT
id,
event_id,
event_type,
event_timestamp,
raw_data:user_id::VARCHAR AS user_id,
raw_data:page_url::VARCHAR AS page_url,
raw_data:device_type::VARCHAR AS device_type,
raw_data:app_version::VARCHAR AS app_version,
_schema_version
FROM events_evolution
WHERE _schema_version = 'v2';
Real-World Scenario: Netflix
Question: "How do you optimize queries on VARIANT columns? The queries are slow because of all the parsing."
Performance Optimization for Semi-Structured Data
-- 1. Create table with explicit columns + VARIANT for unknown fields
CREATE TABLE events_optimized (
id NUMBER AUTOINCREMENT,
event_id VARCHAR(100) NOT NULL,
event_type VARCHAR(50) NOT NULL,
event_timestamp TIMESTAMP_NTZ NOT NULL,
user_id VARCHAR(100),
page_url VARCHAR(500),
device_type VARCHAR(50),
raw_data VARIANT, -- Keep for new/unknown fields
_ingestion_time TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),
PRIMARY KEY (event_id)
);
-- 2. Load with explicit column extraction
COPY INTO events_optimized (event_id, event_type, event_timestamp, user_id, page_url, device_type, raw_data)
FROM (
SELECT
$1:event_id::VARCHAR,
$1:event_type::VARCHAR,
$1:timestamp::TIMESTAMP_NTZ,
$1:user_id::VARCHAR,
$1:page_url::VARCHAR,
$1:device_type::VARCHAR,
$1
FROM @parquet_stage
)
FILE_FORMAT = (TYPE = 'PARQUET')
ON_ERROR = 'CONTINUE';
-- 3. Create clustering key for common query patterns
ALTER TABLE events_optimized
CLUSTER BY (event_type, event_timestamp);
-- 4. Use generated columns for frequently queried VARIANT paths
ALTER TABLE events_optimized
ADD COLUMN user_country VARCHAR(2)
AS (raw_data:user:country::VARCHAR);
ALTER TABLE events_optimized
ADD COLUMN session_duration NUMBER
AS (raw_data:session:duration::NUMBER);
-- 5. Create search optimization for VARIANT queries
ALTER TABLE events_optimized
SET SEARCH_OPTIMIZATION = ON;
-- 6. Create search optimization on specific VARIANT paths
ALTER TABLE events_optimized
ADD SEARCH OPTIMIZATION ON
(raw_data:user:country, raw_data:properties:page_url);
-- 7. Query performance comparison
-- Before optimization (slow - full VARIANT parsing)
SELECT COUNT(*)
FROM events_optimized
WHERE raw_data:user:country::VARCHAR = 'US';
-- After optimization (fast - uses search optimization)
SELECT COUNT(*)
FROM events_optimized
WHERE user_country = 'US';
Schema Evolution Patterns
Pattern: Rename Detection
-- Create a schema change log
CREATE TABLE schema_changes (
change_id NUMBER AUTOINCREMENT,
table_name VARCHAR(100),
old_column_name VARCHAR(100),
new_column_name VARCHAR(100),
change_type VARCHAR(50),
detected_at TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),
resolved BOOLEAN DEFAULT FALSE
);
-- Detect potential renames by comparing old and new schemas
WITH old_schema AS (
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'EVENTS'
AND column_name NOT LIKE '_%'
),
new_schema AS (
SELECT DISTINCT
f.key AS column_name
FROM events_evolution,
LATERAL FLATTEN(input => OBJECT_KEYS(raw_data)) f
)
SELECT
o.column_name AS old_name,
n.column_name AS new_name,
'POTENTIAL_RENAME' AS change_type
FROM old_schema o
CROSS JOIN new_schema n
WHERE o.column_name != n.column_name
AND LEVENSHTEIN(LOWER(o.column_name), LOWER(n.column_name)) <= 2;
Pattern: Schema Validation
-- Validate JSON against expected schema
CREATE OR REPLACE PROCEDURE validate_event_schema(
event_data VARIANT,
expected_schema VARIANT
)
RETURNS BOOLEAN
LANGUAGE SQL
AS
$$
DECLARE
required_fields ARRAY;
field VARIANT;
missing_fields ARRAY DEFAULT ARRAY_CONSTRUCT();
BEGIN
required_fields := expected_schema:required::ARRAY;
FOR field IN SELECT VALUE FROM TABLE(FLATTEN(input => required_fields)) DO
IF NOT OBJECT_HAS_KEY(event_data, field.value::VARCHAR) THEN
missing_fields := ARRAY_APPEND(missing_fields, field.value);
END IF;
END FOR;
RETURN ARRAY_SIZE(missing_fields) = 0;
END;
$$;
-- Use validation in COPY INTO
COPY INTO events_validated
FROM (
SELECT
*,
validate_event_schema($1, PARSE_JSON('{"required": ["event_id", "event_type", "timestamp"]}')) AS is_valid
FROM @parquet_stage
)
FILE_FORMAT = (TYPE = 'PARQUET');
Best Practices
| Practice | Recommendation |
|---|---|
| VARIANT usage | Use for unknown/flexible fields; extract known fields to columns |
| FLATTEN | Use LATERAL FLATTEN for array unnesting; consider performance impact |
| Clustering | Cluster by commonly queried VARIANT paths |
| Schema evolution | Track schema versions; create versioned views |
| Search optimization | Enable for frequently queried VARIANT paths |
| File format | Parquet > JSON for performance; use columnar for analytics |
β οΈ
Performance Warning: Querying VARIANT columns is slower than querying typed columns because Snowflake must parse JSON on each query. Extract frequently queried fields to typed columns and index them.