Snowflake Data Quality Frameworks
Data quality frameworks in Snowflake ensure data accuracy, completeness, consistency, and timeliness through systematic validation and monitoring.
Data Quality Dimensions
Completeness
-- Check for null values
SELECT
'email' as column_name,
COUNT(*) as total_rows,
COUNT(email) as non_null_rows,
ROUND(COUNT(email) / COUNT(*) * 100, 2) as completeness_pct
FROM customers
UNION ALL
SELECT
'phone' as column_name,
COUNT(*) as total_rows,
COUNT(phone) as non_null_rows,
ROUND(COUNT(phone) / COUNT(*) * 100, 2) as completeness_pct
FROM customers;
-- Create completeness validation
CREATE OR REPLACE PROCEDURE check_completeness(
table_name STRING,
column_name STRING,
threshold FLOAT
)
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
completeness FLOAT;
BEGIN
EXECUTE IMMEDIATE
'SELECT COUNT(' || column_name || ') / COUNT(*) FROM ' || table_name
INTO completeness;
IF completeness >= threshold THEN
RETURN 'PASS: ' || ROUND(completeness * 100, 2) || '% completeness';
ELSE
RETURN 'FAIL: ' || ROUND(completeness * 100, 2) || '% completeness (threshold: ' || ROUND(threshold * 100, 2) || '%)';
END IF;
END;
$$;
-- Execute completeness check
CALL check_completeness('CUSTOMERS', 'EMAIL', 0.95);
Accuracy
-- Validate email format
CREATE OR REPLACE MASKING POLICY email_validation AS (val STRING)
RETURNS STRING ->
CASE
WHEN REGEXP_LIKE(val, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN val
ELSE 'INVALID_EMAIL'
END;
-- Check for valid dates
SELECT
order_id,
order_date,
CASE
WHEN order_date > CURRENT_DATE() THEN 'FUTURE_DATE'
WHEN order_date < '2000-01-01' THEN 'INVALID_DATE'
ELSE 'VALID'
END as date_validation
FROM orders;
-- Create accuracy validation
CREATE OR REPLACE PROCEDURE validate_data_accuracy()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
invalid_count INTEGER;
BEGIN
-- Check invalid emails
SELECT COUNT(*) INTO invalid_count
FROM customers
WHERE NOT REGEXP_LIKE(email, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$');
IF invalid_count > 0 THEN
RETURN 'ERROR: ' || invalid_count || ' invalid emails found';
END IF;
-- Check invalid dates
SELECT COUNT(*) INTO invalid_count
FROM orders
WHERE order_date > CURRENT_DATE() OR order_date < '2000-01-01';
IF invalid_count > 0 THEN
RETURN 'ERROR: ' || invalid_count || ' invalid dates found';
END IF;
RETURN 'SUCCESS: All data accurate';
END;
$$;
Consistency
-- Check cross-table consistency
SELECT
o.customer_id,
COUNT(o.order_id) as order_count,
CASE
WHEN c.customer_id IS NULL THEN 'ORPHAN_ORDER'
ELSE 'CONSISTENT'
END as consistency_status
FROM orders o
LEFT JOIN customers c ON o.customer_id = c.customer_id
WHERE c.customer_id IS NULL;
-- Create consistency validation
CREATE OR REPLACE PROCEDURE check_cross_table_consistency()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
orphan_count INTEGER;
BEGIN
SELECT COUNT(*) INTO orphan_count
FROM orders o
LEFT JOIN customers c ON o.customer_id = c.customer_id
WHERE c.customer_id IS NULL;
IF orphan_count > 0 THEN
RETURN 'ERROR: ' || orphan_count || ' orphan orders found';
END IF;
RETURN 'SUCCESS: Cross-table consistency maintained';
END;
$$;
Data Profiling
-- Comprehensive data profiling
CREATE OR REPLACE PROCEDURE profile_table(table_name STRING)
RETURNS VARIANT
LANGUAGE SQL
AS
$$
DECLARE
result VARIANT;
BEGIN
EXECUTE IMMEDIATE
'SELECT
COUNT(*) as total_rows,
COUNT(DISTINCT *) as unique_values,
MIN(created_at) as min_date,
MAX(created_at) as max_date,
AVG(amount) as avg_amount,
STDDEV(amount) as stddev_amount
FROM ' || table_name
INTO result;
RETURN result;
END;
$$;
-- Profile specific columns
SELECT
COUNT(*) as total_rows,
COUNT(DISTINCT customer_id) as unique_customers,
MIN(order_date) as min_date,
MAX(order_date) as max_date,
AVG(amount) as avg_order_amount,
STDDEV(amount) as stddev_amount,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_amount
FROM orders;
Data Quality Rules Engine
Rule Definitions
-- Create quality rules table
CREATE OR REPLACE TABLE data_quality_rules (
rule_id INTEGER PRIMARY KEY,
table_name STRING,
column_name STRING,
rule_type STRING,
rule_expression STRING,
threshold FLOAT,
enabled BOOLEAN DEFAULT TRUE
);
-- Insert quality rules
INSERT INTO data_quality_rules VALUES
(1, 'CUSTOMERS', 'EMAIL', 'NOT_NULL', 'email IS NOT NULL', 0.95, TRUE),
(2, 'CUSTOMERS', 'EMAIL', 'FORMAT', 'REGEXP_LIKE(email, ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'')', 0.98, TRUE),
(3, 'ORDERS', 'AMOUNT', 'RANGE', 'amount > 0 AND amount < 1000000', 0.99, TRUE),
(4, 'ORDERS', 'ORDER_DATE', 'NOT_FUTURE', 'order_date <= CURRENT_DATE()', 1.0, TRUE);
Rule Execution
-- Execute quality rules
CREATE OR REPLACE PROCEDURE execute_quality_rules()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
rule_record RECORD;
total_rows INTEGER;
passing_rows INTEGER;
result STRING;
BEGIN
FOR rule_record IN
SELECT * FROM data_quality_rules WHERE enabled = TRUE
DO
-- Count total rows
EXECUTE IMMEDIATE
'SELECT COUNT(*) FROM ' || rule_record.table_name
INTO total_rows;
-- Count passing rows
EXECUTE IMMEDIATE
'SELECT COUNT(*) FROM ' || rule_record.table_name
|| ' WHERE ' || rule_record.rule_expression
INTO passing_rows;
-- Check threshold
IF (passing_rows::FLOAT / total_rows) < rule_record.threshold THEN
result := result || 'FAIL: Rule ' || rule_record.rule_id || ' - ' ||
ROUND((passing_rows::FLOAT / total_rows) * 100, 2) || '% passing\n';
END IF;
END FOR;
RETURN COALESCE(result, 'SUCCESS: All rules passing');
END;
$$;
-- Schedule rule execution
CREATE TASK daily_quality_check
SCHEDULE = 'USING CRON 0 2 * * * America/New_York'
AS
CALL execute_quality_rules();
Data quality rules should be defined collaboratively with business stakeholders to ensure they capture meaningful business requirements. Technical rules alone may not capture all quality concerns.
Quality Monitoring Dashboard
-- Create quality metrics view
CREATE OR REPLACE VIEW data_quality_dashboard AS
SELECT
table_name,
COUNT(*) as total_rules,
SUM(CASE WHEN last_execution_result = 'PASS' THEN 1 ELSE 0 END) as passing_rules,
ROUND(SUM(CASE WHEN last_execution_result = 'PASS' THEN 1 ELSE 0 END) / COUNT(*) * 100, 2) as quality_score
FROM data_quality_rules r
LEFT JOIN (
SELECT rule_id, 'PASS' as last_execution_result
FROM quality_execution_history
WHERE execution_date = (SELECT MAX(execution_date) FROM quality_execution_history)
) h ON r.rule_id = h.rule_id
GROUP BY table_name;
-- Monitor quality trends
SELECT
execution_date,
table_name,
quality_score,
LAG(quality_score) OVER (PARTITION BY table_name ORDER BY execution_date) as prev_score,
quality_score - LAG(quality_score) OVER (PARTITION BY table_name ORDER BY execution_date) as score_change
FROM quality_metrics_history
WHERE execution_date >= DATEADD('day', -30, CURRENT_DATE());
Data Remediation
-- Auto-fix common issues
CREATE OR REPLACE PROCEDURE remediate_data_issues()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
fixed_count INTEGER;
BEGIN
-- Fix invalid emails
UPDATE customers
SET email = LOWER(TRIM(email))
WHERE email != LOWER(TRIM(email));
GET DIAGNOSTICS fixed_count = ROW_COUNT;
-- Fix future dates
UPDATE orders
SET order_date = CURRENT_DATE()
WHERE order_date > CURRENT_DATE();
GET DIAGNOSTICS fixed_count = fixed_count + ROW_COUNT;
RETURN 'Fixed ' || fixed_count || ' records';
END;
$$;
-- Create data quality exception table
CREATE TABLE data_quality_exceptions (
exception_id INTEGER PRIMARY KEY,
table_name STRING,
column_name STRING,
rule_id INTEGER,
exception_value STRING,
detected_at TIMESTAMP_NTZ,
resolved_at TIMESTAMP_NTZ,
resolution_notes STRING
);
Key Takeaways:
- Data quality frameworks ensure accuracy, completeness, and consistency
- Data profiling provides comprehensive dataset analysis
- Quality rules engine automates validation processes
- Monitoring dashboards track quality trends over time
- Remediation procedures automatically fix common issues
- Exception tracking enables systematic issue resolution