CW

Snowflake Data Quality Frameworks

Free Lesson

Advertisement

Snowflake Data Quality Frameworks

Data quality frameworks in Snowflake ensure data accuracy, completeness, consistency, and timeliness through systematic validation and monitoring.

Data Quality Dimensions

Completeness

-- Check for null values
SELECT
  'email' as column_name,
  COUNT(*) as total_rows,
  COUNT(email) as non_null_rows,
  ROUND(COUNT(email) / COUNT(*) * 100, 2) as completeness_pct
FROM customers

UNION ALL

SELECT
  'phone' as column_name,
  COUNT(*) as total_rows,
  COUNT(phone) as non_null_rows,
  ROUND(COUNT(phone) / COUNT(*) * 100, 2) as completeness_pct
FROM customers;

-- Create completeness validation
CREATE OR REPLACE PROCEDURE check_completeness(
  table_name STRING,
  column_name STRING,
  threshold FLOAT
)
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
  completeness FLOAT;
BEGIN
  EXECUTE IMMEDIATE
    'SELECT COUNT(' || column_name || ') / COUNT(*) FROM ' || table_name
    INTO completeness;

  IF completeness >= threshold THEN
    RETURN 'PASS: ' || ROUND(completeness * 100, 2) || '% completeness';
  ELSE
    RETURN 'FAIL: ' || ROUND(completeness * 100, 2) || '% completeness (threshold: ' || ROUND(threshold * 100, 2) || '%)';
  END IF;
END;
$$;

-- Execute completeness check
CALL check_completeness('CUSTOMERS', 'EMAIL', 0.95);

Accuracy

-- Validate email format
CREATE OR REPLACE MASKING POLICY email_validation AS (val STRING)
RETURNS STRING ->
  CASE
    WHEN REGEXP_LIKE(val, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN val
    ELSE 'INVALID_EMAIL'
  END;

-- Check for valid dates
SELECT
  order_id,
  order_date,
  CASE
    WHEN order_date > CURRENT_DATE() THEN 'FUTURE_DATE'
    WHEN order_date < '2000-01-01' THEN 'INVALID_DATE'
    ELSE 'VALID'
  END as date_validation
FROM orders;

-- Create accuracy validation
CREATE OR REPLACE PROCEDURE validate_data_accuracy()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
  invalid_count INTEGER;
BEGIN
  -- Check invalid emails
  SELECT COUNT(*) INTO invalid_count
  FROM customers
  WHERE NOT REGEXP_LIKE(email, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$');

  IF invalid_count > 0 THEN
    RETURN 'ERROR: ' || invalid_count || ' invalid emails found';
  END IF;

  -- Check invalid dates
  SELECT COUNT(*) INTO invalid_count
  FROM orders
  WHERE order_date > CURRENT_DATE() OR order_date < '2000-01-01';

  IF invalid_count > 0 THEN
    RETURN 'ERROR: ' || invalid_count || ' invalid dates found';
  END IF;

  RETURN 'SUCCESS: All data accurate';
END;
$$;

Consistency

-- Check cross-table consistency
SELECT
  o.customer_id,
  COUNT(o.order_id) as order_count,
  CASE
    WHEN c.customer_id IS NULL THEN 'ORPHAN_ORDER'
    ELSE 'CONSISTENT'
  END as consistency_status
FROM orders o
LEFT JOIN customers c ON o.customer_id = c.customer_id
WHERE c.customer_id IS NULL;

-- Create consistency validation
CREATE OR REPLACE PROCEDURE check_cross_table_consistency()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
  orphan_count INTEGER;
BEGIN
  SELECT COUNT(*) INTO orphan_count
  FROM orders o
  LEFT JOIN customers c ON o.customer_id = c.customer_id
  WHERE c.customer_id IS NULL;

  IF orphan_count > 0 THEN
    RETURN 'ERROR: ' || orphan_count || ' orphan orders found';
  END IF;

  RETURN 'SUCCESS: Cross-table consistency maintained';
END;
$$;

Data Profiling

-- Comprehensive data profiling
CREATE OR REPLACE PROCEDURE profile_table(table_name STRING)
RETURNS VARIANT
LANGUAGE SQL
AS
$$
DECLARE
  result VARIANT;
BEGIN
  EXECUTE IMMEDIATE
    'SELECT
      COUNT(*) as total_rows,
      COUNT(DISTINCT *) as unique_values,
      MIN(created_at) as min_date,
      MAX(created_at) as max_date,
      AVG(amount) as avg_amount,
      STDDEV(amount) as stddev_amount
    FROM ' || table_name
    INTO result;

  RETURN result;
END;
$$;

-- Profile specific columns
SELECT
  COUNT(*) as total_rows,
  COUNT(DISTINCT customer_id) as unique_customers,
  MIN(order_date) as min_date,
  MAX(order_date) as max_date,
  AVG(amount) as avg_order_amount,
  STDDEV(amount) as stddev_amount,
  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY amount) as median_amount
FROM orders;

Data Quality Rules Engine

Rule Definitions

-- Create quality rules table
CREATE OR REPLACE TABLE data_quality_rules (
  rule_id INTEGER PRIMARY KEY,
  table_name STRING,
  column_name STRING,
  rule_type STRING,
  rule_expression STRING,
  threshold FLOAT,
  enabled BOOLEAN DEFAULT TRUE
);

-- Insert quality rules
INSERT INTO data_quality_rules VALUES
  (1, 'CUSTOMERS', 'EMAIL', 'NOT_NULL', 'email IS NOT NULL', 0.95, TRUE),
  (2, 'CUSTOMERS', 'EMAIL', 'FORMAT', 'REGEXP_LIKE(email, ''^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'')', 0.98, TRUE),
  (3, 'ORDERS', 'AMOUNT', 'RANGE', 'amount > 0 AND amount < 1000000', 0.99, TRUE),
  (4, 'ORDERS', 'ORDER_DATE', 'NOT_FUTURE', 'order_date <= CURRENT_DATE()', 1.0, TRUE);

Rule Execution

-- Execute quality rules
CREATE OR REPLACE PROCEDURE execute_quality_rules()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
  rule_record RECORD;
  total_rows INTEGER;
  passing_rows INTEGER;
  result STRING;
BEGIN
  FOR rule_record IN
    SELECT * FROM data_quality_rules WHERE enabled = TRUE
  DO
    -- Count total rows
    EXECUTE IMMEDIATE
      'SELECT COUNT(*) FROM ' || rule_record.table_name
      INTO total_rows;

    -- Count passing rows
    EXECUTE IMMEDIATE
      'SELECT COUNT(*) FROM ' || rule_record.table_name
      || ' WHERE ' || rule_record.rule_expression
      INTO passing_rows;

    -- Check threshold
    IF (passing_rows::FLOAT / total_rows) < rule_record.threshold THEN
      result := result || 'FAIL: Rule ' || rule_record.rule_id || ' - ' ||
                ROUND((passing_rows::FLOAT / total_rows) * 100, 2) || '% passing\n';
    END IF;
  END FOR;

  RETURN COALESCE(result, 'SUCCESS: All rules passing');
END;
$$;

-- Schedule rule execution
CREATE TASK daily_quality_check
  SCHEDULE = 'USING CRON 0 2 * * * America/New_York'
AS
  CALL execute_quality_rules();

Data quality rules should be defined collaboratively with business stakeholders to ensure they capture meaningful business requirements. Technical rules alone may not capture all quality concerns.

Quality Monitoring Dashboard

-- Create quality metrics view
CREATE OR REPLACE VIEW data_quality_dashboard AS
SELECT
  table_name,
  COUNT(*) as total_rules,
  SUM(CASE WHEN last_execution_result = 'PASS' THEN 1 ELSE 0 END) as passing_rules,
  ROUND(SUM(CASE WHEN last_execution_result = 'PASS' THEN 1 ELSE 0 END) / COUNT(*) * 100, 2) as quality_score
FROM data_quality_rules r
LEFT JOIN (
  SELECT rule_id, 'PASS' as last_execution_result
  FROM quality_execution_history
  WHERE execution_date = (SELECT MAX(execution_date) FROM quality_execution_history)
) h ON r.rule_id = h.rule_id
GROUP BY table_name;

-- Monitor quality trends
SELECT
  execution_date,
  table_name,
  quality_score,
  LAG(quality_score) OVER (PARTITION BY table_name ORDER BY execution_date) as prev_score,
  quality_score - LAG(quality_score) OVER (PARTITION BY table_name ORDER BY execution_date) as score_change
FROM quality_metrics_history
WHERE execution_date >= DATEADD('day', -30, CURRENT_DATE());

Data Remediation

-- Auto-fix common issues
CREATE OR REPLACE PROCEDURE remediate_data_issues()
RETURNS STRING
LANGUAGE SQL
AS
$$
DECLARE
  fixed_count INTEGER;
BEGIN
  -- Fix invalid emails
  UPDATE customers
  SET email = LOWER(TRIM(email))
  WHERE email != LOWER(TRIM(email));

  GET DIAGNOSTICS fixed_count = ROW_COUNT;

  -- Fix future dates
  UPDATE orders
  SET order_date = CURRENT_DATE()
  WHERE order_date > CURRENT_DATE();

  GET DIAGNOSTICS fixed_count = fixed_count + ROW_COUNT;

  RETURN 'Fixed ' || fixed_count || ' records';
END;
$$;

-- Create data quality exception table
CREATE TABLE data_quality_exceptions (
  exception_id INTEGER PRIMARY KEY,
  table_name STRING,
  column_name STRING,
  rule_id INTEGER,
  exception_value STRING,
  detected_at TIMESTAMP_NTZ,
  resolved_at TIMESTAMP_NTZ,
  resolution_notes STRING
);

Key Takeaways:

  • Data quality frameworks ensure accuracy, completeness, and consistency
  • Data profiling provides comprehensive dataset analysis
  • Quality rules engine automates validation processes
  • Monitoring dashboards track quality trends over time
  • Remediation procedures automatically fix common issues
  • Exception tracking enables systematic issue resolution

Advertisement

Need Expert Snowflake Help?

Get personalized warehouse optimization, data modeling, or Snowflake platform consulting.

Advertisement