CW

Snowflake Federated Queries

Free Lesson

Advertisement

Snowflake Federated Queries

Federated queries in Snowflake enable querying data from external databases and data sources without copying or moving the data.

External Database Connections

PostgreSQL Connection

-- Create PostgreSQL external function.
-- Forwards `param` to the remote HTTPS endpoint through the
-- postgresql_api_integration API integration and returns the service's
-- response as a VARIANT. MAX_BATCH_ROWS caps the rows sent per remote call.
CREATE OR REPLACE EXTERNAL FUNCTION get_external_data(param VARCHAR)
RETURNS VARIANT
API_INTEGRATION = postgresql_api_integration
MAX_BATCH_ROWS = 1000
AS 'https://my-api.example.com/get-data';

-- Query external PostgreSQL.
-- External functions are scalar: they are called like any other scalar
-- function in the SELECT list and cannot be wrapped in TABLE(...).
-- NOTE(review): the remote service receives raw SQL text; it should restrict
-- what it is willing to execute.
SELECT
  get_external_data('SELECT * FROM customers WHERE active = true') AS external_result;

Using Foreign Data Wrappers

-- Create database connection
-- NOTE(review): Snowflake's documented CREATE CONNECTION statement defines a
-- connection object for client redirect / replication and does not list
-- HOST, PORT, DATABASE, USER, PASSWORD or SSL parameters -- confirm this
-- statement against the Snowflake SQL reference before relying on it.
-- NOTE(review): the password appears as a literal in the statement text;
-- prefer a secret store over hard-coded credentials.
CREATE OR REPLACE CONNECTION postgres_connection
  HOST = 'my-postgres.example.com'
  PORT = 5432
  DATABASE = 'production'
  USER = 'readonly_user'
  PASSWORD = 'secure_password'
  SSL = TRUE;

-- Create external function for queries.
-- Sends `query_text` to the remote HTTPS endpoint through the
-- postgres_api_integration API integration; the response comes back as a
-- VARIANT.
CREATE OR REPLACE EXTERNAL FUNCTION postgres_query(query_text VARCHAR)
RETURNS VARIANT
API_INTEGRATION = postgres_api_integration
AS 'https://my-api.example.com/postgres-query';

-- Query external PostgreSQL.
-- External functions are scalar, so the call belongs in the SELECT list;
-- wrapping it in TABLE(...) is not valid.
SELECT
  postgres_query('SELECT * FROM customers WHERE created_at > ''2024-01-01''') AS external_result;

External Tables from Cloud Storage

AWS S3

-- External table over the Parquet files in stage @s3_customer_stage.
-- No columns are declared, so each file row is exposed through the VALUE
-- variant column. AUTO_REFRESH asks Snowflake to keep the file metadata
-- in sync with the stage.
CREATE OR REPLACE EXTERNAL TABLE s3_customer_data
    LOCATION = @s3_customer_stage
    FILE_FORMAT = (TYPE = PARQUET)
    AUTO_REFRESH = TRUE;

-- Query external data.
-- s3_customer_data declares no columns, so fields are read from the VALUE
-- variant column and cast explicitly before comparison.
SELECT value
FROM s3_customer_data
WHERE value:created_date::DATE >= '2024-01-01'::DATE;

-- Create view for complex queries.
-- Projects typed columns out of the external table's VALUE variant column
-- (external tables expose file rows as VALUE; there is no `raw` column).
CREATE OR REPLACE VIEW v_s3_customers AS
SELECT
  value:customer_id::INTEGER AS customer_id,
  value:name::STRING AS customer_name,
  value:email::STRING AS email,
  value:created_at::TIMESTAMP_NTZ AS created_at
FROM s3_customer_data;

Azure Blob Storage

-- Stage pointing at the Azure Blob container; access goes through the
-- azure_storage_integration storage integration.
CREATE OR REPLACE STAGE azure_data_stage
    URL = 'azure://myaccount.blob.core.windows.net/data-container'
    STORAGE_INTEGRATION = azure_storage_integration;

-- External table over the JSON files in that stage. No columns are declared,
-- so each record is exposed through the VALUE variant column.
-- NOTE(review): Snowflake documents an INTEGRATION = '<notification_integration>'
-- parameter as required for AUTO_REFRESH on Azure stages -- confirm and add it.
CREATE OR REPLACE EXTERNAL TABLE azure_events
    LOCATION = @azure_data_stage
    FILE_FORMAT = (TYPE = JSON)
    AUTO_REFRESH = TRUE;

-- Query Azure data.
-- Fields are read from the external table's VALUE variant column; the filter
-- casts to STRING so the comparison is string-to-string rather than
-- variant-to-string.
SELECT
  value:event_type::STRING AS event_type,
  value:user_id::INTEGER AS user_id,
  value:timestamp::TIMESTAMP_NTZ AS event_time
FROM azure_events
WHERE value:event_type::STRING = 'purchase';

Google Cloud Storage

-- Stage pointing at the GCS bucket path; access goes through the
-- gcs_storage_integration storage integration.
CREATE OR REPLACE STAGE gcs_data_stage
    URL = 'gcs://my-bucket/data'
    STORAGE_INTEGRATION = gcs_storage_integration;

-- External table over the Parquet files in that stage. No columns are
-- declared, so each row is exposed through the VALUE variant column.
-- NOTE(review): Snowflake documents an INTEGRATION = '<notification_integration>'
-- parameter as required for AUTO_REFRESH on GCS stages -- confirm and add it.
CREATE OR REPLACE EXTERNAL TABLE gcs_analytics
    LOCATION = @gcs_data_stage
    FILE_FORMAT = (TYPE = PARQUET)
    AUTO_REFRESH = TRUE;

-- Query GCS data.
-- gcs_analytics declares no columns, so event_date is read from the VALUE
-- variant column and cast to DATE before the comparison.
SELECT value
FROM gcs_analytics
WHERE value:event_date::DATE >= '2024-01-01'::DATE;

Cross-Database Queries

Querying Multiple Databases

-- Join customers, their orders, and the ordered products, each living in a
-- different database; every table is addressed as database.schema.table.
SELECT
    cust.customer_id,
    cust.customer_name,
    ord.order_id,
    ord.order_amount,
    prod.product_name
FROM analytics_db.production.customers AS cust
INNER JOIN sales_db.public.orders AS ord
    ON ord.customer_id = cust.customer_id
INNER JOIN inventory_db.public.products AS prod
    ON prod.product_id = ord.product_id;

-- Stack customers and marketing leads into one result set, tagging each row
-- with the system it came from. Fully qualified names keep the origin of
-- each branch obvious.
SELECT
    'CUSTOMER' AS source_type,
    cust.customer_id,
    cust.customer_name
FROM customer_db.public.customers AS cust

UNION ALL

SELECT
    'LEAD' AS source_type,
    lead.lead_id AS customer_id,
    lead.company_name AS customer_name
FROM marketing_db.public.leads AS lead;

Creating Cross-Database Views

-- Create unified view across databases.
-- One row per customer, enriched with order totals from sales_db and lead
-- scoring from marketing_db. Both joins are LEFT JOINs, so customers without
-- orders or without a lead score are kept with NULLs in those columns.
CREATE OR REPLACE VIEW unified_customer_data AS
SELECT
  c.customer_id,
  c.customer_name,
  c.email,
  o.total_orders,
  o.total_spend,
  l.lead_score,
  l.last_activity_date
FROM customer_db.public.customers c
LEFT JOIN (
  -- Pre-aggregate orders so the join stays one row per customer.
  -- The amount column is order_amount, matching how sales_db.public.orders
  -- is referenced by the cross-database join example.
  SELECT
    customer_id,
    COUNT(*) AS total_orders,
    SUM(order_amount) AS total_spend
  FROM sales_db.public.orders
  GROUP BY customer_id
) o ON c.customer_id = o.customer_id
LEFT JOIN marketing_db.public.lead_scores l ON c.customer_id = l.customer_id;

-- High-value customers from the unified view; the columns are listed
-- explicitly in the order the view defines them.
SELECT
    customer_id,
    customer_name,
    email,
    total_orders,
    total_spend,
    lead_score,
    last_activity_date
FROM unified_customer_data
WHERE total_spend > 1000;

Cross-database queries may have performance implications as data is accessed from multiple databases. Consider materializing frequently used cross-database queries for better performance.

Data Virtualization Patterns

Unified Data Access Layer

-- Virtualization layer: one view that presents CRM, ERP and marketing
-- records under a common column set (source_system, customer_id,
-- customer_name, email, created_at). Each branch renames its own columns
-- to that shape; UNION ALL keeps every row from every source.
CREATE OR REPLACE VIEW virtualized_data AS
-- Branch 1: CRM customers (already in the target shape)
SELECT
    'CRM' AS source_system,
    crm.customer_id,
    crm.customer_name,
    crm.email,
    crm.created_at
FROM crm_db.public.customers AS crm

UNION ALL

-- Branch 2: ERP customers (company/contact columns renamed)
SELECT
    'ERP' AS source_system,
    erp.customer_id,
    erp.company_name AS customer_name,
    erp.contact_email AS email,
    erp.created_date AS created_at
FROM erp_db.public.customers AS erp

UNION ALL

-- Branch 3: marketing leads (lead_id stands in for customer_id)
SELECT
    'MARKETING' AS source_system,
    mkt.lead_id AS customer_id,
    mkt.company_name AS customer_name,
    mkt.email,
    mkt.created_at
FROM marketing_db.public.leads AS mkt;

-- Row count contributed by each source system in the virtualized view.
SELECT
    v.source_system,
    COUNT(*) AS record_count
FROM virtualized_data AS v
GROUP BY v.source_system;

Data Federation with Caching

-- Create cached external table.
-- External table over the Parquet files in @external_stage. No columns are
-- declared, so rows are exposed through the VALUE variant column.
-- REFRESH_ON_CREATE refreshes the file metadata once at creation time and
-- AUTO_REFRESH keeps it in sync afterwards (REFRESH_AUTO_CREATE is not a
-- documented parameter).
CREATE OR REPLACE EXTERNAL TABLE cached_external_data
  WITH LOCATION = @external_stage
  REFRESH_ON_CREATE = TRUE
  AUTO_REFRESH = TRUE
  FILE_FORMAT = (TYPE = PARQUET);

-- Create materialized view for frequent queries.
-- Pre-aggregates a row count per customer over the external table so
-- repeated reads do not rescan the external files.
-- cached_external_data declares no columns, so the grouping keys are read
-- from its VALUE variant column.
-- NOTE(review): the INTEGER / STRING casts assume the file schema -- adjust
-- to the actual field types.
CREATE OR REPLACE MATERIALIZED VIEW mv_cached_data AS
SELECT
  value:customer_id::INTEGER AS customer_id,
  value:customer_name::STRING AS customer_name,
  COUNT(*) AS access_count
FROM cached_external_data
GROUP BY value:customer_id::INTEGER, value:customer_name::STRING;

-- Refresh the data behind the materialized view.
-- Snowflake maintains materialized views automatically; there is no
-- ALTER MATERIALIZED VIEW ... REFRESH command. What can be refreshed
-- manually is the external table's file metadata, which the materialized
-- view then picks up.
ALTER EXTERNAL TABLE cached_external_data REFRESH;

External Functions

Creating External Functions

-- Create API integration.
-- API_PROVIDER selects the proxy service type (here Amazon API Gateway),
-- API_AWS_ROLE_ARN is the IAM role Snowflake assumes to call it, and
-- API_ALLOWED_PREFIXES restricts which endpoint URLs external functions
-- using this integration may target.
CREATE OR REPLACE API INTEGRATION external_api_integration
  API_PROVIDER = aws_api_gateway
  API_AWS_ROLE_ARN = 'arn:aws:iam::123456789012:role/my-api-role'
  API_ALLOWED_PREFIXES = ('https://my-api.example.com/')
  ENABLED = TRUE;

-- External function taking two VARCHAR arguments; calls are routed through
-- external_api_integration to the remote endpoint, at most 100 rows per
-- batch, and the response is returned as a VARIANT.
CREATE OR REPLACE EXTERNAL FUNCTION call_external_api(param1 VARCHAR, param2 VARCHAR)
    RETURNS VARIANT
    API_INTEGRATION = external_api_integration
    MAX_BATCH_ROWS = 100
    AS 'https://my-api.example.com/api-function';

-- Enrich a single customer by calling the external function once for the
-- matching row.
SELECT
    c.customer_id,
    call_external_api(c.customer_id, 'enrich') AS enriched_data
FROM customers AS c
WHERE c.customer_id = 123;

External Function Best Practices

-- External function that accepts an ARRAY per row; the larger
-- MAX_BATCH_ROWS (1000) lets more rows travel in each remote call.
CREATE OR REPLACE EXTERNAL FUNCTION batch_process(data_array ARRAY)
    RETURNS VARIANT
    API_INTEGRATION = external_api_integration
    MAX_BATCH_ROWS = 1000
    AS 'https://my-api.example.com/batch-process';

-- Pack the per-customer fields into one array argument and process every
-- customer row.
SELECT
    c.customer_id,
    batch_process(ARRAY_CONSTRUCT(c.customer_id, c.email, c.phone)) AS processed_data
FROM customers AS c;

Performance Considerations

-- Monitor federated query performance.
-- Lists queries that finished in the last hour and mention external objects,
-- slowest first.
--   * QUERY_HISTORY filters by END_TIME_RANGE_START / END_TIME_RANGE_END;
--     it has no START_TIME argument.
--   * EXECUTION_TIME is reported in milliseconds and
--     EXTERNAL_FUNCTION_TOTAL_INVOCATIONS counts remote calls; both are
--     aliased to keep the result's column names readable.
--   * ILIKE makes the text match case-insensitive, so lower-case object
--     names such as s3_customer_data are found too.
SELECT
  query_id,
  query_text,
  execution_time AS execution_time_ms,
  bytes_scanned,
  external_function_total_invocations AS external_calls_count
FROM TABLE(INFORMATION_SCHEMA.QUERY_HISTORY(
  END_TIME_RANGE_START => DATEADD('hour', -1, CURRENT_TIMESTAMP())
))
WHERE query_text ILIKE '%EXTERNAL%'
   OR query_text ILIKE '%S3%'
   OR query_text ILIKE '%AZURE%'
ORDER BY execution_time DESC;

-- Optimize external table queries.
-- s3_customer_data declares no columns, so the date is read from the VALUE
-- variant column. Partition pruning only happens when the filter column is
-- declared as a partition column (PARTITION BY) on the external table; here
-- the filter still cuts the rows returned but not the files scanned.
SELECT value
FROM s3_customer_data
WHERE value:created_date::DATE = '2024-01-15'::DATE  -- Filter early
LIMIT 1000;  -- Limit results

Federated Query Best Practices

| Practice | Implementation | Benefit |
|---|---|---|
| Filter Early | Use WHERE clauses | Reduce data transfer |
| Limit Results | Use LIMIT clauses | Improve response time |
| Cache Results | Create materialized views | Reduce external calls |
| Monitor Performance | Track execution times | Optimize queries |
| Handle Errors | Implement retry logic | Ensure reliability |

Key Takeaways:

  • Federated queries enable querying external data without copying
  • External tables provide SQL access to cloud storage data
  • Cross-database queries unify data across Snowflake databases
  • External functions enable API integration from SQL
  • Filtering early and caching results improve federated query performance
  • Error handling ensures reliable external data access

Advertisement

Need Expert Snowflake Help?

Get personalized warehouse optimization, data modeling, or Snowflake platform consulting.

Advertisement