Snowflake Federated Queries
Federated queries in Snowflake enable querying data from external databases and data sources without copying or moving the data.
External Database Connections
PostgreSQL Connection
-- External function that hands its VARCHAR argument to the remote endpoint
-- below and returns the service's reply as a VARIANT.
-- NOTE(review): presumably the endpoint proxies the request to PostgreSQL;
-- confirm against the service implementation.
CREATE OR REPLACE EXTERNAL FUNCTION get_external_data(param VARCHAR)
    RETURNS VARIANT
    API_INTEGRATION = postgresql_api_integration
    MAX_BATCH_ROWS = 1000   -- upper bound on rows sent per request batch
    AS 'https://my-api.example.com/get-data';
-- Query external PostgreSQL.
-- External functions are scalar functions, so they are called in the select
-- list; they cannot be used as a row source inside TABLE(...).
-- The reply arrives as one VARIANT value per input row.
-- NOTE(review): if the service returns an array of rows, expand it with
-- LATERAL FLATTEN on the result -- confirm the response shape first.
SELECT
    get_external_data('SELECT * FROM customers WHERE active = true') AS external_rows;
Using Foreign Data Wrappers
-- Create database connection
-- NOTE(review): Snowflake's documented CREATE CONNECTION statement creates a
-- client-redirect connection object and does not list HOST / PORT / DATABASE /
-- USER / PASSWORD / SSL parameters. This statement looks illustrative rather
-- than runnable -- confirm against the CREATE CONNECTION reference.
-- NOTE(review): the password below is a plaintext literal; prefer a secrets
-- mechanism over embedding credentials in SQL text.
CREATE OR REPLACE CONNECTION postgres_connection
HOST = 'my-postgres.example.com'
PORT = 5432
DATABASE = 'production'
USER = 'readonly_user'
PASSWORD = 'secure_password'
SSL = TRUE;
-- External function that sends a query string to the remote endpoint below
-- and returns whatever the service answers as a VARIANT.
CREATE OR REPLACE EXTERNAL FUNCTION postgres_query(query_text VARCHAR)
    RETURNS VARIANT
    API_INTEGRATION = postgres_api_integration
    AS 'https://my-api.example.com/postgres-query';
-- Query external PostgreSQL.
-- External functions are scalar functions: invoke them in the select list
-- rather than wrapping them in TABLE(...), which only accepts table functions.
-- The doubled single quotes embed a quoted date literal in the remote query.
SELECT
    postgres_query('SELECT * FROM customers WHERE created_at > ''2024-01-01''') AS external_rows;
External Tables from Cloud Storage
AWS S3
-- External table over the Parquet files reachable through @s3_customer_stage.
-- No columns are declared in this definition.
CREATE OR REPLACE EXTERNAL TABLE s3_customer_data
    WITH LOCATION = @s3_customer_stage
    FILE_FORMAT = (TYPE = PARQUET)
    AUTO_REFRESH = TRUE;
-- Query external data.
-- s3_customer_data is created without a column list, so each file row is
-- exposed through the VARIANT column VALUE. A bare created_date identifier
-- does not resolve; extract the field from VALUE and cast it before comparing.
-- NOTE(review): assumes the Parquet files carry a lower-case created_date
-- field -- confirm against the file schema.
SELECT *
FROM s3_customer_data
WHERE value:created_date::DATE >= '2024-01-01'::DATE;
-- Typed view over the S3 external table.
-- An external table declared without columns exposes its row data in the
-- VARIANT column named VALUE (there is no column called raw), so every field
-- is extracted from VALUE and cast to its target type.
CREATE OR REPLACE VIEW v_s3_customers AS
SELECT
    value:customer_id::INTEGER        AS customer_id,
    value:name::STRING                AS customer_name,
    value:email::STRING               AS email,
    value:created_at::TIMESTAMP_NTZ   AS created_at
FROM s3_customer_data;
Azure Blob Storage
-- External stage for the data-container container of the myaccount
-- Azure storage account, authenticated via azure_storage_integration.
CREATE OR REPLACE STAGE azure_data_stage
    URL = 'azure://myaccount.blob.core.windows.net/data-container'
    STORAGE_INTEGRATION = azure_storage_integration;
-- Create external table over the JSON files in @azure_data_stage.
-- For Azure stages, automatic metadata refresh is driven by Event Grid
-- notifications, which requires naming a notification integration here;
-- AUTO_REFRESH = TRUE alone does not wire up any notifications.
-- NOTE(review): AZURE_EVENTS_NOTIFICATION_INT is a placeholder for an
-- integration created separately with CREATE NOTIFICATION INTEGRATION.
CREATE OR REPLACE EXTERNAL TABLE azure_events
    INTEGRATION = 'AZURE_EVENTS_NOTIFICATION_INT'
    WITH LOCATION = @azure_data_stage
    FILE_FORMAT = (TYPE = JSON)
    AUTO_REFRESH = TRUE;
-- Query Azure data.
-- azure_events declares no columns, so the JSON document of each row lives in
-- the VARIANT column VALUE (not raw). Fields are cast explicitly, including in
-- the filter, so the comparison is string-to-string rather than relying on an
-- implicit VARIANT conversion.
SELECT
    value:event_type::STRING          AS event_type,
    value:user_id::INTEGER            AS user_id,
    value:timestamp::TIMESTAMP_NTZ    AS event_time
FROM azure_events
WHERE value:event_type::STRING = 'purchase';
Google Cloud Storage
-- External stage for the data path of the my-bucket GCS bucket,
-- authenticated via gcs_storage_integration.
CREATE OR REPLACE STAGE gcs_data_stage
    URL = 'gcs://my-bucket/data'
    STORAGE_INTEGRATION = gcs_storage_integration;
-- Create external table over the Parquet files in @gcs_data_stage.
-- For Google Cloud Storage stages, automatic metadata refresh is driven by
-- Pub/Sub notifications, which requires naming a notification integration;
-- AUTO_REFRESH = TRUE alone does not wire up any notifications.
-- NOTE(review): GCS_ANALYTICS_NOTIFICATION_INT is a placeholder for an
-- integration created separately with CREATE NOTIFICATION INTEGRATION.
CREATE OR REPLACE EXTERNAL TABLE gcs_analytics
    INTEGRATION = 'GCS_ANALYTICS_NOTIFICATION_INT'
    WITH LOCATION = @gcs_data_stage
    FILE_FORMAT = (TYPE = PARQUET)
    AUTO_REFRESH = TRUE;
-- Query GCS data.
-- gcs_analytics is created without a column list, so event_date is not a
-- resolvable column; read it from the VARIANT column VALUE and cast it.
-- NOTE(review): assumes the Parquet files carry a lower-case event_date
-- field -- confirm against the file schema.
SELECT *
FROM gcs_analytics
WHERE value:event_date::DATE >= '2024-01-01'::DATE;
Cross-Database Queries
Querying Multiple Databases
-- Join customers, orders and products that live in three different
-- databases, addressing each table by its fully qualified name.
SELECT
    cust.customer_id,
    cust.customer_name,
    ord.order_id,
    ord.order_amount,
    prod.product_name
FROM analytics_db.production.customers AS cust
INNER JOIN sales_db.public.orders AS ord
    ON cust.customer_id = ord.customer_id
INNER JOIN inventory_db.public.products AS prod
    ON ord.product_id = prod.product_id;
-- Stack customers and leads from two databases into one result set.
-- source_type tags where each row came from; UNION ALL keeps every row
-- (no de-duplication between the two sources).
SELECT
    'CUSTOMER'          AS source_type,
    cust.customer_id,
    cust.customer_name
FROM customer_db.public.customers AS cust

UNION ALL

SELECT
    'LEAD'              AS source_type,
    ld.lead_id          AS customer_id,
    ld.company_name     AS customer_name
FROM marketing_db.public.leads AS ld;
Creating Cross-Database Views
-- Unified customer view spanning the customer, sales and marketing databases.
-- Both joins are LEFT JOINs, so every customer appears once per matching
-- lead_scores row even when there are no orders or no lead score; in that
-- case the corresponding columns are NULL.
CREATE OR REPLACE VIEW unified_customer_data AS
WITH order_totals AS (
    -- one row per customer: order count and summed amount
    SELECT
        customer_id,
        COUNT(*)    AS total_orders,
        SUM(amount) AS total_spend
    FROM sales_db.public.orders
    GROUP BY customer_id
)
SELECT
    cust.customer_id,
    cust.customer_name,
    cust.email,
    ot.total_orders,
    ot.total_spend,
    ls.lead_score,
    ls.last_activity_date
FROM customer_db.public.customers AS cust
LEFT JOIN order_totals AS ot
    ON cust.customer_id = ot.customer_id
LEFT JOIN marketing_db.public.lead_scores AS ls
    ON cust.customer_id = ls.customer_id;
-- Customers whose summed order amount exceeds 1000.
-- Rows with a NULL total_spend do not satisfy the comparison and are excluded.
SELECT ucd.*
FROM unified_customer_data AS ucd
WHERE ucd.total_spend > 1000;
Cross-database queries may have performance implications as data is accessed from multiple databases. Consider materializing frequently used cross-database queries for better performance.
Data Virtualization Patterns
Unified Data Access Layer
-- Virtualization layer: one view presenting CRM, ERP and marketing records
-- under a common column set (source_system, customer_id, customer_name,
-- email, created_at). UNION ALL keeps all rows from every source.
CREATE OR REPLACE VIEW virtualized_data AS

-- CRM rows: columns already carry the common names
SELECT
    'CRM'               AS source_system,
    customer_id,
    customer_name,
    email,
    created_at
FROM crm_db.public.customers

UNION ALL

-- ERP rows: company_name / contact_email / created_date are renamed
SELECT
    'ERP'               AS source_system,
    customer_id,
    company_name        AS customer_name,
    contact_email       AS email,
    created_date        AS created_at
FROM erp_db.public.customers

UNION ALL

-- Marketing rows: lead_id stands in for customer_id
SELECT
    'MARKETING'         AS source_system,
    lead_id             AS customer_id,
    company_name        AS customer_name,
    email,
    created_at
FROM marketing_db.public.leads;
-- Row count contributed by each source system.
SELECT
    vd.source_system,
    COUNT(*) AS record_count
FROM virtualized_data AS vd
GROUP BY vd.source_system;
Data Federation with Caching
-- Create cached external table over the Parquet files in @external_stage.
-- REFRESH_ON_CREATE = TRUE refreshes the table's file metadata once at
-- creation time; AUTO_REFRESH = TRUE asks for it to be kept current as files
-- change. (REFRESH_AUTO_CREATE is not a CREATE EXTERNAL TABLE parameter.)
CREATE OR REPLACE EXTERNAL TABLE cached_external_data
    WITH LOCATION = @external_stage
    REFRESH_ON_CREATE = TRUE
    AUTO_REFRESH = TRUE
    FILE_FORMAT = (TYPE = PARQUET);
-- Materialized view that pre-aggregates the external table per customer.
-- cached_external_data declares no columns, so customer_id and customer_name
-- must be extracted from the VARIANT column VALUE; the bare identifiers do
-- not resolve. access_count is the number of rows per customer.
-- NOTE(review): assumes lower-case customer_id / customer_name fields in the
-- Parquet files -- confirm against the file schema.
CREATE OR REPLACE MATERIALIZED VIEW mv_cached_data AS
SELECT
    value:customer_id::INTEGER    AS customer_id,
    value:customer_name::STRING   AS customer_name,
    COUNT(*)                      AS access_count
FROM cached_external_data
GROUP BY
    value:customer_id::INTEGER,
    value:customer_name::STRING;
-- Refresh the cached data.
-- Snowflake maintains materialized views automatically and ALTER MATERIALIZED
-- VIEW has no REFRESH action. The manual step is re-synchronising the
-- external table's file metadata, which mv_cached_data is built on.
ALTER EXTERNAL TABLE cached_external_data REFRESH;
External Functions
Creating External Functions
-- Create API integration for the proxy service behind my-api.example.com.
-- CREATE API INTEGRATION identifies the proxy type with API_PROVIDER (there
-- is no TYPE = EXTERNAL_API option) and whitelists endpoints with
-- API_ALLOWED_PREFIXES; external functions may only target URLs under it.
-- NOTE(review): aws_api_gateway is chosen because an AWS role ARN is
-- supplied; use the private/GovCloud provider variant if that applies.
CREATE OR REPLACE API INTEGRATION external_api_integration
    API_PROVIDER = aws_api_gateway
    API_AWS_ROLE_ARN = 'arn:aws:iam::123456789012:role/my-api-role'
    API_ALLOWED_PREFIXES = ('https://my-api.example.com/')
    ENABLED = TRUE;
-- Two-argument external function routed through external_api_integration;
-- the remote reply is returned as a VARIANT.
CREATE OR REPLACE EXTERNAL FUNCTION call_external_api(param1 VARCHAR, param2 VARCHAR)
    RETURNS VARIANT
    API_INTEGRATION = external_api_integration
    MAX_BATCH_ROWS = 100    -- upper bound on rows sent per request batch
    AS 'https://my-api.example.com/api-function';
-- Call the external function for a single customer (id 123), passing the
-- literal 'enrich' as the second argument.
SELECT
    c.customer_id,
    call_external_api(c.customer_id, 'enrich') AS enriched_data
FROM customers AS c
WHERE c.customer_id = 123;
External Function Best Practices
-- External function taking one ARRAY argument per row and returning the
-- service's reply as a VARIANT; up to 1000 rows are sent per request batch.
CREATE OR REPLACE EXTERNAL FUNCTION batch_process(data_array ARRAY)
    RETURNS VARIANT
    API_INTEGRATION = external_api_integration
    MAX_BATCH_ROWS = 1000
    AS 'https://my-api.example.com/batch-process';
-- Run batch_process for every customer, packing the id, email and phone of
-- each row into a single array argument.
SELECT
    c.customer_id,
    batch_process(ARRAY_CONSTRUCT(c.customer_id, c.email, c.phone)) AS processed_data
FROM customers AS c;
Performance Considerations
-- Monitor federated query performance over the last hour, slowest first.
-- Fixes relative to the table function's documented interface:
--   * the time-window argument is END_TIME_RANGE_START (there is no START_TIME);
--   * the output columns are EXECUTION_TIME (milliseconds) and
--     EXTERNAL_FUNCTION_TOTAL_INVOCATIONS; they are aliased to the names this
--     query previously exposed so downstream consumers are unaffected.
-- ILIKE is used because query text is stored as typed, so a case-sensitive
-- LIKE would miss statements written in lower case.
-- NOTE(review): the function returns a limited number of rows by default;
-- pass RESULT_LIMIT if a larger window is needed.
SELECT
    query_id,
    query_text,
    execution_time                      AS execution_time_ms,
    bytes_scanned,
    external_function_total_invocations AS external_calls_count
FROM TABLE(INFORMATION_SCHEMA.QUERY_HISTORY(
    END_TIME_RANGE_START => DATEADD('hour', -1, CURRENT_TIMESTAMP())
))
WHERE query_text ILIKE '%EXTERNAL%'
   OR query_text ILIKE '%S3%'
   OR query_text ILIKE '%AZURE%'
ORDER BY execution_time_ms DESC;
-- Optimize external table queries: filter early and cap the result size.
-- s3_customer_data declares no columns, so the date is read from the VARIANT
-- column VALUE and cast before comparing.
-- NOTE(review): this is a row filter, not partition pruning. Pruning needs a
-- partition column declared on the external table (typically derived from
-- METADATA$FILENAME) -- add one if the file layout encodes the date.
SELECT *
FROM s3_customer_data
WHERE value:created_date::DATE = '2024-01-15'::DATE  -- filter on the extracted date
LIMIT 1000;                                          -- limit results
Federated Query Best Practices
| Practice | Implementation | Benefit |
|---|---|---|
| Filter Early | Use WHERE clauses | Reduce data transfer |
| Limit Results | Use LIMIT clauses | Improve response time |
| Cache Results | Create materialized views | Reduce external calls |
| Monitor Performance | Track execution times | Optimize queries |
| Handle Errors | Implement retry logic | Ensure reliability |
Key Takeaways:
- Federated queries enable querying external data without copying
- External tables provide SQL access to cloud storage data
- Cross-database queries unify data across Snowflake databases
- External functions enable API integration from SQL
- Filtering early and caching results are the main levers for federated query performance
- Error handling ensures reliable external data access