Metadata Queries: Information Schema, System Catalogs, Lineage

Interview Question: "How do you find all tables with a specific column name? Explain the difference between Information Schema and system catalogs. How would you track data lineage?" — Asked at Oracle, IBM, SAP for Data Governance roles

ℹ️

Difficulty: Advanced | Companies: Oracle, IBM, SAP, Informatica, Collibra | Time: 45-60 minutes

Information Schema

Standard SQL metadata interface:

\text{Information Schema} = \{ \text{TABLES}, \text{COLUMNS}, \text{INDEXES}, ... \}

-- List all tables in current schema
SELECT 
    table_schema,
    table_name,
    table_type,
    pg_size_pretty(pg_total_relation_size(table_schema || '.' || table_name)) AS size
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY pg_total_relation_size(table_schema || '.' || table_name) DESC;

-- Find all columns with specific type
SELECT 
    table_name,
    column_name,
    data_type,
    character_maximum_length,
    is_nullable,
    column_default
FROM information_schema.columns
WHERE table_schema = 'public'
    AND data_type IN ('varchar', 'text')
ORDER BY table_name, ordinal_position;

-- Find all foreign keys
SELECT 
    tc.table_name AS source_table,
    kcu.column_name AS source_column,
    ccu.table_name AS target_table,
    ccu.column_name AS target_column,
    tc.constraint_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu 
    ON tc.constraint_name = kcu.constraint_name
JOIN information_schema.constraint_column_usage ccu 
    ON ccu.constraint_name = tc.constraint_name
WHERE tc.constraint_type = 'FOREIGN KEY'
ORDER BY tc.table_name;

PostgreSQL System Catalogs

-- List all schemas
SELECT 
    nspname AS schema_name,
    pg_get_userbyid(nspowner) AS owner,
    pg_size_pretty(pg_catalog.pg_namespace_size(oid)) AS size
FROM pg_namespace
WHERE nspname NOT LIKE 'pg_%'
    AND nspname != 'information_schema'
ORDER BY nspname;

-- List all tables with details
SELECT 
    c.relname AS table_name,
    pg_get_userbyid(c.relowner) AS owner,
    pg_size_pretty(pg_relation_size(c.oid)) AS table_size,
    pg_size_pretty(pg_total_relation_size(c.oid)) AS total_size,
    c.reltuples::bigint AS estimated_rows,
    pg_stat_get_live_tuples(c.oid) AS live_rows,
    pg_stat_get_dead_tuples(c.oid) AS dead_rows
FROM pg_class c
JOIN pg_namespace n ON c.relnamespace = n.oid
WHERE c.relkind = 'r'
    AND n.nspname = 'public'
ORDER BY pg_total_relation_size(c.oid) DESC;

-- List all indexes
SELECT 
    indexname,
    indexdef,
    pg_size_pretty(pg_relation_size(indexname::regclass)) AS index_size
FROM pg_indexes
WHERE schemaname = 'public'
ORDER BY pg_relation_size(indexname::regclass) DESC;

Column Lineage Tracking

-- Create metadata tracking tables
CREATE TABLE data_sources (
    source_id SERIAL PRIMARY KEY,
    source_name VARCHAR(100),
    source_type VARCHAR(50),
    connection_info JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE data_lineage (
    lineage_id SERIAL PRIMARY KEY,
    target_table VARCHAR(100),
    target_column VARCHAR(100),
    source_table VARCHAR(100),
    source_column VARCHAR(100),
    transformation TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    created_by VARCHAR(100)
);

-- Insert lineage metadata
INSERT INTO data_lineage (target_table, target_column, source_table, source_column, transformation)
VALUES 
('sales_summary', 'total_revenue', 'orders', 'amount', 'SUM(amount) GROUP BY date'),
('sales_summary', 'order_count', 'orders', 'order_id', 'COUNT(order_id) GROUP BY date'),
('customer_360', 'lifetime_value', 'orders', 'amount', 'SUM(amount) WHERE status = ''completed''');

-- Query lineage
SELECT 
    target_table,
    target_column,
    source_table,
    source_column,
    transformation
FROM data_lineage
WHERE target_table = 'sales_summary'
ORDER BY target_column;

-- Find all upstream dependencies
WITH RECURSIVE lineage_tree AS (
    SELECT 
        target_table,
        target_column,
        source_table,
        source_column,
        transformation,
        1 AS depth
    FROM data_lineage
    WHERE target_table = 'sales_summary'
    
    UNION ALL
    
    SELECT 
        dl.target_table,
        dl.target_column,
        dl.source_table,
        dl.source_column,
        dl.transformation,
        lt.depth + 1
    FROM data_lineage dl
    JOIN lineage_tree lt ON dl.target_table = lt.source_table
    WHERE lt.depth < 10
)
SELECT * FROM lineage_tree
ORDER BY depth, source_table;

Table Statistics

-- Comprehensive table statistics
SELECT 
    schemaname,
    relname AS table_name,
    seq_scan,
    seq_tup_read,
    idx_scan,
    idx_tup_fetch,
    n_tup_ins,
    n_tup_upd,
    n_tup_del,
    n_live_tup,
    n_dead_tup,
    ROUND(n_dead_tup::DECIMAL / NULLIF(n_live_tup, 0) * 100, 2) AS dead_ratio_pct,
    last_vacuum,
    last_autovacuum,
    last_analyze,
    last_autoanalyze
FROM pg_stat_user_tables
ORDER BY n_live_tup DESC;

-- Index usage statistics
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan AS times_used,
    idx_tup_read,
    idx_tup_fetch,
    pg_size_pretty(pg_relation_size(indexname::regclass)) AS index_size,
    ROUND(idx_scan::DECIMAL / NULLIF(seq_scan + idx_scan, 0) * 100, 2) AS index_usage_pct
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;

Query Statistics

-- Top queries by total time
SELECT 
    query,
    calls,
    ROUND(total_time::NUMERIC, 2) AS total_time_ms,
    ROUND(mean_time::NUMERIC, 2) AS avg_time_ms,
    rows,
    shared_hit AS cache_hits,
    shared_read AS disk_reads
FROM pg_stat_statements
ORDER BY total_time DESC
LIMIT 20;

-- Queries with high I/O
SELECT 
    query,
    calls,
    shared_hit,
    shared_read,
    ROUND(shared_read::DECIMAL / NULLIF(shared_hit + shared_read, 0) * 100, 2) AS io_ratio_pct
FROM pg_stat_statements
WHERE calls > 100
ORDER BY shared_read DESC
LIMIT 10;

Database Schema Comparison

-- Compare schemas between databases
CREATE OR REPLACE FUNCTION compare_schemas(
    source_db TEXT,
    target_db TEXT
) RETURNS TABLE(
    object_type TEXT,
    object_name TEXT,
    source_exists BOOLEAN,
    target_exists BOOLEAN
) AS $$
BEGIN
    RETURN QUERY
    SELECT 
        'TABLE'::TEXT AS object_type,
        t.table_name,
        TRUE AS source_exists,
        EXISTS (
            SELECT 1 FROM information_schema.tables 
            WHERE table_name = t.table_name 
                AND table_schema = 'public'
        ) AS target_exists
    FROM information_schema.tables t
    WHERE t.table_schema = 'public'
    
    UNION ALL
    
    SELECT 
        'VIEW'::TEXT,
        v.table_name,
        TRUE,
        EXISTS (
            SELECT 1 FROM information_schema.views 
            WHERE table_name = v.table_name
        )
    FROM information_schema.views v
    WHERE v.table_schema = 'public';
END;
$$ LANGUAGE plpgsql;

Data Dictionary Views

-- Custom data dictionary view
CREATE OR REPLACE VIEW data_dictionary AS
SELECT 
    t.table_name,
    t.table_type,
    c.column_name,
    c.ordinal_position,
    c.data_type,
    c.character_maximum_length,
    c.is_nullable,
    c.column_default,
    pgd.description AS column_comment,
    pt.description AS table_comment
FROM information_schema.tables t
JOIN information_schema.columns c 
    ON t.table_name = c.table_name
    AND t.table_schema = c.table_schema
LEFT JOIN pg_catalog.pg_statio_all_tables st 
    ON t.table_name = st.relname
LEFT JOIN pg_catalog.pg_description pgd 
    ON pgd.objoid = st.relid 
    AND pgd.objsubid = c.ordinal_position
LEFT JOIN pg_catalog.pg_description pt 
    ON pt.objoid = st.relid 
    AND pt.objsubid = 0
WHERE t.table_schema = 'public'
ORDER BY t.table_name, c.ordinal_position;

-- Query the data dictionary
SELECT * FROM data_dictionary
WHERE table_name = 'orders'
ORDER BY ordinal_position;

Metadata Export

-- Export schema as DDL
SELECT 
    'CREATE TABLE ' || table_name || ' (' || E'\n' ||
    STRING_AGG(
        '    ' || column_name || ' ' || 
        data_type || 
        CASE WHEN character_maximum_length IS NOT NULL 
            THEN '(' || character_maximum_length || ')' 
            ELSE '' 
        END ||
        CASE WHEN is_nullable = 'NO' THEN ' NOT NULL' ELSE '' END ||
        CASE WHEN column_default IS NOT NULL 
            THEN ' DEFAULT ' || column_default 
            ELSE '' 
        END,
        ',' || E'\n'
        ORDER BY ordinal_position
    ) || E'\n);' AS ddl
FROM information_schema.columns
WHERE table_schema = 'public'
    AND table_name = 'orders'
GROUP BY table_name;

Mathematical Formulas

Table bloat estimation:

\text{Bloat} = \frac{\text{dead\_tuples}}{\text{live\_tuples} + \text{dead\_tuples}} \times 100\%

Index usage efficiency:

\text{Efficiency} = \frac{\text{index\_scans}}{\text{index\_scans} + \text{seq\_scans}} \times 100\%

ℹ️

Pro Tip: Use pg_stat_statements extension to track query performance over time. It's essential for identifying slow queries.

Common Metadata Queries

-- Find tables with specific column name
SELECT 
    table_name,
    column_name,
    data_type
FROM information_schema.columns
WHERE column_name LIKE '%email%'
    AND table_schema = 'public';

-- Find tables without primary keys
SELECT 
    t.table_name
FROM information_schema.tables t
LEFT JOIN information_schema.table_constraints tc 
    ON t.table_name = tc.table_name
    AND tc.constraint_type = 'PRIMARY KEY'
WHERE t.table_schema = 'public'
    AND tc.constraint_name IS NULL;

-- Find unused indexes
SELECT 
    indexname,
    idx_scan,
    pg_size_pretty(pg_relation_size(indexname::regclass)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
    AND indexname NOT LIKE '%_pkey'
ORDER BY pg_relation_size(indexname::regclass) DESC;

⚠️

Security Note: Restrict access to system catalogs. Use views to provide controlled access to metadata.