CW

Graph Data Science: Networks and Graph Analytics

Module 10: Specialized MLFree Lesson

Advertisement

Graph Data Science: Networks and Graph Analytics

Graph data science analyzes relationships between entities. This lesson covers network analysis, centrality, community detection, and graph neural networks.

Graph Fundamentals

<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
  <rect width="600" height="400" fill="#f8f9fa" rx="10"/>
  <text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">Graph Data Structures</text>
  
  <!-- Directed Graph -->
  <rect x="50" y="60" width="250" height="150" fill="white" stroke="#3498db" stroke-width="2" rx="5"/>
  <text x="175" y="80" text-anchor="middle" font-size="12" font-weight="bold" fill="#3498db">Directed Graph</text>
  
  <!-- Nodes -->
  <circle cx="100" cy="120" r="20" fill="#3498db"/>
  <text x="100" y="125" text-anchor="middle" font-size="10" fill="white">A</text>
  
  <circle cx="180" cy="120" r="20" fill="#2ecc71"/>
  <text x="180" y="125" text-anchor="middle" font-size="10" fill="white">B</text>
  
  <circle cx="260" cy="120" r="20" fill="#e74c3c"/>
  <text x="260" y="125" text-anchor="middle" font-size="10" fill="white">C</text>
  
  <!-- Edges with arrows -->
  <line x1="120" y1="120" x2="158" y2="120" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="200" y1="120" x2="238" y2="120" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="100" y1="140" x2="250" y2="140" stroke="#f39c12" stroke-width="1" marker-end="url(#arrow)"/>
  
  <text x="175" y="165" text-anchor="middle" font-size="10" fill="#7f8c8d">E = {(A,B), (B,C), (A,C)}</text>
  
  <!-- Undirected Graph -->
  <rect x="320" y="60" width="250" height="150" fill="white" stroke="#2ecc71" stroke-width="2" rx="5"/>
  <text x="445" y="80" text-anchor="middle" font-size="12" font-weight="bold" fill="#2ecc71">Undirected Graph</text>
  
  <circle cx="370" cy="120" r="20" fill="#3498db"/>
  <text x="370" y="125" text-anchor="middle" font-size="10" fill="white">1</text>
  
  <circle cx="450" cy="120" r="20" fill="#2ecc71"/>
  <text x="450" y="125" text-anchor="middle" font-size="10" fill="white">2</text>
  
  <circle cx="530" cy="120" r="20" fill="#e74c3c"/>
  <text x="530" y="125" text-anchor="middle" font-size="10" fill="white">3</text>
  
  <line x1="390" y1="120" x2="430" y2="120" stroke="#7f8c8d" stroke-width="2"/>
  <line x1="470" y1="120" x2="510" y2="120" stroke="#7f8c8d" stroke-width="2"/>
  <line x1="370" y1="140" x2="520" y2="140" stroke="#f39c12" stroke-width="1"/>
  
  <text x="445" y="165" text-anchor="middle" font-size="10" fill="#7f8c8d">E = {(1,2), (2,3), (1,3)}</text>
  
  <!-- Properties -->
  <text x="300" y="240" text-anchor="middle" font-size="14" font-weight="bold" fill="#2c3e50">Graph Properties:</text>
  
  <text x="100" y="270" font-size="11" fill="#2c3e50">• Nodes (V): Entities/vertices</text>
  <text x="100" y="290" font-size="11" fill="#2c3e50">• Edges (E): Relationships</text>
  <text x="100" y="310" font-size="11" fill="#2c3e50">• Weight: Edge importance</text>
  
  <text x="400" y="270" font-size="11" fill="#2c3e50">• Degree: # of connections</text>
  <text x="400" y="290" font-size="11" fill="#2c3e50">• Path: Sequence of edges</text>
  <text x="400" y="310" font-size="11" fill="#2c3e50">• Cycle: Path to self</text>
  
  <!-- Adjacency Matrix -->
  <text x="300" y="350" text-anchor="middle" font-size="12" font-weight="bold" fill="#2c3e50">Adjacency Matrix Representation:</text>
  
  <rect x="200" y="360" width="200" height="35" fill="white" stroke="#7f8c8d" stroke-width="1" rx="3"/>
  <text x="300" y="382" text-anchor="middle" font-size="11" fill="#2c3e50">A = [[0,1,1], [1,0,1], [1,1,0]]</text>
</svg>

NetworkX Basics

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Create graph
G = nx.Graph()

# Add nodes with attributes
G.add_node(1, name="Alice", age=25, role="Engineer")
G.add_node(2, name="Bob", age=30, role="Manager")
G.add_node(3, name="Charlie", age=35, role="Director")

# Add edges with weights
G.add_edge(1, 2, weight=0.8, relationship="colleague")
G.add_edge(2, 3, weight=0.6, relationship="reports_to")
G.add_edge(1, 3, weight=0.3, relationship="indirect")

# Basic properties
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")
print(f"Is directed: {G.is_directed()}")
print(f"Is connected: {nx.is_connected(G)}")

# Visualize
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue', 
        node_size=500, font_size=10, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u,v): d['weight'] 
                                                   for u,v,d in G.edges(data=True)})
plt.title("Social Network Graph")
plt.show()

Centrality Measures

# Degree Centrality
degree_centrality = nx.degree_centrality(G)
print("Degree Centrality:", degree_centrality)

# Betweenness Centrality
betweenness = nx.betweenness_centrality(G, weight='weight')
print("Betweenness Centrality:", betweenness)

# Closeness Centrality
closeness = nx.closeness_centrality(G)
print("Closeness Centrality:", closeness)

# Eigenvector Centrality
eigenvector = nx.eigenvector_centrality(G, max_iter=1000, weight='weight')
print("Eigenvector Centrality:", eigenvector)

# PageRank
pagerank = nx.pagerank(G, alpha=0.85, weight='weight')
print("PageRank:", pagerank)

# Visualize centralities
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

centrality_measures = {
    'Degree': degree_centrality,
    'Betweenness': betweenness,
    'Closeness': closeness,
    'Eigenvector': eigenvector
}

for ax, (name, centrality) in zip(axes, centrality_measures.items()):
    node_colors = [centrality[node] for node in G.nodes()]
    nx.draw(G, pos, ax=ax, node_color=node_colors, 
            node_size=500, cmap=plt.cm.viridis, with_labels=True)
    ax.set_title(f'{name} Centrality')

plt.tight_layout()
plt.show()

Community Detection

from networkx.algorithms import community

# Louvain Community Detection
communities_louvain = community.louvain_communities(G, weight='weight', resolution=1)
print("Louvain Communities:", communities_louvain)

# Girvan-Newman (hierarchical)
communities_gn = community.girvan_newman(G)
top_level_communities = next(communities_gn)
print("Girvan-Newman Communities:", top_level_communities)

# Label Propagation
communities_lp = list(community.label_propagation_communities(G))
print("Label Propagation Communities:", communities_lp)

# Modularity score
modularity = community.modularity(G, communities_louvain)
print(f"Modularity: {modularity:.4f}")

# Visualize communities
def draw_communities(G, communities, title):
    pos = nx.spring_layout(G, seed=42)
    plt.figure(figsize=(10, 8))
    
    # Color nodes by community
    colors = plt.cm.Set3(np.linspace(0, 1, len(communities)))
    node_colors = {}
    
    for i, comm in enumerate(communities):
        for node in comm:
            node_colors[node] = colors[i]
    
    nx.draw(G, pos, node_color=[node_colors[node] for node in G.nodes()],
            node_size=500, with_labels=True, font_size=10)
    plt.title(title)
    plt.show()

draw_communities(G, communities_louvain, "Louvain Communities")

Graph Algorithms

# Shortest Path
shortest_path = nx.shortest_path(G, source=1, target=3, weight='weight')
shortest_path_length = nx.shortest_path_length(G, source=1, target=3, weight='weight')
print(f"Shortest path from 1 to 3: {shortest_path}")
print(f"Path length: {shortest_path_length:.4f}")

# All-pairs shortest paths
all_shortest = dict(nx.all_pairs_dijkstra_path_length(G))
print("All-pairs shortest paths:", all_shortest)

# Minimum Spanning Tree
mst = nx.minimum_spanning_tree(G, weight='weight')
print(f"Minimum Spanning Tree edges: {list(mst.edges(data=True))}")

# Clustering Coefficient
clustering = nx.clustering(G, weight='weight')
print("Clustering Coefficient:", clustering)

# Transitivity (global clustering)
transitivity = nx.transitivity(G)
print(f"Graph Transitivity: {transitivity:.4f}")

# Triangles
triangles = nx.triangles(G)
print("Triangles per node:", triangles)

Graph Neural Networks (GNNs)

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data

class GCN(torch.nn.Module):
    """Graph Convolutional Network"""
    def __init__(self, num_features, num_classes, hidden_channels=16):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

class GraphSAGE(torch.nn.Module):
    """GraphSAGE Network"""
    def __init__(self, num_features, num_classes, hidden_channels=16):
        super().__init__()
        self.conv1 = SAGEConv(num_features, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, num_classes)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

class GAT(torch.nn.Module):
    """Graph Attention Network"""
    def __init__(self, num_features, num_classes, hidden_channels=16, heads=8):
        super().__init__()
        self.conv1 = GATConv(num_features, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, num_classes)
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Prepare data
def prepare_graph_data(G, labels):
    """Convert NetworkX graph to PyTorch Geometric format"""
    node_list = list(G.nodes())
    node_to_idx = {node: idx for idx, node in enumerate(node_list)}
    
    # Node features (degree, clustering coefficient, etc.)
    features = []
    for node in node_list:
        feat = [
            G.degree(node),
            nx.clustering(G, node),
            nx.pagerank(G)[node]
        ]
        features.append(feat)
    
    x = torch.tensor(features, dtype=torch.float)
    
    # Edge index
    edge_index = []
    for u, v in G.edges():
        edge_index.append([node_to_idx[u], node_to_idx[v]])
        edge_index.append([node_to_idx[v], node_to_idx[u]])  # Undirected
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    
    # Labels
    y = torch.tensor([labels[node] for node in node_list], dtype=torch.long)
    
    return Data(x=x, edge_index=edge_index, y=y)

Knowledge Graphs

import rdflib
from rdflib import URIRef, Literal, Namespace

# Create knowledge graph
g = rdflib.Graph()

# Define namespaces
EX = Namespace("http://example.org/")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

# Add triples (subject, predicate, object)
g.add((EX.Alice, RDF.type, EX.Person))
g.add((EX.Alice, EX.age, Literal(25)))
g.add((EX.Alice, EX.worksAt, EX.CompanyA))
g.add((EX.Bob, RDF.type, EX.Person))
g.add((EX.Bob, EX.age, Literal(30)))
g.add((EX.Bob, EX.manages, EX.Alice))

# Query with SPARQL
query = """
SELECT ?person ?age ?company
WHERE {
    ?person rdf:type ex:Person .
    ?person ex:age ?age .
    ?person ex:worksAt ?company .
    FILTER (?age > 25)
}
"""

# Execute query
results = g.query(query, initNs={'ex': EX, 'rdf': RDF})
for row in results:
    print(f"Person: {row.person}, Age: {row.age}, Company: {row.company}")

Graph Features for ML

def extract_graph_features(G):
    """Extract graph-level features for ML"""
    features = {}
    
    # Basic features
    features['num_nodes'] = G.number_of_nodes()
    features['num_edges'] = G.number_of_edges()
    features['density'] = nx.density(G)
    features['is_connected'] = nx.is_connected(G)
    
    # Degree statistics
    degrees = [d for n, d in G.degree()]
    features['avg_degree'] = np.mean(degrees)
    features['max_degree'] = max(degrees)
    features['degree_std'] = np.std(degrees)
    
    # Centrality statistics
    betweenness = list(nx.betweenness_centrality(G).values())
    features['avg_betweenness'] = np.mean(betweenness)
    
    # Clustering
    features['avg_clustering'] = nx.average_clustering(G)
    features['transitivity'] = nx.transitivity(G)
    
    # Connectivity
    if nx.is_connected(G):
        features['diameter'] = nx.diameter(G)
        features['avg_shortest_path'] = nx.average_shortest_path_length(G)
    else:
        largest_cc = max(nx.connected_components(G), key=len)
        subgraph = G.subgraph(largest_cc)
        features['diameter'] = nx.diameter(subgraph)
        features['avg_shortest_path'] = nx.average_shortest_path_length(subgraph)
    
    return features

# Extract features for multiple graphs
graph_features = []
for G in graph_list:
    features = extract_graph_features(G)
    graph_features.append(features)

features_df = pd.DataFrame(graph_features)

Key Takeaways

  1. Graphs model relationships between entities
  2. Centrality measures identify important nodes
  3. Community detection finds groups in networks
  4. GNNs learn node/graph representations
  5. Graph features enable ML on network data

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement