Graph Data Science: Networks and Graph Analytics
Graph data science analyzes relationships between entities. This lesson covers network analysis, centrality, community detection, and graph neural networks.
Graph Fundamentals
<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
<rect width="600" height="400" fill="#f8f9fa" rx="10"/>
<text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">Graph Data Structures</text>
<!-- Directed Graph -->
<rect x="50" y="60" width="250" height="150" fill="white" stroke="#3498db" stroke-width="2" rx="5"/>
<text x="175" y="80" text-anchor="middle" font-size="12" font-weight="bold" fill="#3498db">Directed Graph</text>
<!-- Nodes -->
<circle cx="100" cy="120" r="20" fill="#3498db"/>
<text x="100" y="125" text-anchor="middle" font-size="10" fill="white">A</text>
<circle cx="180" cy="120" r="20" fill="#2ecc71"/>
<text x="180" y="125" text-anchor="middle" font-size="10" fill="white">B</text>
<circle cx="260" cy="120" r="20" fill="#e74c3c"/>
<text x="260" y="125" text-anchor="middle" font-size="10" fill="white">C</text>
<!-- Edges with arrows -->
<line x1="120" y1="120" x2="158" y2="120" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="200" y1="120" x2="238" y2="120" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="100" y1="140" x2="250" y2="140" stroke="#f39c12" stroke-width="1" marker-end="url(#arrow)"/>
<text x="175" y="165" text-anchor="middle" font-size="10" fill="#7f8c8d">E = {(A,B), (B,C), (A,C)}</text>
<!-- Undirected Graph -->
<rect x="320" y="60" width="250" height="150" fill="white" stroke="#2ecc71" stroke-width="2" rx="5"/>
<text x="445" y="80" text-anchor="middle" font-size="12" font-weight="bold" fill="#2ecc71">Undirected Graph</text>
<circle cx="370" cy="120" r="20" fill="#3498db"/>
<text x="370" y="125" text-anchor="middle" font-size="10" fill="white">1</text>
<circle cx="450" cy="120" r="20" fill="#2ecc71"/>
<text x="450" y="125" text-anchor="middle" font-size="10" fill="white">2</text>
<circle cx="530" cy="120" r="20" fill="#e74c3c"/>
<text x="530" y="125" text-anchor="middle" font-size="10" fill="white">3</text>
<line x1="390" y1="120" x2="430" y2="120" stroke="#7f8c8d" stroke-width="2"/>
<line x1="470" y1="120" x2="510" y2="120" stroke="#7f8c8d" stroke-width="2"/>
<line x1="370" y1="140" x2="520" y2="140" stroke="#f39c12" stroke-width="1"/>
<text x="445" y="165" text-anchor="middle" font-size="10" fill="#7f8c8d">E = {(1,2), (2,3), (1,3)}</text>
<!-- Properties -->
<text x="300" y="240" text-anchor="middle" font-size="14" font-weight="bold" fill="#2c3e50">Graph Properties:</text>
<text x="100" y="270" font-size="11" fill="#2c3e50">• Nodes (V): Entities/vertices</text>
<text x="100" y="290" font-size="11" fill="#2c3e50">• Edges (E): Relationships</text>
<text x="100" y="310" font-size="11" fill="#2c3e50">• Weight: Edge importance</text>
<text x="400" y="270" font-size="11" fill="#2c3e50">• Degree: # of connections</text>
<text x="400" y="290" font-size="11" fill="#2c3e50">• Path: Sequence of edges</text>
<text x="400" y="310" font-size="11" fill="#2c3e50">• Cycle: Path to self</text>
<!-- Adjacency Matrix -->
<text x="300" y="350" text-anchor="middle" font-size="12" font-weight="bold" fill="#2c3e50">Adjacency Matrix Representation:</text>
<rect x="200" y="360" width="200" height="35" fill="white" stroke="#7f8c8d" stroke-width="1" rx="3"/>
<text x="300" y="382" text-anchor="middle" font-size="11" fill="#2c3e50">A = [[0,1,1], [1,0,1], [1,1,0]]</text>
</svg>
NetworkX Basics
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Create graph
G = nx.Graph()
# Add nodes with attributes
G.add_node(1, name="Alice", age=25, role="Engineer")
G.add_node(2, name="Bob", age=30, role="Manager")
G.add_node(3, name="Charlie", age=35, role="Director")
# Add edges with weights
G.add_edge(1, 2, weight=0.8, relationship="colleague")
G.add_edge(2, 3, weight=0.6, relationship="reports_to")
G.add_edge(1, 3, weight=0.3, relationship="indirect")
# Basic properties
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")
print(f"Is directed: {G.is_directed()}")
print(f"Is connected: {nx.is_connected(G)}")
# Visualize
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue',
node_size=500, font_size=10, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels={(u,v): d['weight']
for u,v,d in G.edges(data=True)})
plt.title("Social Network Graph")
plt.show()
Centrality Measures
# Degree Centrality
degree_centrality = nx.degree_centrality(G)
print("Degree Centrality:", degree_centrality)
# Betweenness Centrality
betweenness = nx.betweenness_centrality(G, weight='weight')
print("Betweenness Centrality:", betweenness)
# Closeness Centrality
closeness = nx.closeness_centrality(G)
print("Closeness Centrality:", closeness)
# Eigenvector Centrality
eigenvector = nx.eigenvector_centrality(G, max_iter=1000, weight='weight')
print("Eigenvector Centrality:", eigenvector)
# PageRank
pagerank = nx.pagerank(G, alpha=0.85, weight='weight')
print("PageRank:", pagerank)
# Visualize centralities
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
centrality_measures = {
'Degree': degree_centrality,
'Betweenness': betweenness,
'Closeness': closeness,
'Eigenvector': eigenvector
}
for ax, (name, centrality) in zip(axes, centrality_measures.items()):
node_colors = [centrality[node] for node in G.nodes()]
nx.draw(G, pos, ax=ax, node_color=node_colors,
node_size=500, cmap=plt.cm.viridis, with_labels=True)
ax.set_title(f'{name} Centrality')
plt.tight_layout()
plt.show()
Community Detection
from networkx.algorithms import community
# Louvain Community Detection
communities_louvain = community.louvain_communities(G, weight='weight', resolution=1)
print("Louvain Communities:", communities_louvain)
# Girvan-Newman (hierarchical)
communities_gn = community.girvan_newman(G)
top_level_communities = next(communities_gn)
print("Girvan-Newman Communities:", top_level_communities)
# Label Propagation
communities_lp = list(community.label_propagation_communities(G))
print("Label Propagation Communities:", communities_lp)
# Modularity score
modularity = community.modularity(G, communities_louvain)
print(f"Modularity: {modularity:.4f}")
# Visualize communities
def draw_communities(G, communities, title):
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(10, 8))
# Color nodes by community
colors = plt.cm.Set3(np.linspace(0, 1, len(communities)))
node_colors = {}
for i, comm in enumerate(communities):
for node in comm:
node_colors[node] = colors[i]
nx.draw(G, pos, node_color=[node_colors[node] for node in G.nodes()],
node_size=500, with_labels=True, font_size=10)
plt.title(title)
plt.show()
draw_communities(G, communities_louvain, "Louvain Communities")
Graph Algorithms
# Shortest Path
shortest_path = nx.shortest_path(G, source=1, target=3, weight='weight')
shortest_path_length = nx.shortest_path_length(G, source=1, target=3, weight='weight')
print(f"Shortest path from 1 to 3: {shortest_path}")
print(f"Path length: {shortest_path_length:.4f}")
# All-pairs shortest paths
all_shortest = dict(nx.all_pairs_dijkstra_path_length(G))
print("All-pairs shortest paths:", all_shortest)
# Minimum Spanning Tree
mst = nx.minimum_spanning_tree(G, weight='weight')
print(f"Minimum Spanning Tree edges: {list(mst.edges(data=True))}")
# Clustering Coefficient
clustering = nx.clustering(G, weight='weight')
print("Clustering Coefficient:", clustering)
# Transitivity (global clustering)
transitivity = nx.transitivity(G)
print(f"Graph Transitivity: {transitivity:.4f}")
# Triangles
triangles = nx.triangles(G)
print("Triangles per node:", triangles)
Graph Neural Networks (GNNs)
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from torch_geometric.data import Data
class GCN(torch.nn.Module):
"""Graph Convolutional Network"""
def __init__(self, num_features, num_classes, hidden_channels=16):
super().__init__()
self.conv1 = GCNConv(num_features, hidden_channels)
self.conv2 = GCNConv(hidden_channels, num_classes)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, p=0.5, training=self.training)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
class GraphSAGE(torch.nn.Module):
"""GraphSAGE Network"""
def __init__(self, num_features, num_classes, hidden_channels=16):
super().__init__()
self.conv1 = SAGEConv(num_features, hidden_channels)
self.conv2 = SAGEConv(hidden_channels, num_classes)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
class GAT(torch.nn.Module):
"""Graph Attention Network"""
def __init__(self, num_features, num_classes, hidden_channels=16, heads=8):
super().__init__()
self.conv1 = GATConv(num_features, hidden_channels, heads=heads)
self.conv2 = GATConv(hidden_channels * heads, num_classes)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = self.conv2(x, edge_index)
return F.log_softmax(x, dim=1)
# Prepare data
def prepare_graph_data(G, labels):
"""Convert NetworkX graph to PyTorch Geometric format"""
node_list = list(G.nodes())
node_to_idx = {node: idx for idx, node in enumerate(node_list)}
# Node features (degree, clustering coefficient, etc.)
features = []
for node in node_list:
feat = [
G.degree(node),
nx.clustering(G, node),
nx.pagerank(G)[node]
]
features.append(feat)
x = torch.tensor(features, dtype=torch.float)
# Edge index
edge_index = []
for u, v in G.edges():
edge_index.append([node_to_idx[u], node_to_idx[v]])
edge_index.append([node_to_idx[v], node_to_idx[u]]) # Undirected
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
# Labels
y = torch.tensor([labels[node] for node in node_list], dtype=torch.long)
return Data(x=x, edge_index=edge_index, y=y)
Knowledge Graphs
import rdflib
from rdflib import URIRef, Literal, Namespace
# Create knowledge graph
g = rdflib.Graph()
# Define namespaces
EX = Namespace("http://example.org/")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
# Add triples (subject, predicate, object)
g.add((EX.Alice, RDF.type, EX.Person))
g.add((EX.Alice, EX.age, Literal(25)))
g.add((EX.Alice, EX.worksAt, EX.CompanyA))
g.add((EX.Bob, RDF.type, EX.Person))
g.add((EX.Bob, EX.age, Literal(30)))
g.add((EX.Bob, EX.manages, EX.Alice))
# Query with SPARQL
query = """
SELECT ?person ?age ?company
WHERE {
?person rdf:type ex:Person .
?person ex:age ?age .
?person ex:worksAt ?company .
FILTER (?age > 25)
}
"""
# Execute query
results = g.query(query, initNs={'ex': EX, 'rdf': RDF})
for row in results:
print(f"Person: {row.person}, Age: {row.age}, Company: {row.company}")
Graph Features for ML
def extract_graph_features(G):
"""Extract graph-level features for ML"""
features = {}
# Basic features
features['num_nodes'] = G.number_of_nodes()
features['num_edges'] = G.number_of_edges()
features['density'] = nx.density(G)
features['is_connected'] = nx.is_connected(G)
# Degree statistics
degrees = [d for n, d in G.degree()]
features['avg_degree'] = np.mean(degrees)
features['max_degree'] = max(degrees)
features['degree_std'] = np.std(degrees)
# Centrality statistics
betweenness = list(nx.betweenness_centrality(G).values())
features['avg_betweenness'] = np.mean(betweenness)
# Clustering
features['avg_clustering'] = nx.average_clustering(G)
features['transitivity'] = nx.transitivity(G)
# Connectivity
if nx.is_connected(G):
features['diameter'] = nx.diameter(G)
features['avg_shortest_path'] = nx.average_shortest_path_length(G)
else:
largest_cc = max(nx.connected_components(G), key=len)
subgraph = G.subgraph(largest_cc)
features['diameter'] = nx.diameter(subgraph)
features['avg_shortest_path'] = nx.average_shortest_path_length(subgraph)
return features
# Extract features for multiple graphs
graph_features = []
for G in graph_list:
features = extract_graph_features(G)
graph_features.append(features)
features_df = pd.DataFrame(graph_features)
Key Takeaways
- Graphs model relationships between entities
- Centrality measures identify important nodes
- Community detection finds groups in networks
- GNNs learn node/graph representations
- Graph features enable ML on network data