Spatial Data Analysis
Spatial data is everywhere β from GPS coordinates to satellite imagery to census tracts. Analyzing spatial data requires specialized tools that understand geometry, projections, and spatial relationships.
Spatial Data Processing Pipeline
Why Spatial Analysis Matters
Location context transforms data. A crime report becomes a hotspot map. A sale becomes a demand surface. Population counts become a choropleth revealing inequality. Spatial analysis reveals patterns invisible in tabular data.
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon, LineString
import geopandas as gpd
from scipy.spatial import cKDTree
from scipy.interpolate import griddata
import warnings
warnings.filterwarnings('ignore')
Creating GeoDataFrames
The fundamental spatial data structure is the GeoDataFrame.
# Create points from coordinates
np.random.seed(42)
n = 500
lons = np.random.uniform(-74.05, -73.75, n) # NYC area
lats = np.random.uniform(40.6, 40.9, n)
gdf = gpd.GeoDataFrame({
'id': range(n),
'value': np.random.lognormal(5, 1, n),
'category': np.random.choice(['A', 'B', 'C'], n),
'geometry': [Point(lon, lat) for lon, lat in zip(lons, lats)]
}, crs='EPSG:4326')
print(f"GeoDataFrame shape: {gdf.shape}")
print(f"CRS: {gdf.crs}")
print(gdf.head())
Coordinate Reference Systems
# WGS84 (lat/lon) to UTM Zone 18N (meters)
gdf_utm = gdf.to_crs(epsg=32618)
print(f"Original CRS: {gdf.crs}")
print(f"Projected CRS: {gdf_utm.crs}")
# Check coordinates are in meters
print(f"X range: {gdf_utm.geometry.x.min():.0f} to {gdf_utm.geometry.x.max():.0f} meters")
# Common CRS transformations
# Web Mercator for web mapping
gdf_web = gdf.to_crs(epsg=3857)
# State Plane for local areas
gdf_stateplane = gdf.to_crs(epsg=2263) # NY State Plane
Spatial Operations
# Buffer analysis β create areas around points
gdf_utm['buffer_100m'] = gdf_utm.geometry.buffer(100)
gdf_utm['buffer_500m'] = gdf_utm.geometry.buffer(500)
# Area of buffers
gdf_utm['buffer_area'] = gdf_utm['buffer_100m'].area
print(f"Buffer area (mΒ²): {gdf_utm['buffer_area'].iloc[0]:.0f}")
# Distance between points
point1 = gdf_utm.geometry.iloc[0]
point2 = gdf_utm.geometry.iloc[1]
distance = point1.distance(point2)
print(f"Distance between first two points: {distance:.0f} meters")
# Nearest neighbor
from scipy.spatial import cKDTree
coords = np.array(list(zip(gdf_utm.geometry.x, gdf_utm.geometry.y)))
tree = cKDTree(coords)
distances, indices = tree.query(coords, k=5) # 5 nearest neighbors
gdf['nn_distance'] = distances[:, 1] # distance to nearest neighbor
print(f"Average nearest neighbor distance: {gdf['nn_distance'].mean():.0f} meters")
Spatial Joins
# Create polygon regions
regions = gpd.GeoDataFrame({
'region_id': [1, 2, 3],
'region_name': ['Downtown', 'Midtown', 'Uptown'],
'geometry': [
Polygon([(-73.98, 40.75), (-73.96, 40.75), (-73.96, 40.77), (-73.98, 40.77)]),
Polygon([(-73.98, 40.73), (-73.96, 40.73), (-73.96, 40.75), (-73.98, 40.75)]),
Polygon([(-73.98, 40.71), (-73.96, 40.71), (-73.96, 40.73), (-73.98, 40.73)])
]
}, crs='EPSG:4326')
# Spatial join β which region contains each point?
joined = gpd.sjoin(gdf, regions, how='left', predicate='within')
print(f"Points per region:")
print(joined.groupby('region_name').size())
# Points in polygons
points_in_polygons = gpd.sjoin(regions, gdf, how='left', predicate='contains')
print(f"\nPoints per region (from polygon side):")
print(points_in_polygons.groupby('region_name')['value'].agg(['count', 'mean']))
Heatmap / Density Estimation
from scipy.stats import gaussian_kde
# Kernel density estimation for heatmap
x = gdf.geometry.x.values
y = gdf.geometry.y.values
# Create grid
xi = np.linspace(x.min(), x.max(), 100)
yi = np.linspace(y.min(), y.max(), 100)
Xi, Yi = np.meshgrid(xi, yi)
positions = np.vstack([Xi.ravel(), Yi.ravel()])
# KDE
kernel = gaussian_kde(np.vstack([x, y]))
Z = kernel(positions).reshape(Xi.shape)
# Create density GeoDataFrame
density_gdf = gpd.GeoDataFrame({
'density': Z.ravel()
}, geometry=[Point(px, py) for px, py in zip(Xi.ravel(), Yi.ravel())],
crs='EPSG:4326'
)
print(f"Density range: {Z.min():.6f} to {Z.max():.6f}")
print(f"Peak location: ({xi[np.unravel_index(Z.argmax(), Z.shape)[1]]:.4f}, "
f"{yi[np.unravel_index(Z.argmax(), Z.shape)[0]]:.4f})")
Kriging Interpolation
# Ordinary Kriging for spatial interpolation
from scipy.spatial.distance import cdist
def ordinary_kriging(coords, values, target_coords, variogram_model='spherical'):
"""Simple ordinary Kriging implementation."""
n = len(coords)
# Compute pairwise distances
D = cdist(coords, coords)
# Fit variogram (simplified)
max_dist = D.max() / 2
lags = np.linspace(0, max_dist, 20)
semivariances = []
for lag in lags[:-1]:
mask = (D >= lag) & (D < lag + max_dist / 20)
if mask.sum() > 0:
pairs = np.where(mask)
gamma = 0.5 * np.mean((values[pairs[0]] - values[pairs[1]]) ** 2)
semivariances.append(gamma)
else:
semivariances.append(np.nan)
semivariances = np.array(semivariances)
# Simple exponential variogram fit
sill = np.nanmax(semivariances)
nugget = semivariances[0] if not np.isnan(semivariances[0]) else 0
range_param = lags[np.nanargmin(np.abs(semivariances - sill * 0.95))]
def variogram(h):
if h == 0:
return 0
return nugget + (sill - nugget) * (1 - np.exp(-3 * h / range_param))
# Kriging weights
n_target = len(target_coords)
predictions = np.zeros(n_target)
for i, target in enumerate(target_coords):
distances = cdist(target.reshape(1, -1), coords).flatten()
gamma_values = np.array([variogram(d) for d in distances])
# Simple inverse distance weighting as approximation
weights = 1 / (gamma_values + 1e-10)
weights /= weights.sum()
predictions[i] = np.dot(weights, values)
return predictions
# Generate sample data
np.random.seed(42)
sample_coords = np.column_stack([
np.random.uniform(-74.05, -73.75, 30),
np.random.uniform(40.6, 40.9, 30)
])
sample_values = np.random.normal(100, 20, 30)
# Interpolate to grid
target_x = np.linspace(-74.05, -73.75, 50)
target_y = np.linspace(40.6, 40.9, 50)
target_grid = np.array([(x, y) for x in target_x for y in target_y])
kriged_values = ordinary_kriging(sample_coords, sample_values, target_grid)
kriged_grid = kriged_values.reshape(50, 50)
print(f"Kriged grid shape: {kriged_grid.shape}")
print(f"Interpolated range: {kriged_values.min():.1f} to {kriged_values.max():.1f}")
Spatial Autocorrelation
from pysal.lib import weights
from esda.moran import Moran
# Create spatial weights matrix
w = weights.KNN.from_dataframe(gdf, k=8)
w.transform = 'r'
# Moran's I test for spatial autocorrelation
moran = Moran(gdf['value'], w)
print(f"Moran's I: {moran.I:.4f}")
print(f"Expected I: {moran.EI:.4f}")
print(f"P-value: {moran.p_sim:.4f}")
print(f"Interpretation: {'Clustered' if moran.p_sim < 0.05 else 'Random'}")
Choropleth Mapping
# Create choropleth data
region_stats = joined.groupby('region_name').agg(
count=('id', 'count'),
mean_value=('value', 'mean'),
total_value=('value', 'sum')
).reset_index()
regions_with_stats = regions.merge(region_stats, on='region_name')
print("Region statistics:")
print(regions_with_stats[['region_name', 'count', 'mean_value', 'total_value']])
Spatial Statistics
# Getis-Ord Gi* hotspot analysis
def getis_ord_star(gdf, column, k=8):
"""Simplified Getis-Ord Gi* statistic."""
w = weights.KNN.from_dataframe(gdf, k=k)
w.transform = 'r'
values = gdf[column].values
n = len(values)
mean = values.mean()
std = values.std()
z_scores = np.zeros(n)
for i in range(n):
neighbors = list(w.neighbors[i])
if len(neighbors) > 0:
neighbor_values = values[neighbors]
local_sum = neighbor_values.sum()
expected = len(neighbors) * mean
variance = len(neighbors) * (std ** 2) * (n - len(neighbors)) / (n - 1)
z_scores[i] = (local_sum - expected) / (np.sqrt(variance) + 1e-10)
return z_scores
gdf['hotspot_z'] = getis_ord_star(gdf, 'value')
gdf['hotspot'] = pd.cut(
gdf['hotspot_z'],
bins=[-np.inf, -1.96, -1, 1, 1.96, np.inf],
labels=['cold spot', 'cool', 'neutral', 'warm', 'hot spot']
)
print("Hotspot distribution:")
print(gdf['hotspot'].value_counts())
Best Practices
- Always check CRS β mismatched projections cause wrong distances
- Use appropriate projections β local vs global analysis
- Account for spatial autocorrelation β violates independence assumptions
- Validate kriging β use cross-validation with RMSE
- Handle edge effects β boundary points have fewer neighbors
- Scale matters β results change with aggregation level (MAUP)
Summary
Spatial data analysis unlocks location-based insights through GeoDataFrames, spatial joins, interpolation, and autocorrelation statistics. Master these tools to analyze geographic patterns, interpolate surfaces, and build location-aware models.