Spatial Data Analysis

Spatial data is everywhere – from GPS coordinates to satellite imagery to census tracts. Analyzing spatial data requires specialized tools that understand geometry, projections, and spatial relationships.

Spatial Data Processing Pipeline

Why Spatial Analysis Matters

Location context transforms data. A crime report becomes a hotspot map. A sale becomes a demand surface. Population counts become a choropleth revealing inequality. Spatial analysis reveals patterns invisible in tabular data.

import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon, LineString
import geopandas as gpd
from scipy.spatial import cKDTree
from scipy.interpolate import griddata
import warnings
warnings.filterwarnings('ignore')

Creating GeoDataFrames

The fundamental spatial data structure is the GeoDataFrame.

# Create points from coordinates
np.random.seed(42)
n = 500

lons = np.random.uniform(-74.05, -73.75, n)  # NYC area
lats = np.random.uniform(40.6, 40.9, n)

gdf = gpd.GeoDataFrame({
    'id': range(n),
    'value': np.random.lognormal(5, 1, n),
    'category': np.random.choice(['A', 'B', 'C'], n),
    'geometry': [Point(lon, lat) for lon, lat in zip(lons, lats)]
}, crs='EPSG:4326')

print(f"GeoDataFrame shape: {gdf.shape}")
print(f"CRS: {gdf.crs}")
print(gdf.head())

Coordinate Reference Systems

# WGS84 (lat/lon) to UTM Zone 18N (meters)
gdf_utm = gdf.to_crs(epsg=32618)
print(f"Original CRS: {gdf.crs}")
print(f"Projected CRS: {gdf_utm.crs}")

# Check coordinates are in meters
print(f"X range: {gdf_utm.geometry.x.min():.0f} to {gdf_utm.geometry.x.max():.0f} meters")

# Common CRS transformations
# Web Mercator for web mapping
gdf_web = gdf.to_crs(epsg=3857)
# State Plane for local areas
gdf_stateplane = gdf.to_crs(epsg=2263)  # NY State Plane

Spatial Operations

# Buffer analysis – create areas around points
gdf_utm['buffer_100m'] = gdf_utm.geometry.buffer(100)
gdf_utm['buffer_500m'] = gdf_utm.geometry.buffer(500)

# Area of buffers
gdf_utm['buffer_area'] = gdf_utm['buffer_100m'].area
print(f"Buffer area (m²): {gdf_utm['buffer_area'].iloc[0]:.0f}")

# Distance between points
point1 = gdf_utm.geometry.iloc[0]
point2 = gdf_utm.geometry.iloc[1]
distance = point1.distance(point2)
print(f"Distance between first two points: {distance:.0f} meters")

# Nearest neighbor
from scipy.spatial import cKDTree

coords = np.array(list(zip(gdf_utm.geometry.x, gdf_utm.geometry.y)))
tree = cKDTree(coords)
distances, indices = tree.query(coords, k=5)  # 5 nearest neighbors
gdf['nn_distance'] = distances[:, 1]  # distance to nearest neighbor
print(f"Average nearest neighbor distance: {gdf['nn_distance'].mean():.0f} meters")

Spatial Joins

# Create polygon regions
regions = gpd.GeoDataFrame({
    'region_id': [1, 2, 3],
    'region_name': ['Downtown', 'Midtown', 'Uptown'],
    'geometry': [
        Polygon([(-73.98, 40.75), (-73.96, 40.75), (-73.96, 40.77), (-73.98, 40.77)]),
        Polygon([(-73.98, 40.73), (-73.96, 40.73), (-73.96, 40.75), (-73.98, 40.75)]),
        Polygon([(-73.98, 40.71), (-73.96, 40.71), (-73.96, 40.73), (-73.98, 40.73)])
    ]
}, crs='EPSG:4326')

# Spatial join – which region contains each point?
joined = gpd.sjoin(gdf, regions, how='left', predicate='within')
print(f"Points per region:")
print(joined.groupby('region_name').size())

# Points in polygons
points_in_polygons = gpd.sjoin(regions, gdf, how='left', predicate='contains')
print(f"\nPoints per region (from polygon side):")
print(points_in_polygons.groupby('region_name')['value'].agg(['count', 'mean']))

Heatmap / Density Estimation

from scipy.stats import gaussian_kde

# Kernel density estimation for heatmap
x = gdf.geometry.x.values
y = gdf.geometry.y.values

# Create grid
xi = np.linspace(x.min(), x.max(), 100)
yi = np.linspace(y.min(), y.max(), 100)
Xi, Yi = np.meshgrid(xi, yi)
positions = np.vstack([Xi.ravel(), Yi.ravel()])

# KDE
kernel = gaussian_kde(np.vstack([x, y]))
Z = kernel(positions).reshape(Xi.shape)

# Create density GeoDataFrame
density_gdf = gpd.GeoDataFrame({
    'density': Z.ravel()
}, geometry=[Point(px, py) for px, py in zip(Xi.ravel(), Yi.ravel())],
    crs='EPSG:4326'
)

print(f"Density range: {Z.min():.6f} to {Z.max():.6f}")
print(f"Peak location: ({xi[np.unravel_index(Z.argmax(), Z.shape)[1]]:.4f}, "
      f"{yi[np.unravel_index(Z.argmax(), Z.shape)[0]]:.4f})")

Kriging Interpolation

# Ordinary Kriging for spatial interpolation
from scipy.spatial.distance import cdist

def ordinary_kriging(coords, values, target_coords, variogram_model='spherical'):
    """Simple ordinary Kriging implementation."""
    n = len(coords)
    
    # Compute pairwise distances
    D = cdist(coords, coords)
    
    # Fit variogram (simplified)
    max_dist = D.max() / 2
    lags = np.linspace(0, max_dist, 20)
    semivariances = []
    
    for lag in lags[:-1]:
        mask = (D >= lag) & (D < lag + max_dist / 20)
        if mask.sum() > 0:
            pairs = np.where(mask)
            gamma = 0.5 * np.mean((values[pairs[0]] - values[pairs[1]]) ** 2)
            semivariances.append(gamma)
        else:
            semivariances.append(np.nan)
    
    semivariances = np.array(semivariances)
    
    # Simple exponential variogram fit
    sill = np.nanmax(semivariances)
    nugget = semivariances[0] if not np.isnan(semivariances[0]) else 0
    range_param = lags[np.nanargmin(np.abs(semivariances - sill * 0.95))]
    
    def variogram(h):
        if h == 0:
            return 0
        return nugget + (sill - nugget) * (1 - np.exp(-3 * h / range_param))
    
    # Kriging weights
    n_target = len(target_coords)
    predictions = np.zeros(n_target)
    
    for i, target in enumerate(target_coords):
        distances = cdist(target.reshape(1, -1), coords).flatten()
        gamma_values = np.array([variogram(d) for d in distances])
        
        # Simple inverse distance weighting as approximation
        weights = 1 / (gamma_values + 1e-10)
        weights /= weights.sum()
        predictions[i] = np.dot(weights, values)
    
    return predictions

# Generate sample data
np.random.seed(42)
sample_coords = np.column_stack([
    np.random.uniform(-74.05, -73.75, 30),
    np.random.uniform(40.6, 40.9, 30)
])
sample_values = np.random.normal(100, 20, 30)

# Interpolate to grid
target_x = np.linspace(-74.05, -73.75, 50)
target_y = np.linspace(40.6, 40.9, 50)
target_grid = np.array([(x, y) for x in target_x for y in target_y])

kriged_values = ordinary_kriging(sample_coords, sample_values, target_grid)
kriged_grid = kriged_values.reshape(50, 50)

print(f"Kriged grid shape: {kriged_grid.shape}")
print(f"Interpolated range: {kriged_values.min():.1f} to {kriged_values.max():.1f}")

Spatial Autocorrelation

from pysal.lib import weights
from esda.moran import Moran

# Create spatial weights matrix
w = weights.KNN.from_dataframe(gdf, k=8)
w.transform = 'r'

# Moran's I test for spatial autocorrelation
moran = Moran(gdf['value'], w)
print(f"Moran's I: {moran.I:.4f}")
print(f"Expected I: {moran.EI:.4f}")
print(f"P-value: {moran.p_sim:.4f}")
print(f"Interpretation: {'Clustered' if moran.p_sim < 0.05 else 'Random'}")

Choropleth Mapping

# Create choropleth data
region_stats = joined.groupby('region_name').agg(
    count=('id', 'count'),
    mean_value=('value', 'mean'),
    total_value=('value', 'sum')
).reset_index()

regions_with_stats = regions.merge(region_stats, on='region_name')
print("Region statistics:")
print(regions_with_stats[['region_name', 'count', 'mean_value', 'total_value']])

Spatial Statistics

# Getis-Ord Gi* hotspot analysis
def getis_ord_star(gdf, column, k=8):
    """Simplified Getis-Ord Gi* statistic."""
    w = weights.KNN.from_dataframe(gdf, k=k)
    w.transform = 'r'
    
    values = gdf[column].values
    n = len(values)
    mean = values.mean()
    std = values.std()
    
    z_scores = np.zeros(n)
    for i in range(n):
        neighbors = list(w.neighbors[i])
        if len(neighbors) > 0:
            neighbor_values = values[neighbors]
            local_sum = neighbor_values.sum()
            expected = len(neighbors) * mean
            variance = len(neighbors) * (std ** 2) * (n - len(neighbors)) / (n - 1)
            z_scores[i] = (local_sum - expected) / (np.sqrt(variance) + 1e-10)
    
    return z_scores

gdf['hotspot_z'] = getis_ord_star(gdf, 'value')
gdf['hotspot'] = pd.cut(
    gdf['hotspot_z'],
    bins=[-np.inf, -1.96, -1, 1, 1.96, np.inf],
    labels=['cold spot', 'cool', 'neutral', 'warm', 'hot spot']
)

print("Hotspot distribution:")
print(gdf['hotspot'].value_counts())

Best Practices

Always check CRS – mismatched projections cause wrong distances
Use appropriate projections – local vs global analysis
Account for spatial autocorrelation – violates independence assumptions
Validate kriging – use cross-validation with RMSE
Handle edge effects – boundary points have fewer neighbors
Scale matters – results change with aggregation level (MAUP)

Summary

Spatial data analysis unlocks location-based insights through GeoDataFrames, spatial joins, interpolation, and autocorrelation statistics. Master these tools to analyze geographic patterns, interpolate surfaces, and build location-aware models.