Matplotlib and Seaborn: Data Visualization

The Grammar of Graphics

Data visualization is the graphical representation of information and data. A good visualization tells a story and reveals patterns that numbers alone cannot.

The Grammar of Graphics Pipeline
Data: Raw dataset
Aesthetics: x, y, color, size
Geometries: Points, lines, bars
Statistics: Smooth, bin, model
Theme: Fonts, colors, grid
Library Mapping:
Matplotlib: Low-level
Seaborn: Statistical
Plotly: Interactive
Key Principle:
Data → Aesthetics → Geometries → Statistics → Coordinates → Facets → Theme

Matplotlib: The Foundation

Basic Plot Structure

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Anatomy of a figure with the object-oriented API: one Figure holding one Axes.
x = np.linspace(start=0, stop=10, num=100)
y = np.sin(x)

fig, ax = plt.subplots(figsize=(10, 6))

# Draw the curve; its label is the text ax.legend() picks up below.
line_style = {'label': 'sin(x)', 'color': 'blue', 'linewidth': 2}
ax.plot(x, y, **line_style)

# Title and axis labels.
ax.set_title('Basic Line Plot', fontsize=14)
ax.set_xlabel('X Axis', fontsize=12)
ax.set_ylabel('Y Axis', fontsize=12)

# Faint grid plus the legend for the labelled curve.
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

Essential Plot Types

# Line plot: how a value evolves over time.
# Twelve timestamps starting 2024-01-01 (pandas' default frequency is daily)
# and a random walk centred around 100.
dates = pd.date_range('2024-01-01', periods=12)
values = np.cumsum(np.random.randn(12)) + 100

fig, ax = plt.subplots(figsize=(12, 6))

# The series itself, with a marker on every observation.
ax.plot(dates, values, marker='o', linestyle='-', color='#2196F3')

# Shaded band of +/- 10 around the series.
lower_band = values - 10
upper_band = values + 10
ax.fill_between(dates, lower_band, upper_band, alpha=0.2)

ax.set_title('Stock Price Trend')
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
plt.tight_layout()
plt.show()

# Bar plot: compare one value across categories, drawn vertically and
# horizontally side by side.
categories = ['A', 'B', 'C', 'D', 'E']
values = [23, 45, 56, 78, 32]

# One colour per bar, shared by both panels.
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
vertical_ax, horizontal_ax = axes

vertical_ax.bar(categories, values, color=bar_colors)
vertical_ax.set_title('Vertical Bar Plot')

horizontal_ax.barh(categories, values, color=bar_colors)
horizontal_ax.set_title('Horizontal Bar Plot')

plt.tight_layout()
plt.show()

# Scatter plot: relationship between two variables, with marker colour and
# marker size encoding two further random variables.
x = np.random.randn(100)
noise = np.random.randn(100) * 0.5
y = x * 2 + noise  # linear in x plus noise
colors = np.random.rand(100)
sizes = np.random.rand(100) * 200

marker_style = {'c': colors, 's': sizes, 'alpha': 0.6, 'cmap': 'viridis'}

plt.figure(figsize=(10, 6))
scatter = plt.scatter(x, y, **marker_style)
plt.colorbar(scatter)  # colour scale for the values passed as `c`
plt.title('Scatter Plot with Color and Size')
plt.show()

Subplots and Layouts

# Complex layout with GridSpec: a 3x3 grid whose cells are merged into one
# large panel, two small side panels and a full-width bottom panel.
# x, y, sizes, categories and values are reused from the earlier examples.
import matplotlib.gridspec as gridspec

fig = plt.figure(figsize=(14, 10))
gs = gridspec.GridSpec(3, 3, figure=fig)

# A line plot joins points in array order. Sorting by x keeps the lines from
# zig-zagging when x holds unordered (e.g. random) values, and is a no-op
# when x is already increasing.
order = np.argsort(x)
x_sorted = np.asarray(x)[order]
y_sorted = np.asarray(y)[order]

# Large plot spanning 2 rows, 2 columns
ax_main = fig.add_subplot(gs[0:2, 0:2])
ax_main.plot(x_sorted, y_sorted, 'b-')
ax_main.set_title('Main Plot')

# Side plots
ax_right1 = fig.add_subplot(gs[0, 2])
ax_right1.barh(categories[:3], values[:3])

ax_right2 = fig.add_subplot(gs[1, 2])
# Label the three wedges with the category names; the previously referenced
# `labels` name was never defined and raised NameError.
ax_right2.pie(sizes[:3], labels=categories[:3])

# Bottom plot
ax_bottom = fig.add_subplot(gs[2, :])
ax_bottom.plot(x_sorted, np.sin(x_sorted) * 100, 'r-')
ax_bottom.set_title('Bottom Plot')

plt.tight_layout()
plt.show()

Seaborn: Statistical Visualization

Distribution Plots

import seaborn as sns

# Apply the seaborn look globally and load seaborn's 'tips' example dataset.
sns.set_theme(style="whitegrid")
tips = sns.load_dataset('tips')

# Histogram with a KDE curve overlaid: once for all rows, once split by `time`.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_panels = (
    (axes[0], None, 'Histogram with KDE'),
    (axes[1], 'time', 'Histogram by Time'),
)
for panel_ax, hue_column, panel_title in hist_panels:
    # hue=None is seaborn's default, i.e. no grouping.
    sns.histplot(data=tips, x='total_bill', hue=hue_column, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# KDE plot: smoothed density of the bill amount, overall and per day.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
overall_ax, by_day_ax = axes

sns.kdeplot(data=tips, x='total_bill', fill=True, ax=overall_ax)
overall_ax.set_title('KDE Plot')

sns.kdeplot(data=tips, x='total_bill', hue='day', fill=True, ax=by_day_ax)
by_day_ax.set_title('KDE by Day')

plt.tight_layout()
plt.show()

Categorical Plots

# Three categorical views of the same question (total bill per day, split by
# sex), each drawn on its own 12x6 figure.
categorical_demos = (
    # Box plot
    (sns.boxplot, {}, 'Total Bill by Day and Gender'),
    # Violin plot; split=True draws the two hue levels as halves of one violin
    (sns.violinplot, {'split': True}, 'Violin Plot'),
    # Swarm plot; size sets the marker size
    (sns.swarmplot, {'size': 4}, 'Swarm Plot'),
)
for draw_plot, extra_options, figure_title in categorical_demos:
    plt.figure(figsize=(12, 6))
    draw_plot(data=tips, x='day', y='total_bill', hue='sex', **extra_options)
    plt.title(figure_title)
    plt.show()

Relationship Plots

# Scatter plot of tip against total bill, without and with a fitted
# regression line.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
scatter_ax, regression_ax = axes

sns.scatterplot(data=tips, x='total_bill', y='tip', ax=scatter_ax)
scatter_ax.set_title('Basic Scatter')

sns.regplot(data=tips, x='total_bill', y='tip', ax=regression_ax)
regression_ax.set_title('Scatter with Regression')

plt.tight_layout()
plt.show()

# Joint plot: the scatter together with each variable's marginal distribution.
g = sns.jointplot(data=tips, x='total_bill', y='tip', kind='scatter')
plt.show()

# Pair plot: a matrix of pairwise relationships, coloured by species.
iris = sns.load_dataset('iris')
g = sns.pairplot(iris, hue='species')
plt.show()

# Heatmap of the correlation matrix of the numeric columns only.
plt.figure(figsize=(10, 8))
numeric_columns = tips.select_dtypes(include=[np.number])
corr = numeric_columns.corr()
heatmap_style = {
    'annot': True,        # print the coefficient inside each cell
    'cmap': 'coolwarm',
    'center': 0,          # centre the colour map on zero correlation
    'square': True,
    'linewidths': 0.5,
}
sns.heatmap(corr, **heatmap_style)
plt.title('Correlation Heatmap')
plt.show()

Matrix Plots

# FacetGrid: one histogram of the bill amount per (sex, time) combination,
# laid out with `sex` on rows and `time` on columns.
g = sns.FacetGrid(tips, col='time', row='sex', height=4, aspect=1.2)
(
    g.map_dataframe(sns.histplot, x='total_bill', bins=15)
    .set_titles('{row_name} - {col_name}')
)
plt.show()

# PairGrid: choose a different plot type for each region of the matrix.
g = sns.PairGrid(iris, hue='species')
(
    g.map_upper(sns.scatterplot)   # above the diagonal
    .map_lower(sns.kdeplot)        # below the diagonal
    .map_diag(sns.histplot)        # on the diagonal
    .add_legend()
)
plt.show()

Customization and Themes

# Theme: grid style and default palette in a single call.
sns.set_theme(style="whitegrid", palette="muted")

# Custom palette: ten colours from the "husl" palette, installed as the
# default colour cycle (this replaces the "muted" palette set above).
palette = sns.color_palette("husl", 10)
sns.set_palette(palette)

# Matplotlib defaults applied to every figure created from here on.
custom_rc = {
    # on-screen figure
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
    # text sizes
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    # saved files
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
}
plt.rcParams.update(custom_rc)

Publication-Quality Visualizations

def create_publication_plot(data, x, y, hue=None, title="", filename=None):
    """Draw a publication-styled scatter plot, optionally save it, and show it.

    Applies seaborn's "whitegrid" style in the "paper" context (a global
    setting that remains in effect after the call), draws ``y`` against ``x``
    on a new 8x6 inch figure, hides the top and right spines and adds a faint
    dashed grid.

    Args:
        data: Dataset forwarded to ``sns.scatterplot`` as ``data``.
        x: Variable for the horizontal axis, forwarded to ``sns.scatterplot``.
        y: Variable for the vertical axis, forwarded to ``sns.scatterplot``.
        hue: Optional grouping variable used to colour the points. ``None``
            (or an empty string) draws all points without grouping.
        title: Title shown above the axes.
        filename: Output path *without* extension. When truthy,
            ``<filename>.png`` (300 DPI) and ``<filename>.pdf`` are written.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    sns.set_theme(style="whitegrid", context="paper")
    fig, ax = plt.subplots(figsize=(8, 6))

    # An empty string has always meant "no grouping" here; keep that.
    if isinstance(hue, str) and not hue:
        hue = None

    # Forward hue as-is: None is seaborn's own "no grouping" default, so no
    # branch is needed, and a truthiness test (`if hue:`) would raise
    # ValueError for array-like hue values such as a Series or ndarray.
    sns.scatterplot(data=data, x=x, y=y, hue=hue, s=100, alpha=0.7, ax=ax)

    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(True, alpha=0.3, linestyle='--')
    fig.tight_layout()

    if filename:
        # Save through the figure object rather than pyplot's implicit
        # "current figure", and do it before show(): saving afterwards can
        # produce an empty file once the window has been closed.
        fig.savefig(f'{filename}.png', dpi=300, bbox_inches='tight')
        fig.savefig(f'{filename}.pdf', bbox_inches='tight')
    plt.show()

Practical Example: Sales Dashboard

# --- Synthetic sales data -------------------------------------------------
# Fixed seed so the example produces the same 500 random sales records on
# every run.
np.random.seed(42)
dates = pd.date_range('2024-01-01', '2024-12-31', freq='D')
products = ['Product A', 'Product B', 'Product C']
regions = ['North', 'South', 'East', 'West']

data = {
    'date': np.random.choice(dates, 500),
    'product': np.random.choice(products, 500),
    'region': np.random.choice(regions, 500),
    'sales': np.random.randint(100, 1000, 500),
    'quantity': np.random.randint(1, 50, 500)
}
df = pd.DataFrame(data)
df['revenue'] = df['sales'] * df['quantity']

# --- Dashboard ------------------------------------------------------------
# Four panels on a 2x3 grid. The grid has exactly the two rows that are
# used; a third, never-filled row would leave the bottom of the figure
# blank. The figure height is scaled accordingly (12 -> 8 inches) so each
# panel keeps the size it would have on a 3-row, 12-inch layout.
fig = plt.figure(figsize=(16, 8))
gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

# Top-left (two columns wide): total revenue per day as a filled line.
ax1 = fig.add_subplot(gs[0, :2])
daily_sales = df.groupby('date')['revenue'].sum()
ax1.plot(daily_sales.index, daily_sales.values, color='#2196F3', linewidth=1)
ax1.fill_between(daily_sales.index, daily_sales.values, alpha=0.2)
ax1.set_title('Daily Revenue Trend', fontweight='bold')

# Top-right: share of total revenue per product.
ax2 = fig.add_subplot(gs[0, 2])
product_revenue = df.groupby('product')['revenue'].sum()
ax2.pie(product_revenue, labels=product_revenue.index, autopct='%1.1f%%',
        colors=['#FF6B6B', '#4ECDC4', '#45B7D1'])
ax2.set_title('Revenue by Product')

# Bottom-left (two columns wide): grouped bars, one group per region and one
# bar per product (unstack() moves `product` into the columns).
ax3 = fig.add_subplot(gs[1, :2])
region_product = df.groupby(['region', 'product'])['revenue'].sum().unstack()
region_product.plot(kind='bar', ax=ax3, colormap='Set2')
ax3.set_title('Revenue by Region & Product')

# Bottom-right: distribution of the per-record sales value.
ax4 = fig.add_subplot(gs[1, 2])
sns.histplot(data=df, x='sales', kde=True, ax=ax4, color='#45B7D1')
ax4.set_title('Sales Distribution')

# Figure-level title, set on the figure object rather than through pyplot's
# implicit current figure.
fig.suptitle('Sales Dashboard 2024', fontsize=16, fontweight='bold', y=1.02)
plt.show()

Key Takeaways

Practice Exercise

Create a multi-panel figure with 4 different plot types using plt.subplots(2, 2)
Customize colors, fonts, and layout using sns.set_theme and plt.rcParams
Build a Seaborn FacetGrid that facets a dataset by two categorical variables
Compute and visualize a correlation heatmap with annotations
Create a publication-quality scatter plot with regression line, removing top/right spines
Export your final figure in both PNG (300 DPI) and PDF formats
Build a mini-dashboard with at least 5 panels summarizing a dataset

Matplotlib and Seaborn: Data Visualization

The Grammar of Graphics

Matplotlib: The Foundation

Basic Plot Structure

Essential Plot Types

Subplots and Layouts

Seaborn: Statistical Visualization

Distribution Plots

Categorical Plots

Relationship Plots

Matrix Plots

Customization and Themes

Publication-Quality Visualizations

Practical Example: Sales Dashboard

Key Takeaways

Practice Exercise

Need Expert Data Science Help?