Data Visualization — matplotlib and seaborn Complete Guide

Data Visualization in Python

Visualization is not decoration — it is the primary tool for understanding data. Python's ecosystem offers matplotlib for full control and seaborn for beautiful statistical plots.

matplotlib — The Foundation

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, gaussian_kde
import warnings; warnings.filterwarnings("ignore")

# Global look-and-feel plus a fixed seed so the random data below is reproducible.
plt.style.use("seaborn-v0_8-whitegrid")
np.random.seed(42)

# ── 1. Line Plot ──────────────────────────────────────────
# Three curves sampled at 300 points on [0, 4π], described as
# (y-values, format string, line width, legend label).
x = np.linspace(0, 4 * np.pi, 300)
curves = [
    (np.sin(x), "b-", 2.5, "sin(x)"),
    (np.cos(x), "r--", 2.5, "cos(x)"),
    (np.sin(x) * np.exp(-0.3 * x), "g-.", 2, "damped sin"),
]

fig, ax = plt.subplots(figsize=(10, 5))
for y_values, fmt, width, name in curves:
    ax.plot(x, y_values, fmt, lw=width, label=name)

# Thin horizontal reference line at y = 0.
ax.axhline(0, color="black", lw=0.8)

ax.set_title("Trigonometric Functions", fontsize=15, fontweight="bold")
ax.set_xlabel("x (radians)", fontsize=13)
ax.set_ylabel("Amplitude", fontsize=13)
ax.set_xlim(0, 4 * np.pi)
ax.legend(fontsize=12, loc="upper right")

# Annotate a feature: arrow pointing at the first maximum of sin(x).
peak = (np.pi / 2, 1)
ax.annotate(
    "Maximum",
    xy=peak,
    xytext=(peak[0] + 0.8, 1.1),
    arrowprops={"arrowstyle": "->", "color": "blue"},
    fontsize=11,
    color="blue",
)

fig.tight_layout()
fig.savefig("line_plot.png", dpi=150, bbox_inches="tight")
plt.show()

# ── 2. Scatter Plot with Color Encoding ──────────────────
# Synthetic data: y depends linearly on x plus noise; each point gets a
# random group label that is only used for colouring.
n = 300
x_data = np.random.randn(n)
y_data = 0.8 * x_data + np.random.randn(n) * 0.6
colors = np.random.choice(["A","B","C"], n)
color_map = {"A": "#2196F3", "B": "#F44336", "C": "#4CAF50"}

fig, ax = plt.subplots(figsize=(8, 6))

# One scatter call per group so each group gets its own legend entry.
for cat, hex_color in color_map.items():
    in_group = colors == cat
    ax.scatter(
        x_data[in_group],
        y_data[in_group],
        c=hex_color,
        alpha=0.7,
        s=50,
        edgecolors="white",
        lw=0.5,
        label=f"Group {cat}",
    )

# Regression line: degree-1 least-squares fit over all points.
slope, intercept = np.polyfit(x_data, y_data, 1)
fit_x = np.linspace(x_data.min(), x_data.max(), 100)
fit_y = slope * fit_x + intercept
ax.plot(fit_x, fit_y, "k--", lw=2, label=f"y = {slope:.2f}x + {intercept:.2f}")

ax.set_xlabel("X Variable")
ax.set_ylabel("Y Variable")
ax.set_title("Scatter Plot with Groups")
ax.legend()

fig.tight_layout()
fig.savefig("scatter.png", dpi=150)
plt.show()

# ── 3. Histogram with KDE overlay ──────────────────────
# Two normally distributed samples of 500 values each.
data_a = np.random.normal(60, 12, 500)
data_b = np.random.normal(75, 10, 500)

# (legend label, sample, colour) for each group.
series = [
    ("Group A", data_a, "#2196F3"),
    ("Group B", data_b, "#F44336"),
]

fig, ax = plt.subplots(figsize=(9, 5))

# density=True normalises the bars so they share a scale with the KDE curves.
for group_label, sample, shade in series:
    ax.hist(
        sample,
        bins=30,
        density=True,
        alpha=0.5,
        color=shade,
        edgecolor="white",
        label=group_label,
    )

# Overlay a Gaussian KDE per group, evaluated slightly beyond the data range.
for _, sample, shade in series:
    density = gaussian_kde(sample)
    grid = np.linspace(sample.min() - 5, sample.max() + 5, 200)
    ax.plot(grid, density(grid), color=shade, lw=2.5)

ax.set_xlabel("Value")
ax.set_ylabel("Density")
ax.set_title("Histogram + KDE Comparison")
ax.legend()

fig.tight_layout()
fig.savefig("histogram_kde.png", dpi=150)
plt.show()

# ── 4. Subplots Grid ──────────────────────────────────────
# Six chart types on one 2x3 figure; `axes` is indexed as axes[row, col].
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
data = np.random.randn(200)  # sample reused by the Q-Q plot below

# Bar chart — one bar per quarter, with the value printed just above each bar.
categories = ["Q1","Q2","Q3","Q4"]
values = [42, 58, 51, 67]
axes[0,0].bar(categories, values, color=["steelblue","coral","mediumseagreen","gold"],
               edgecolor="black", alpha=0.8)
for i, v in enumerate(values):
    axes[0,0].text(i, v+0.5, str(v), ha="center", fontsize=11)
axes[0,0].set_title("Bar Chart"); axes[0,0].set_ylabel("Count")

# Box plot — four normal samples of 50 values, given as (mean, std) pairs.
groups = [np.random.normal(m, s, 50) for m, s in [(65,8),(72,10),(60,12),(78,7)]]
# patch_artist=True makes the boxes fillable patches so they can be coloured below.
bp = axes[0,1].boxplot(groups, patch_artist=True,
                        medianprops=dict(color="red", lw=2))
# Tick labels are set on the axes rather than through boxplot(labels=...):
# that keyword was deprecated in matplotlib 3.9 (renamed tick_labels), while
# set_xticklabels works on old and new versions alike.
axes[0,1].set_xticklabels(["A","B","C","D"])
colors_bp = ["#AED6F1","#A9DFBF","#F9E79F","#F1948A"]
for patch, c in zip(bp["boxes"], colors_bp):
    patch.set_facecolor(c)
axes[0,1].set_title("Box Plot"); axes[0,1].set_ylabel("Score")

# Pie chart — autopct prints each wedge's percentage with one decimal.
sizes  = [35, 28, 18, 12, 7]
labels_pie = ["Alpha","Beta","Gamma","Delta","Others"]
axes[0,2].pie(sizes, labels=labels_pie, autopct="%1.1f%%",
               startangle=90, colors=plt.cm.Set3.colors[:5])
axes[0,2].set_title("Market Share")

# Heatmap (manual) — imshow of a hand-written 4x4 correlation matrix.
corr = np.array([[1.00, 0.72, -0.35, 0.15],
                  [0.72, 1.00,  0.10, 0.60],
                  [-0.35,0.10,  1.00, 0.45],
                  [0.15, 0.60,  0.45, 1.00]])
# vmin/vmax pin the colour scale to [-1, 1] so 0 sits at the colormap midpoint.
im = axes[1,0].imshow(corr, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto")
axes[1,0].set_xticks(range(4)); axes[1,0].set_xticklabels(["F1","F2","F3","F4"])
axes[1,0].set_yticks(range(4)); axes[1,0].set_yticklabels(["F1","F2","F3","F4"])
# Write each coefficient into its cell (text x = column j, y = row i).
for i in range(4):
    for j in range(4):
        axes[1,0].text(j, i, f"{corr[i,j]:.2f}", ha="center", va="center", fontsize=9)
plt.colorbar(im, ax=axes[1,0])
axes[1,0].set_title("Correlation Heatmap")

# Q-Q plot — compares `data` against a normal distribution.
from scipy import stats
stats.probplot(data, dist="norm", plot=axes[1,1])
axes[1,1].set_title("Q-Q Plot (Normality Check)")

# Violin plot — three normal samples of 80 values at x positions 1, 2, 3.
vdata = [np.random.normal(m,s,80) for m,s in [(70,8),(78,10),(65,15)]]
parts = axes[1,2].violinplot(vdata, positions=[1,2,3], showmedians=True)
for pc in parts["bodies"]:
    pc.set_facecolor("lightblue"); pc.set_alpha(0.8)
axes[1,2].set_xticks([1,2,3]); axes[1,2].set_xticklabels(["A","B","C"])
axes[1,2].set_title("Violin Plot"); axes[1,2].set_ylabel("Score")

plt.suptitle("matplotlib Chart Gallery", fontsize=16, fontweight="bold")
plt.tight_layout(); plt.savefig("chart_gallery.png", dpi=150); plt.show()

seaborn — Statistical Visualization

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Theme applied to every seaborn/matplotlib figure created from here on.
sns.set_theme(style="whitegrid", palette="husl", font_scale=1.1)

# seaborn example datasets, loaded by name.
tips   = sns.load_dataset("tips")
iris   = sns.load_dataset("iris")
titanic = sns.load_dataset("titanic")

# Six statistical plots on one 2x3 figure; each call draws into its own axes.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# 1. Histogram + KDE
sns.histplot(tips["total_bill"], kde=True, bins=25, ax=axes[0,0],
             color="steelblue", edgecolor="white")
axes[0,0].set_title("Bill Distribution")

# 2. Box by category — `order` fixes the left-to-right order of the days.
sns.boxplot(x="day", y="total_bill", hue="time", data=tips,
            order=["Thur","Fri","Sat","Sun"], ax=axes[0,1], palette="Set2")
axes[0,1].set_title("Bill by Day and Time")

# 3. Regression plot — the Pearson correlation is shown in the title.
sns.regplot(x="total_bill", y="tip", data=tips, ax=axes[0,2],
            scatter_kws={"alpha":0.5,"s":30},
            line_kws={"color":"red","lw":2.5})
r = tips["total_bill"].corr(tips["tip"])
axes[0,2].set_title(f"Bill vs Tip  (r = {r:.3f})")

# 4. Violin split by category — split=True draws one half-violin per sex.
sns.violinplot(x="day", y="tip", hue="sex", data=tips, split=True,
               order=["Thur","Fri","Sat","Sun"],
               palette="pastel", ax=axes[1,0])
axes[1,0].set_title("Tip Distribution: Split Violin")

# 5. Heatmap — correlations of the numeric iris columns (species dropped first).
corr_mat = iris.drop("species", axis=1).corr()
sns.heatmap(corr_mat, annot=True, fmt=".3f", cmap="RdBu_r",
            center=0, linewidths=0.5, ax=axes[1,1])
axes[1,1].set_title("Iris Feature Correlations")

# 6. Count plot
# `survived` holds the integers 0 and 1, so the palette must be keyed by those
# ints; string keys ("0"/"1") do not match the hue levels and seaborn rejects
# the palette as missing keys.
sns.countplot(x="pclass", hue="survived", data=titanic,
              palette={0:"#F44336",1:"#4CAF50"}, ax=axes[1,2])
axes[1,2].set_title("Titanic Survival by Class")
# Pass the handles explicitly: legend(labels=...) on its own pairs labels with
# the first artists on the axes, which are bars of the same hue level.
survival_handles, _ = axes[1,2].get_legend_handles_labels()
axes[1,2].legend(handles=survival_handles, labels=["No","Yes"], title="Survived")

plt.suptitle("seaborn Statistical Plots", fontsize=15, fontweight="bold")
plt.tight_layout(); plt.savefig("seaborn_gallery.png", dpi=150); plt.show()

# Pair Plot — relationships between all features
# Off-diagonal panels are scatter plots; the diagonal shows a KDE per species.
scatter_style = {"alpha": 0.6, "s": 30}
diagonal_style = {"linewidth": 2.5}
grid = sns.pairplot(
    iris,
    hue="species",
    diag_kind="kde",
    plot_kws=scatter_style,
    diag_kws=diagonal_style,
)
# y=1.02 places the title slightly above the top row of panels.
grid.fig.suptitle("Iris Pair Plot", y=1.02, fontsize=14)
plt.savefig("pair_plot.png", dpi=150, bbox_inches="tight")
plt.show()

Visualization Best Practices

Rule	Why It Matters
Always label axes with units	Without units, numbers are meaningless
Start bar charts at zero	Truncated y-axis exaggerates differences
Use colorblind-safe palettes	8% of men have color vision deficiency
Don't use 3D charts	Distorts perception, adds no information
Show sample size	n=3 bars look the same as n=3000 bars
Annotate key findings	Guide the reader to your conclusion

# Best practice: annotate your conclusions
# Monthly revenue for two years, in the units used on the y-axis label ($K).
months = ["Jan","Feb","Mar","Apr","May","Jun"]
revenue = [42, 38, 55, 61, 58, 72]
prev_year = [40, 42, 48, 52, 54, 65]

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(months, revenue, "bo-", lw=2.5, markersize=8, label="2024")
ax.plot(months, prev_year, "r--o", lw=2, markersize=7, label="2023")
# Shade the area between the two series; x positions are the month indices.
ax.fill_between(range(len(months)), revenue, prev_year, alpha=0.15, color="blue")

# Find the month where 2024 exceeds 2023 by the largest amount and point at it.
peak = int(np.argmax(np.subtract(revenue, prev_year)))
ax.annotate(
    f"+{revenue[peak]-prev_year[peak]}K peak gap",
    xy=(peak, revenue[peak]),
    xytext=(peak + 0.5, revenue[peak] + 3),
    arrowprops={"arrowstyle": "->", "color": "darkblue"},
    fontsize=11,
    color="darkblue",
)

ax.set_title("Monthly Revenue: 2024 vs 2023")
ax.set_ylabel("Revenue ($K)")
ax.set_ylim(bottom=0)  # start at 0!
ax.legend()

fig.tight_layout()
fig.savefig("annotated_chart.png", dpi=150)
plt.show()

Key Takeaways

matplotlib gives full control; seaborn provides beautiful statistical plots with less code
Always use subplots for multi-panel figures — fig, axes = plt.subplots(rows, cols)
Choose chart type by question: distribution → histogram/violin; relationship → scatter; change → line
plt.tight_layout() prevents labels from overlapping
plt.savefig("fig.png", dpi=150, bbox_inches="tight") saves the figure without clipping labels; raise dpi to 300 for publication-quality output
seaborn.pairplot shows all pairwise relationships at once — essential for EDA

→ Data Cleaning → Statistics

Data Visualization — matplotlib and seaborn Complete Guide

Data Visualization in Python

matplotlib — The Foundation

seaborn — Statistical Visualization

Visualization Best Practices

Key Takeaways

Need Expert Python Help?