Project 1: EDA on Real Dataset
This project guides you through a complete Exploratory Data Analysis workflow using a real-world dataset.
EDA Workflow
<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
<rect width="600" height="400" fill="#f8f9fa" rx="10"/>
<text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">EDA Workflow Pipeline</text>
<!-- Steps -->
<rect x="50" y="60" width="100" height="50" fill="#3498db" rx="5"/>
<text x="100" y="90" text-anchor="middle" font-size="12" fill="white">1. Load Data</text>
<rect x="170" y="60" width="100" height="50" fill="#2ecc71" rx="5"/>
<text x="220" y="90" text-anchor="middle" font-size="12" fill="white">2. Clean Data</text>
<rect x="290" y="60" width="100" height="50" fill="#e74c3c" rx="5"/>
<text x="340" y="90" text-anchor="middle" font-size="12" fill="white">3. Explore</text>
<rect x="410" y="60" width="100" height="50" fill="#f39c12" rx="5"/>
<text x="460" y="90" text-anchor="middle" font-size="12" fill="white">4. Visualize</text>
<rect x="530" y="60" width="60" height="50" fill="#9b59b6" rx="5"/>
<text x="560" y="90" text-anchor="middle" font-size="10" fill="white">5. Insights</text>
<!-- Arrows -->
<line x1="150" y1="85" x2="170" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="270" y1="85" x2="290" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="390" y1="85" x2="410" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="510" y1="85" x2="530" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<!-- Details -->
<text x="300" y="150" text-anchor="middle" font-size="14" fill="#2c3e50">Key Activities at Each Stage:</text>
<text x="50" y="180" font-size="11" fill="#3498db">&#8226; Load CSV/Excel/SQL</text>
<text x="50" y="200" font-size="11" fill="#3498db">&#8226; Check dtypes</text>
<text x="50" y="220" font-size="11" fill="#3498db">&#8226; First/last rows</text>
<text x="170" y="180" font-size="11" fill="#2ecc71">&#8226; Handle missing</text>
<text x="170" y="200" font-size="11" fill="#2ecc71">&#8226; Remove duplicates</text>
<text x="170" y="220" font-size="11" fill="#2ecc71">&#8226; Fix data types</text>
<text x="290" y="180" font-size="11" fill="#e74c3c">&#8226; Distributions</text>
<text x="290" y="200" font-size="11" fill="#e74c3c">&#8226; Correlations</text>
<text x="290" y="220" font-size="11" fill="#e74c3c">&#8226; Outliers</text>
<text x="410" y="180" font-size="11" fill="#f39c12">&#8226; Histograms</text>
<text x="410" y="200" font-size="11" fill="#f39c12">&#8226; Box plots</text>
<text x="410" y="220" font-size="11" fill="#f39c12">&#8226; Heatmaps</text>
<text x="500" y="180" font-size="11" fill="#9b59b6">&#8226; Patterns</text>
<text x="500" y="200" font-size="11" fill="#9b59b6">&#8226; Anomalies</text>
<text x="500" y="220" font-size="11" fill="#9b59b6">&#8226; Features</text>
<defs>
<marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto">
<path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
</marker>
</defs>
</svg>
Project Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Read the raw data from disk into a DataFrame
df = pd.read_csv('dataset.csv')

# First look at the data: dimensions, column dtypes, a sample of rows and
# summary statistics, emitted one report per line group.
print(
    f"Shape: {df.shape}",
    f"\nColumn types:\n{df.dtypes}",
    f"\nFirst 5 rows:\n{df.head()}",
    f"\nBasic statistics:\n{df.describe()}",
    sep="\n",
)
Data Cleaning
# Count missing cells per column and express them as a share of all rows
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100)

# One row per column, most incomplete columns first
missing_df = pd.concat(
    [missing.rename('Missing'), missing_percent.rename('Percent')],
    axis=1,
).sort_values(by='Percent', ascending=False)
# Handle missing values
def handle_missing(df):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their column median.  ``object`` and
    ``category`` columns are filled with their most frequent value (the
    first entry of ``mode()`` when several values tie).  A column with no
    observed values at all is left as it is, because there is nothing to
    impute from.

    The input frame is not modified.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller keeps the raw data for comparison.
    cleaned = df.copy()

    # Numerical columns: fill with median
    num_cols = cleaned.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0:
        cleaned[num_cols] = cleaned[num_cols].fillna(cleaned[num_cols].median())

    # Categorical columns: fill with mode
    cat_cols = cleaned.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        modes = cleaned[col].mode()
        if modes.empty:
            # Entirely missing column: mode() is empty, indexing it would raise.
            continue
        cleaned[col] = cleaned[col].fillna(modes.iloc[0])

    return cleaned
# Impute missing values, then work with the result from here on
df_clean = handle_missing(df)

# Remove duplicates: keep the first occurrence of every repeated row and
# report the number of repeats before and after.
print(f"Duplicates before: {df_clean.duplicated().sum()}")
df_clean = df_clean[~df_clean.duplicated()]
print(f"Duplicates after: {df_clean.duplicated().sum()}")
Univariate Analysis
def univariate_analysis(df, column):
    """Comprehensive univariate analysis for a single column.

    Draws a one-row, three-panel figure, shows it, and prints the column's
    ``describe()`` summary.

    * Numeric columns (any integer or float dtype; booleans excluded):
      histogram, box plot and normal Q-Q plot of the non-missing values.
    * All other columns: bar chart and pie chart of the value counts; the
      unused third panel is hidden.

    Args:
        df: DataFrame containing the column.
        column: Label of the column to analyse.
    """
    series = df[column]
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Test the dtype family instead of the exact names 'int64'/'float64', so
    # int32, float32 and nullable numeric columns are plotted as numbers too.
    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    if is_numeric:
        # Numerical: drop missing values first, they would otherwise break
        # the histogram range and the Q-Q fit.
        values = series.dropna().to_numpy(dtype=float)
        if values.size > 0:
            axes[0].hist(values, bins=30, edgecolor='black', alpha=0.7)
            axes[1].boxplot(values)
        axes[0].set_title(f'{column} Distribution')
        axes[1].set_title(f'{column} Boxplot')
        # QQ plot (the fitted reference line needs at least two points)
        if values.size > 1:
            stats.probplot(values, dist="norm", plot=axes[2])
        axes[2].set_title(f'{column} QQ Plot')
    else:
        # Categorical
        counts = series.value_counts()
        # Plain string labels give one bar per category whatever the
        # underlying value type is.
        labels = counts.index.astype(str)
        axes[0].bar(labels, counts.values)
        axes[0].set_title(f'{column} Counts')
        axes[0].tick_params(axis='x', rotation=45)
        # Pie chart
        if not counts.empty:
            axes[1].pie(counts.values, labels=labels, autopct='%1.1f%%')
        axes[1].set_title(f'{column} Distribution')
        # Only two panels are used here; hide the empty third frame.
        axes[2].axis('off')

    plt.tight_layout()
    plt.show()
    # Release the figure so calling this once per column does not pile up
    # open figures.
    plt.close(fig)

    print(f"\nStatistics for {column}:")
    print(series.describe())
# Run the univariate report for every column of the cleaned frame
# (iterating a DataFrame yields its column labels)
for col in df_clean:
    univariate_analysis(df_clean, col)
Bivariate Analysis
# Correlation matrix
# Correlation is only defined for numeric data, and pandas >= 2.0 raises on
# text columns instead of silently dropping them, so select the numeric
# columns explicitly. The matrix is computed before the figure is created so
# a failure does not leave an empty figure behind.
corr_matrix = df_clean.select_dtypes(include=[np.number]).corr()
if corr_matrix.empty:
    print("No numeric columns available for a correlation matrix.")
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.show()
# Scatter plot matrix
# 'target_column' is a placeholder: set hue_col to the column whose values
# should colour the points. If the frame has no such column the matrix is
# drawn without colouring instead of failing on the missing column.
hue_col = 'target_column'
sns.pairplot(df_clean, hue=hue_col if hue_col in df_clean.columns else None)
plt.show()
# Categorical vs Numerical
def cat_vs_num_analysis(df, cat_col, num_col):
    """Plot ``num_col`` per category of ``cat_col`` and test for a difference.

    Shows a box plot, then prints the p-value of a rank-based test on the
    non-missing ``num_col`` values of each category: a two-sided
    Mann-Whitney U test for exactly two categories, Kruskal-Wallis for three
    or more.  Categories without any non-missing value are ignored; if fewer
    than two categories remain, the test is skipped and a message is printed.

    Args:
        df: DataFrame holding both columns.
        cat_col: Label of the grouping (categorical) column.
        num_col: Label of the numeric column to compare across groups.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=cat_col, y=num_col, data=df)
    plt.title(f'{num_col} by {cat_col}')
    plt.xticks(rotation=45)
    plt.show()

    # Statistical test
    samples = [group[num_col].dropna() for _, group in df.groupby(cat_col)]
    # Empty samples (all values missing, or unobserved categories) cannot be
    # ranked and would make the tests fail.
    samples = [sample for sample in samples if not sample.empty]

    if len(samples) < 2:
        print("Statistical test skipped: need at least two non-empty groups.")
        return

    if len(samples) == 2:
        # State the alternative explicitly rather than relying on the
        # library default.
        _, p_value = stats.mannwhitneyu(*samples, alternative='two-sided')
    else:
        _, p_value = stats.kruskal(*samples)
    print(f"Statistical test p-value: {p_value:.4f}")
Visualization Best Practices
# Global plotting defaults: seaborn's white grid theme and a 12x8 inch
# default figure size
sns.set_style(style="whitegrid")
plt.rc('figure', figsize=(12, 8))
# Time series visualization
def plot_time_series(df, date_col, value_col):
    """Plot ``value_col`` against ``date_col`` as a chronological line chart.

    Text dates are parsed with ``pd.to_datetime`` so the x-axis is a real
    time axis rather than one tick per distinct string, and the points are
    drawn in ascending date order so the line does not double back over
    itself.  If the text cannot be parsed as dates, the column is plotted
    unchanged in its original row order.

    Args:
        df: DataFrame holding both columns; it is not modified.
        date_col: Label of the column with the time stamps.
        value_col: Label of the column plotted on the y-axis.
    """
    # Positional index so both columns can be reordered together even when
    # the frame's own index has duplicates.
    dates = df[date_col].reset_index(drop=True)
    values = df[value_col].reset_index(drop=True)

    is_text = (
        pd.api.types.is_object_dtype(dates)
        or pd.api.types.is_string_dtype(dates)
    )
    if is_text:
        try:
            dates = pd.to_datetime(dates)
        except (ValueError, TypeError):
            # Not date-like text: keep the values and the row order as given.
            pass
        else:
            is_text = False

    if not is_text:
        # Stable sort keeps the original order among identical time stamps.
        order = dates.sort_values(kind='stable').index
        dates, values = dates.loc[order], values.loc[order]

    plt.figure(figsize=(14, 6))
    plt.plot(dates, values, marker='o', markersize=3)
    plt.title(f'{value_col} Over Time')
    plt.xlabel('Date')
    plt.ylabel(value_col)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Distribution comparison
def compare_distributions(df, col, group_col):
    """Overlay one semi-transparent histogram of ``col`` per ``group_col`` value.

    Rows where either column is missing are left out.  For numeric data all
    groups share the same 30 bin edges, computed over the pooled values, so
    bar heights are directly comparable between groups.

    Args:
        df: DataFrame holding both columns.
        col: Label of the column whose distribution is plotted.
        group_col: Label of the column that defines the groups.
    """
    # A missing group key never matches an equality test, and missing values
    # break the histogram range, so drop both up front.
    usable = df[col].notna() & df[group_col].notna()
    values = df.loc[usable, col]
    groups = df.loc[usable, group_col]

    if pd.api.types.is_numeric_dtype(values) and not values.empty:
        # Shared edges: otherwise each group would get its own 30 bins over
        # its own range and the overlays would not line up.
        bins = np.histogram_bin_edges(values.to_numpy(dtype=float), bins=30)
    else:
        bins = 30

    _, ax = plt.subplots(figsize=(10, 6))
    for group in groups.unique():
        ax.hist(values[groups == group], alpha=0.5, label=str(group), bins=bins)
    ax.set_title(f'{col} Distribution by {group_col}')
    ax.legend()
    plt.show()
Key Takeaways
- Always start with data shape and types
- Handle missing values systematically
- Use multiple visualization types for comprehensive understanding
- Document findings at each step
- Generate actionable insights from patterns