CW

Project 1: EDA on Real Dataset

Module 6: EDA Project — Free Lesson

Advertisement

Project 1: EDA on Real Dataset

This project guides you through a complete Exploratory Data Analysis workflow using a real-world dataset.

EDA Workflow

<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
  <rect width="600" height="400" fill="#f8f9fa" rx="10"/>
  <text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">EDA Workflow Pipeline</text>
  
  <!-- Steps -->
  <rect x="50" y="60" width="100" height="50" fill="#3498db" rx="5"/>
  <text x="100" y="90" text-anchor="middle" font-size="12" fill="white">1. Load Data</text>
  
  <rect x="170" y="60" width="100" height="50" fill="#2ecc71" rx="5"/>
  <text x="220" y="90" text-anchor="middle" font-size="12" fill="white">2. Clean Data</text>
  
  <rect x="290" y="60" width="100" height="50" fill="#e74c3c" rx="5"/>
  <text x="340" y="90" text-anchor="middle" font-size="12" fill="white">3. Explore</text>
  
  <rect x="410" y="60" width="100" height="50" fill="#f39c12" rx="5"/>
  <text x="460" y="90" text-anchor="middle" font-size="12" fill="white">4. Visualize</text>
  
  <rect x="530" y="60" width="60" height="50" fill="#9b59b6" rx="5"/>
  <text x="560" y="90" text-anchor="middle" font-size="10" fill="white">5. Insights</text>
  
  <!-- Arrows -->
  <line x1="150" y1="85" x2="170" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="270" y1="85" x2="290" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="390" y1="85" x2="410" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="510" y1="85" x2="530" y2="85" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  
  <!-- Details -->
  <text x="300" y="150" text-anchor="middle" font-size="14" fill="#2c3e50">Key Activities at Each Stage:</text>
  
  <text x="50" y="180" font-size="11" fill="#3498db">• Load CSV/Excel/SQL</text>
  <text x="50" y="200" font-size="11" fill="#3498db">• Check dtypes</text>
  <text x="50" y="220" font-size="11" fill="#3498db">• First/last rows</text>
  
  <text x="170" y="180" font-size="11" fill="#2ecc71">• Handle missing</text>
  <text x="170" y="200" font-size="11" fill="#2ecc71">• Remove duplicates</text>
  <text x="170" y="220" font-size="11" fill="#2ecc71">• Fix data types</text>
  
  <text x="290" y="180" font-size="11" fill="#e74c3c">• Distributions</text>
  <text x="290" y="200" font-size="11" fill="#e74c3c">• Correlations</text>
  <text x="290" y="220" font-size="11" fill="#e74c3c">• Outliers</text>
  
  <text x="410" y="180" font-size="11" fill="#f39c12">• Histograms</text>
  <text x="410" y="200" font-size="11" fill="#f39c12">• Box plots</text>
  <text x="410" y="220" font-size="11" fill="#f39c12">• Heatmaps</text>
  
  <text x="500" y="180" font-size="11" fill="#9b59b6">• Patterns</text>
  <text x="500" y="200" font-size="11" fill="#9b59b6">• Anomalies</text>
  <text x="500" y="220" font-size="11" fill="#9b59b6">• Features</text>
  
  <defs>
    <marker id="arrow" markerWidth="10" markerHeight="10" refX="9" refY="3" orient="auto">
      <path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
    </marker>
  </defs>
</svg>

Project Setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Read the raw CSV into a DataFrame; the steps below all start from `df`
df = pd.read_csv('dataset.csv')

# First look at the data, one section at a time: dimensions, per-column
# dtypes, the leading rows, and summary statistics
_section = f"Shape: {df.shape}"
print(_section)
_section = f"\nColumn types:\n{df.dtypes}"
print(_section)
_section = f"\nFirst 5 rows:\n{df.head()}"
print(_section)
_section = f"\nBasic statistics:\n{df.describe()}"
print(_section)

Data Cleaning

# Per-column count of null cells, and the same count as a percentage of rows
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100)

# One row per column of `df`, worst-affected columns first
missing_df = pd.DataFrame({'Missing': missing, 'Percent': missing_percent})
missing_df = missing_df.sort_values('Percent', ascending=False)

# Handle missing values
def handle_missing(df):
    """Return a copy of ``df`` with missing values imputed.

    Numeric columns are filled with their column median and object-dtype
    columns with their most frequent value. The input frame is left
    untouched, so the raw data stays available for comparison.

    Args:
        df: DataFrame to impute.

    Returns:
        A new DataFrame with the same columns as ``df``. A column with no
        observed values at all is returned unchanged, because it has no
        median or mode to fill with.
    """
    # Work on a copy: assigning into the caller's frame would overwrite the
    # raw data and make the "clean" and "raw" frames the same object.
    filled = df.copy()

    # Numerical columns: fill with median
    num_cols = filled.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0:
        filled[num_cols] = filled[num_cols].fillna(filled[num_cols].median())

    # Categorical columns: fill with mode
    cat_cols = filled.select_dtypes(include=['object']).columns
    for col in cat_cols:
        modes = filled[col].mode()
        # mode() is empty when every value is missing; indexing it would raise.
        if modes.empty:
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

# Impute missing values first; deduplication then runs on the filled data
df_clean = handle_missing(df)

# Report how many rows repeat an earlier row, drop the repeats (keeping the
# first occurrence of each), then report again to confirm none remain
print(f"Duplicates before: {df_clean.duplicated().sum()}")
df_clean = df_clean.drop_duplicates(keep='first')
print(f"Duplicates after: {df_clean.duplicated().sum()}")

Univariate Analysis

def univariate_analysis(df, column):
    """Plot and summarise the distribution of a single column.

    Numeric columns get a histogram, a box plot and a normal Q-Q plot.
    Every other column (including booleans) gets a bar chart and a pie
    chart of its value counts. The figure is shown and the column's
    ``describe()`` output is printed.

    Args:
        df: DataFrame containing ``column``.
        column: Label of the column to analyse.
    """
    series = df[column]

    # Detect numeric data by kind rather than by exact dtype name, so int32,
    # float32, unsigned and nullable numeric columns are recognised as well.
    # Booleans are two-valued and are shown as counts instead.
    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    if is_numeric:
        # Plot observed values only: NaN would blank the box plot and
        # propagate into the Q-Q plot's fitted line.
        values = series.dropna()

        axes[0].hist(values, bins=30, edgecolor='black', alpha=0.7)
        axes[0].set_title(f'{column} Distribution')

        axes[1].boxplot(values)
        axes[1].set_title(f'{column} Boxplot')

        # QQ plot
        stats.probplot(values, dist="norm", plot=axes[2])
        axes[2].set_title(f'{column} QQ Plot')
    else:
        value_counts = series.value_counts()
        # String labels keep the bar positions categorical even when the
        # categories themselves are numbers or booleans.
        labels = value_counts.index.astype(str)

        axes[0].bar(labels, value_counts.values)
        axes[0].set_title(f'{column} Counts')
        axes[0].tick_params(axis='x', rotation=45)

        # Pie chart
        axes[1].pie(value_counts.values, labels=labels, autopct='%1.1f%%')
        axes[1].set_title(f'{column} Distribution')

        # Only two panels are used here; hide the empty third frame.
        axes[2].axis('off')

    plt.tight_layout()
    plt.show()

    print(f"\nStatistics for {column}:")
    print(series.describe())

# Run the single-column analysis once per column of the cleaned data
# (iterating a DataFrame yields its column labels)
for col in df_clean:
    univariate_analysis(df_clean, col)

Bivariate Analysis

# Correlation matrix
# Correlation is only defined for numeric data, and DataFrame.corr() raises
# on text columns in pandas >= 2.0, so restrict to numeric columns first.
plt.figure(figsize=(12, 8))
corr_matrix = df_clean.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Scatter plot matrix
# 'target_column' is a placeholder: rename it to the column to colour by.
# If the data has no such column the plot is drawn without colouring
# instead of failing on the missing label.
sns.pairplot(
    df_clean,
    hue='target_column' if 'target_column' in df_clean.columns else None,
)
plt.show()

# Categorical vs Numerical
def cat_vs_num_analysis(df, cat_col, num_col):
    """Compare a numeric column across the levels of a categorical column.

    Shows a box plot of ``num_col`` for each value of ``cat_col``, then
    prints the p-value of a rank-based test on the per-category samples:
    Mann-Whitney U (two-sided) for exactly two groups, Kruskal-Wallis for
    more. If fewer than two categories have any observed values, the test
    is skipped and a message is printed instead.

    Args:
        df: DataFrame containing both columns.
        cat_col: Label of the grouping column.
        num_col: Label of the numeric column to compare.
    """
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=cat_col, y=num_col, data=df)
    plt.title(f'{num_col} by {cat_col}')
    plt.xticks(rotation=45)
    plt.show()

    # Statistical test
    # Keep only categories that still have values once NaN is dropped; an
    # empty sample cannot take part in either test.
    samples = []
    for _, group in df.groupby(cat_col):
        observed = group[num_col].dropna()
        if len(observed) > 0:
            samples.append(observed)

    if len(samples) < 2:
        # Both tests compare at least two samples; kruskal raises otherwise.
        print("Statistical test skipped: need at least two groups with data")
        return

    if len(samples) == 2:
        # State the alternative explicitly so the p-value does not depend on
        # the installed SciPy version's default.
        _, p_value = stats.mannwhitneyu(*samples, alternative='two-sided')
    else:
        _, p_value = stats.kruskal(*samples)
    print(f"Statistical test p-value: {p_value:.4f}")

Visualization Best Practices

# Global plotting defaults: seaborn's "whitegrid" style, and a 12 by 8
# default size for figures that do not pass their own figsize
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Time series visualization
def plot_time_series(df, date_col, value_col):
    """Draw ``value_col`` against ``date_col`` as a line with small markers.

    Rows are plotted in the order they appear in ``df``; the figure is
    shown before returning.

    Args:
        df: DataFrame holding both columns.
        date_col: Label of the column used for the x axis.
        value_col: Label of the column used for the y axis.
    """
    plt.figure(figsize=(14, 6))

    x_values = df[date_col]
    y_values = df[value_col]
    plt.plot(x_values, y_values, marker='o', markersize=3)

    # Axis labels and title
    plt.xlabel('Date')
    plt.ylabel(value_col)
    plt.title(f'{value_col} Over Time')

    # Slant the tick labels, then let matplotlib fit everything in the figure
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Distribution comparison
def compare_distributions(df, col, group_col):
    """Overlay one histogram of ``col`` per value of ``group_col``.

    All groups are binned on the same 30 edges, computed from the whole
    column, so bar heights line up and can be compared between groups.
    Missing values in ``col`` and rows with a missing group label are left
    out of the plot.

    Args:
        df: DataFrame holding both columns.
        col: Label of the numeric column whose distribution is compared.
        group_col: Label of the column that defines the groups.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # Shared bin edges: if each group picked its own 30 bins over its own
    # range, overlaid bars would have different widths and positions.
    observed = df[col].dropna()
    bins = np.histogram_bin_edges(observed, bins=30) if len(observed) > 0 else 30

    # sort=False keeps groups in order of first appearance. groupby also
    # skips rows whose label is NaN, which an `== label` filter can never
    # match anyway.
    for group, subset in df.groupby(group_col, sort=False)[col]:
        values = subset.dropna()
        if values.empty:
            continue
        ax.hist(values, alpha=0.5, label=str(group), bins=bins)

    ax.set_title(f'{col} Distribution by {group_col}')
    ax.legend()
    plt.show()

Key Takeaways

  1. Always start with data shape and types
  2. Handle missing values systematically
  3. Use multiple visualization types for comprehensive understanding
  4. Document findings at each step
  5. Generate actionable insights from patterns

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement