CW

Capstone Foundation Project

Module 6: EDA ProjectFree Lesson

Advertisement

Capstone Foundation Project

This capstone project integrates all foundation skills into a complete data analysis pipeline.

Project Structure

<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
  <rect width="600" height="400" fill="#f8f9fa" rx="10"/>
  <text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">Data Analysis Pipeline</text>
  
  <!-- Pipeline Stages -->
  <rect x="50" y="80" width="120" height="60" fill="#3498db" rx="5"/>
  <text x="110" y="110" text-anchor="middle" font-size="11" fill="white">Data Collection</text>
  <text x="110" y="125" text-anchor="middle" font-size="9" fill="white">SQL/API/Web</text>
  
  <rect x="200" y="80" width="120" height="60" fill="#2ecc71" rx="5"/>
  <text x="260" y="110" text-anchor="middle" font-size="11" fill="white">Data Cleaning</text>
  <text x="260" y="125" text-anchor="middle" font-size="9" fill="white">Pandas/NumPy</text>
  
  <rect x="350" y="80" width="120" height="60" fill="#e74c3c" rx="5"/>
  <text x="410" y="110" text-anchor="middle" font-size="11" fill="white">Exploration</text>
  <text x="410" y="125" text-anchor="middle" font-size="9" fill="white">Stats/Visualization</text>
  
  <rect x="500" y="80" width="80" height="60" fill="#f39c12" rx="5"/>
  <text x="540" y="110" text-anchor="middle" font-size="11" fill="white">Insights</text>
  <text x="540" y="125" text-anchor="middle" font-size="9" fill="white">Report</text>
  
  <!-- Arrows -->
  <line x1="170" y1="110" x2="200" y2="110" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="320" y1="110" x2="350" y2="110" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  <line x1="470" y1="110" x2="500" y2="110" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
  
  <!-- File Structure -->
  <text x="300" y="180" text-anchor="middle" font-size="14" fill="#2c3e50">Project File Structure:</text>
  <text x="100" y="210" font-size="11" fill="#7f8c8d">capstone-project/</text>
  <text x="120" y="230" font-size="11" fill="#3498db">ā”œā”€ā”€ data/</text>
  <text x="140" y="250" font-size="11" fill="#3498db">│   ā”œā”€ā”€ raw/</text>
  <text x="140" y="270" font-size="11" fill="#3498db">│   └── processed/</text>
  <text x="120" y="290" font-size="11" fill="#2ecc71">ā”œā”€ā”€ notebooks/</text>
  <text x="140" y="310" font-size="11" fill="#2ecc71">│   ā”œā”€ā”€ 01_data_collection.ipynb</text>
  <text x="140" y="330" font-size="11" fill="#2ecc71">│   ā”œā”€ā”€ 02_cleaning.ipynb</text>
  <text x="140" y="350" font-size="11" fill="#2ecc71">│   └── 03_eda.ipynb</text>
  <text x="120" y="370" font-size="11" fill="#e74c3c">ā”œā”€ā”€ src/</text>
  <text x="120" y="390" font-size="11" fill="#f39c12">└── reports/</text>
  
  <defs>
    <marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto">
      <path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
    </marker>
  </defs>
</svg>

Data Collection Module

# data_collection.py
import pandas as pd
import requests
from sqlalchemy import create_engine

class DataCollector:
    def __init__(self):
        self.engine = create_engine('postgresql://user:pass@localhost/db')
    
    def from_sql(self, query):
        """Load data from SQL database"""
        return pd.read_sql(query, self.engine)
    
    def from_api(self, url, params=None):
        """Load data from REST API"""
        response = requests.get(url, params=params)
        response.raise_for_status()
        return pd.DataFrame(response.json()['data'])
    
    def from_csv(self, filepath):
        """Load data from CSV"""
        return pd.read_csv(filepath)
    
    def combine_sources(self, df1, df2, key, how='inner'):
        """Merge multiple data sources"""
        return pd.merge(df1, df2, on=key, how=how)

# Usage
collector = DataCollector()
sql_data = collector.from_sql("SELECT * FROM transactions")
api_data = collector.from_api("https://api.example.com/users")
combined = collector.combine_sources(sql_data, api_data, key='user_id')

Data Cleaning Pipeline

# data_cleaning.py
import pandas as pd
import numpy as np

class DataCleaner:
    def __init__(self, df):
        self.df = df.copy()
        self.cleaning_log = []
    
    def log_action(self, action, details):
        self.cleaning_log.append({
            'action': action,
            'details': details,
            'timestamp': pd.Timestamp.now()
        })
    
    def handle_missing(self, strategy='auto'):
        """Handle missing values"""
        before = self.df.isnull().sum().sum()
        
        if strategy == 'auto':
            # Numerical: median, Categorical: mode
            num_cols = self.df.select_dtypes(include=[np.number]).columns
            cat_cols = self.df.select_dtypes(include=['object']).columns
            
            self.df[num_cols] = self.df[num_cols].fillna(self.df[num_cols].median())
            for col in cat_cols:
                self.df[col] = self.df[col].fillna(self.df[col].mode()[0])
        
        after = self.df.isnull().sum().sum()
        self.log_action('handle_missing', f'{before} -> {after} missing values')
        return self
    
    def remove_duplicates(self):
        """Remove duplicate rows"""
        before = len(self.df)
        self.df = self.df.drop_duplicates()
        after = len(self.df)
        self.log_action('remove_duplicates', f'{before} -> {after} rows')
        return self
    
    def fix_types(self, type_dict):
        """Fix data types"""
        for col, dtype in type_dict.items():
            if dtype == 'datetime':
                self.df[col] = pd.to_datetime(self.df[col])
            elif dtype == 'category':
                self.df[col] = self.df[col].astype('category')
            elif dtype == 'numeric':
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
        self.log_action('fix_types', f'Converted {list(type_dict.keys())}')
        return self
    
    def get_clean_data(self):
        return self.df

# Usage
cleaner = DataCleaner(df)
df_clean = (cleaner
    .handle_missing()
    .remove_duplicates()
    .fix_types({
        'date': 'datetime',
        'category': 'category',
        'amount': 'numeric'
    })
    .get_clean_data())

Analysis Module

# analysis.py
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

class DataAnalyzer:
    def __init__(self, df):
        self.df = df
        self.results = {}
    
    def descriptive_stats(self):
        """Calculate descriptive statistics"""
        self.results['descriptive'] = {
            'numerical': self.df.describe(),
            'categorical': self.df.describe(include=['object', 'category'])
        }
        return self
    
    def correlation_analysis(self, method='pearson'):
        """Analyze correlations"""
        corr_matrix = self.df.select_dtypes(include=[np.number]).corr(method=method)
        self.results['correlation'] = corr_matrix
        return self
    
    def distribution_test(self, column):
        """Test distribution normality"""
        stat, p_value = stats.shapiro(self.df[column].dropna())
        self.results[f'distribution_{column}'] = {
            'statistic': stat,
            'p_value': p_value,
            'normal': p_value > 0.05
        }
        return self
    
    def outlier_detection(self, method='iqr'):
        """Detect outliers"""
        num_cols = self.df.select_dtypes(include=[np.number]).columns
        outliers = {}
        
        for col in num_cols:
            if method == 'iqr':
                Q1 = self.df[col].quantile(0.25)
                Q3 = self.df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower = Q1 - 1.5 * IQR
                upper = Q3 + 1.5 * IQR
                outliers[col] = {
                    'count': ((self.df[col] < lower) | (self.df[col] > upper)).sum(),
                    'bounds': (lower, upper)
                }
        
        self.results['outliers'] = outliers
        return self
    
    def visualize_findings(self):
        """Create visualization dashboard"""
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        
        # Correlation heatmap
        if 'correlation' in self.results:
            sns.heatmap(self.results['correlation'], annot=True, 
                       cmap='coolwarm', center=0, ax=axes[0, 0])
            axes[0, 0].set_title('Correlation Matrix')
        
        # Distribution of first numerical column
        num_col = self.df.select_dtypes(include=[np.number]).columns[0]
        axes[0, 1].hist(self.df[num_col].dropna(), bins=30, edgecolor='black')
        axes[0, 1].set_title(f'{num_col} Distribution')
        
        # Box plot for outliers
        if 'outliers' in self.results:
            outlier_cols = list(self.results['outliers'].keys())[:4]
            self.df[outlier_cols].boxplot(ax=axes[1, 0])
            axes[1, 0].set_title('Outlier Analysis')
        
        # Missing values
        missing = self.df.isnull().sum()
        missing[missing > 0].plot(kind='bar', ax=axes[1, 1])
        axes[1, 1].set_title('Missing Values by Column')
        
        plt.tight_layout()
        plt.show()
        
        return self
    
    def generate_report(self):
        """Generate analysis summary"""
        report = "=" * 60 + "\n"
        report += "DATA ANALYSIS REPORT\n"
        report += "=" * 60 + "\n\n"
        
        report += f"Dataset Shape: {self.df.shape}\n"
        report += f"Missing Values: {self.df.isnull().sum().sum()}\n"
        report += f"Duplicates: {self.df.duplicated().sum()}\n\n"
        
        if 'correlation' in self.results:
            report += "Top Correlations:\n"
            corr = self.results['correlation']
            # Get top correlations excluding diagonal
            mask = np.triu(np.ones_like(corr, dtype=bool))
            corr_pairs = corr.where(~mask).stack().sort_values(ascending=False)
            for (col1, col2), value in corr_pairs.head(5).items():
                report += f"  {col1} - {col2}: {value:.3f}\n"
        
        return report

Project Execution

# main.py
from data_collection import DataCollector
from data_cleaning import DataCleaner
from analysis import DataAnalyzer

# 1. Collect Data
collector = DataCollector()
df = collector.from_sql("SELECT * FROM sales_data")

# 2. Clean Data
cleaner = DataCleaner(df)
df_clean = (cleaner
    .handle_missing()
    .remove_duplicates()
    .fix_types({'sale_date': 'datetime', 'amount': 'numeric'})
    .get_clean_data())

# 3. Analyze Data
analyzer = DataAnalyzer(df_clean)
results = (analyzer
    .descriptive_stats()
    .correlation_analysis()
    .outlier_detection()
    .visualize_findings()
    .generate_report())

print(results)

Key Takeaways

  1. Structure your project for reproducibility
  2. Create reusable modules for each pipeline stage
  3. Document all assumptions and decisions
  4. Generate visualizations alongside statistics
  5. Produce actionable insights with clear recommendations

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement