Capstone Foundation Project
This capstone project integrates all foundation skills into a complete data analysis pipeline.
Project Structure
<svg width="600" height="400" viewBox="0 0 600 400" xmlns="http://www.w3.org/2000/svg">
<rect width="600" height="400" fill="#f8f9fa" rx="10"/>
<text x="300" y="30" text-anchor="middle" font-size="18" font-weight="bold" fill="#2c3e50">Data Analysis Pipeline</text>
<!-- Pipeline Stages -->
<rect x="50" y="80" width="120" height="60" fill="#3498db" rx="5"/>
<text x="110" y="110" text-anchor="middle" font-size="11" fill="white">Data Collection</text>
<text x="110" y="125" text-anchor="middle" font-size="9" fill="white">SQL/API/Web</text>
<rect x="200" y="80" width="120" height="60" fill="#2ecc71" rx="5"/>
<text x="260" y="110" text-anchor="middle" font-size="11" fill="white">Data Cleaning</text>
<text x="260" y="125" text-anchor="middle" font-size="9" fill="white">Pandas/NumPy</text>
<rect x="350" y="80" width="120" height="60" fill="#e74c3c" rx="5"/>
<text x="410" y="110" text-anchor="middle" font-size="11" fill="white">Exploration</text>
<text x="410" y="125" text-anchor="middle" font-size="9" fill="white">Stats/Visualization</text>
<rect x="500" y="80" width="80" height="60" fill="#f39c12" rx="5"/>
<text x="540" y="110" text-anchor="middle" font-size="11" fill="white">Insights</text>
<text x="540" y="125" text-anchor="middle" font-size="9" fill="white">Report</text>
<!-- Arrows -->
<line x1="170" y1="110" x2="200" y2="110" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="320" y1="110" x2="350" y2="110" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<line x1="470" y1="110" x2="500" y2="110" stroke="#7f8c8d" stroke-width="2" marker-end="url(#arrow)"/>
<!-- File Structure -->
<text x="300" y="180" text-anchor="middle" font-size="14" fill="#2c3e50">Project File Structure:</text>
<text x="100" y="210" font-size="11" fill="#7f8c8d">capstone-project/</text>
<text x="120" y="230" font-size="11" fill="#3498db">āāā data/</text>
<text x="140" y="250" font-size="11" fill="#3498db">ā āāā raw/</text>
<text x="140" y="270" font-size="11" fill="#3498db">ā āāā processed/</text>
<text x="120" y="290" font-size="11" fill="#2ecc71">āāā notebooks/</text>
<text x="140" y="310" font-size="11" fill="#2ecc71">ā āāā 01_data_collection.ipynb</text>
<text x="140" y="330" font-size="11" fill="#2ecc71">ā āāā 02_cleaning.ipynb</text>
<text x="140" y="350" font-size="11" fill="#2ecc71">ā āāā 03_eda.ipynb</text>
<text x="120" y="370" font-size="11" fill="#e74c3c">āāā src/</text>
<text x="120" y="390" font-size="11" fill="#f39c12">āāā reports/</text>
<defs>
<marker id="arrow" markerWidth="10" markerHeight="10" refX="0" refY="3" orient="auto">
<path d="M0,0 L0,6 L9,3 z" fill="#7f8c8d"/>
</marker>
</defs>
</svg>
Data Collection Module
# data_collection.py
import pandas as pd
import requests
from sqlalchemy import create_engine
class DataCollector:
def __init__(self):
self.engine = create_engine('postgresql://user:pass@localhost/db')
def from_sql(self, query):
"""Load data from SQL database"""
return pd.read_sql(query, self.engine)
def from_api(self, url, params=None):
"""Load data from REST API"""
response = requests.get(url, params=params)
response.raise_for_status()
return pd.DataFrame(response.json()['data'])
def from_csv(self, filepath):
"""Load data from CSV"""
return pd.read_csv(filepath)
def combine_sources(self, df1, df2, key, how='inner'):
"""Merge multiple data sources"""
return pd.merge(df1, df2, on=key, how=how)
# Usage
collector = DataCollector()
sql_data = collector.from_sql("SELECT * FROM transactions")
api_data = collector.from_api("https://api.example.com/users")
combined = collector.combine_sources(sql_data, api_data, key='user_id')
Data Cleaning Pipeline
# data_cleaning.py
import pandas as pd
import numpy as np
class DataCleaner:
def __init__(self, df):
self.df = df.copy()
self.cleaning_log = []
def log_action(self, action, details):
self.cleaning_log.append({
'action': action,
'details': details,
'timestamp': pd.Timestamp.now()
})
def handle_missing(self, strategy='auto'):
"""Handle missing values"""
before = self.df.isnull().sum().sum()
if strategy == 'auto':
# Numerical: median, Categorical: mode
num_cols = self.df.select_dtypes(include=[np.number]).columns
cat_cols = self.df.select_dtypes(include=['object']).columns
self.df[num_cols] = self.df[num_cols].fillna(self.df[num_cols].median())
for col in cat_cols:
self.df[col] = self.df[col].fillna(self.df[col].mode()[0])
after = self.df.isnull().sum().sum()
self.log_action('handle_missing', f'{before} -> {after} missing values')
return self
def remove_duplicates(self):
"""Remove duplicate rows"""
before = len(self.df)
self.df = self.df.drop_duplicates()
after = len(self.df)
self.log_action('remove_duplicates', f'{before} -> {after} rows')
return self
def fix_types(self, type_dict):
"""Fix data types"""
for col, dtype in type_dict.items():
if dtype == 'datetime':
self.df[col] = pd.to_datetime(self.df[col])
elif dtype == 'category':
self.df[col] = self.df[col].astype('category')
elif dtype == 'numeric':
self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
self.log_action('fix_types', f'Converted {list(type_dict.keys())}')
return self
def get_clean_data(self):
return self.df
# Usage
cleaner = DataCleaner(df)
df_clean = (cleaner
.handle_missing()
.remove_duplicates()
.fix_types({
'date': 'datetime',
'category': 'category',
'amount': 'numeric'
})
.get_clean_data())
Analysis Module
# analysis.py
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
class DataAnalyzer:
def __init__(self, df):
self.df = df
self.results = {}
def descriptive_stats(self):
"""Calculate descriptive statistics"""
self.results['descriptive'] = {
'numerical': self.df.describe(),
'categorical': self.df.describe(include=['object', 'category'])
}
return self
def correlation_analysis(self, method='pearson'):
"""Analyze correlations"""
corr_matrix = self.df.select_dtypes(include=[np.number]).corr(method=method)
self.results['correlation'] = corr_matrix
return self
def distribution_test(self, column):
"""Test distribution normality"""
stat, p_value = stats.shapiro(self.df[column].dropna())
self.results[f'distribution_{column}'] = {
'statistic': stat,
'p_value': p_value,
'normal': p_value > 0.05
}
return self
def outlier_detection(self, method='iqr'):
"""Detect outliers"""
num_cols = self.df.select_dtypes(include=[np.number]).columns
outliers = {}
for col in num_cols:
if method == 'iqr':
Q1 = self.df[col].quantile(0.25)
Q3 = self.df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers[col] = {
'count': ((self.df[col] < lower) | (self.df[col] > upper)).sum(),
'bounds': (lower, upper)
}
self.results['outliers'] = outliers
return self
def visualize_findings(self):
"""Create visualization dashboard"""
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# Correlation heatmap
if 'correlation' in self.results:
sns.heatmap(self.results['correlation'], annot=True,
cmap='coolwarm', center=0, ax=axes[0, 0])
axes[0, 0].set_title('Correlation Matrix')
# Distribution of first numerical column
num_col = self.df.select_dtypes(include=[np.number]).columns[0]
axes[0, 1].hist(self.df[num_col].dropna(), bins=30, edgecolor='black')
axes[0, 1].set_title(f'{num_col} Distribution')
# Box plot for outliers
if 'outliers' in self.results:
outlier_cols = list(self.results['outliers'].keys())[:4]
self.df[outlier_cols].boxplot(ax=axes[1, 0])
axes[1, 0].set_title('Outlier Analysis')
# Missing values
missing = self.df.isnull().sum()
missing[missing > 0].plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_title('Missing Values by Column')
plt.tight_layout()
plt.show()
return self
def generate_report(self):
"""Generate analysis summary"""
report = "=" * 60 + "\n"
report += "DATA ANALYSIS REPORT\n"
report += "=" * 60 + "\n\n"
report += f"Dataset Shape: {self.df.shape}\n"
report += f"Missing Values: {self.df.isnull().sum().sum()}\n"
report += f"Duplicates: {self.df.duplicated().sum()}\n\n"
if 'correlation' in self.results:
report += "Top Correlations:\n"
corr = self.results['correlation']
# Get top correlations excluding diagonal
mask = np.triu(np.ones_like(corr, dtype=bool))
corr_pairs = corr.where(~mask).stack().sort_values(ascending=False)
for (col1, col2), value in corr_pairs.head(5).items():
report += f" {col1} - {col2}: {value:.3f}\n"
return report
Project Execution
# main.py
from data_collection import DataCollector
from data_cleaning import DataCleaner
from analysis import DataAnalyzer
# 1. Collect Data
collector = DataCollector()
df = collector.from_sql("SELECT * FROM sales_data")
# 2. Clean Data
cleaner = DataCleaner(df)
df_clean = (cleaner
.handle_missing()
.remove_duplicates()
.fix_types({'sale_date': 'datetime', 'amount': 'numeric'})
.get_clean_data())
# 3. Analyze Data
analyzer = DataAnalyzer(df_clean)
results = (analyzer
.descriptive_stats()
.correlation_analysis()
.outlier_detection()
.visualize_findings()
.generate_report())
print(results)
Key Takeaways
- Structure your project for reproducibility
- Create reusable modules for each pipeline stage
- Document all assumptions and decisions
- Generate visualizations alongside statistics
- Produce actionable insights with clear recommendations