Why Data Cleaning Matters
DfData Cleaning
Data cleaning (also called data cleansing or data wrangling) is the process of detecting and correcting (or removing) corrupt, inaccurate, irrelevant, or incomplete records from a dataset. It is widely estimated that data scientists spend 60-80% of their time on data cleaning activities (CrowdFlower, 2016). Clean data is the foundation of reliable analysis and modeling — Garbage In, Garbage Out (GIGO).
Architecture Diagram
Raw Data → Dirty → Clean → Reliable → Insights
│ │ │ │ │
└─────────┴────────┴────────┴──────────┘
Data Cleaning Pipeline
Handling Missing Values
Types of Missing Data
| Type | Description | Example |
|---|---|---|
| MCAR (Missing Completely at Random) | Missingness has no pattern | Random survey non-responses |
| MAR (Missing at Random) | Missingness depends on observed data | Young people less likely to report income |
| MNAR (Missing Not at Random) | Missingness depends on unobserved data | High earners hide income |
import pandas as pd
import numpy as np
# Create dataset with missing values
np.random.seed(42)
n = 1000
df = pd.DataFrame({
'age': np.random.randint(18, 70, n).astype(float),
'income': np.random.normal(50000, 15000, n),
'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n),
'experience': np.random.randint(0, 40, n).astype(float),
'satisfaction': np.random.randint(1, 11, n).astype(float)
})
# Introduce missing values (realistic pattern)
# Age: 5% missing (random)
df.loc[np.random.choice(df.index, int(n*0.05), replace=False), 'age'] = np.nan
# Income: 10% missing (higher for younger people - MAR)
young_mask = df['age'] < 30
df.loc[young_mask & (np.random.random(n) < 0.2), 'income'] = np.nan
# Experience: 3% missing (MCAR)
df.loc[np.random.choice(df.index, int(n*0.03), replace=False), 'experience'] = np.nan
# Satisfaction: 8% missing (higher for lower income - MAR)
low_income = df['income'] < 40000
df.loc[low_income & (np.random.random(n) < 0.15), 'satisfaction'] = np.nan
print("Missing Values Summary:")
print(df.isnull().sum())
print(f"\nPercentage missing:")
print((df.isnull().sum() / len(df) * 100).round(2))
Detection Methods
# Check missing values
print("Missing values by column:")
print(df.isnull().sum())
print("\nMissing values by row:")
print(df.isnull().sum(axis=1).value_counts().sort_index())
# Visualize missing patterns
import matplotlib.pyplot as plt
import seaborn as sns
# Missing value heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=True, yticklabels=False, cmap='viridis')
plt.title('Missing Value Patterns')
plt.tight_layout()
plt.show()
# Missing value correlation
missing_corr = df.isnull().corr()
print("\nMissing Value Correlation:")
print(missing_corr)
# Identify patterns
print("\nRows with multiple missing values:")
multi_missing = df[df.isnull().sum(axis=1) > 1]
print(f"Count: {len(multi_missing)}")
print(multi_missing.head())
Deletion Strategies
# 1. Listwise deletion (drop rows with any missing)
df_complete = df.dropna()
print(f"Original: {len(df)} rows")
print(f"After dropping: {len(df_complete)} rows")
print(f"Rows removed: {len(df) - len(df_complete)}")
# 2. Drop rows with all missing values
df_drop_all = df.dropna(how='all')
# 3. Drop rows with less than threshold non-missing values
df_thresh = df.dropna(thresh=4) # Keep rows with at least 4 non-missing values
# 4. Listwise deletion for specific columns
df_subset = df.dropna(subset=['age', 'income'])
# 5. Column deletion (if too many missing)
missing_pct = df.isnull().sum() / len(df)
cols_to_drop = missing_pct[missing_pct > 0.3].index
df_dropped_cols = df.drop(columns=cols_to_drop)
print("\nDeletion Results:")
print(f"Listwise: {len(df_complete)} rows")
print(f"Threshold (4): {len(df_thresh)} rows")
print(f"Specific columns: {len(df_subset)} rows")
Imputation Strategies
# 1. Simple imputation
from sklearn.impute import SimpleImputer
# Mean imputation
imputer_mean = SimpleImputer(strategy='mean')
df['age_mean'] = imputer_mean.fit_transform(df[['age']])
# Median imputation (better for skewed data)
imputer_median = SimpleImputer(strategy='median')
df['income_median'] = imputer_median.fit_transform(df[['income']])
# Mode imputation (for categorical)
imputer_mode = SimpleImputer(strategy='most_frequent')
df['satisfaction_mode'] = imputer_mode.fit_transform(df[['satisfaction']])
# Constant imputation
imputer_constant = SimpleImputer(strategy='constant', fill_value=0)
df['experience_filled'] = imputer_constant.fit_transform(df[['experience']])
print("Simple Imputation Results:")
print(df[['age', 'age_mean', 'income', 'income_median']].head(10))
# 2. KNN Imputation
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=5)
df_knn = pd.DataFrame(
knn_imputer.fit_transform(df[['age', 'income', 'experience', 'satisfaction']]),
columns=['age', 'income', 'experience', 'satisfaction']
)
print("\nKNN Imputation Results:")
print(df_knn.head(10))
# 3. Iterative Imputation (MICE-like)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
iter_imputer = IterativeImputer(max_iter=10, random_state=42)
df_iter = pd.DataFrame(
iter_imputer.fit_transform(df[['age', 'income', 'experience', 'satisfaction']]),
columns=['age', 'income', 'experience', 'satisfaction']
)
print("\nIterative Imputation Results:")
print(df_iter.head(10))
# 4. Time-series specific imputation
df_ts = pd.DataFrame({
'date': pd.date_range('2024-01-01', periods=30),
'value': np.random.randn(30).cumsum()
})
df_ts.loc[5:8, 'value'] = np.nan # Create gap
# Forward fill
df_ts['ffill'] = df_ts['value'].fillna(method='ffill')
# Backward fill
df_ts['bfill'] = df_ts['value'].fillna(method='bfill')
# Interpolation
df_ts['interpolate'] = df_ts['value'].interpolate()
print("\nTime Series Imputation:")
print(df_ts)
Outlier Detection
Statistical Methods
# 1. Z-Score Method
from scipy import stats
def detect_outliers_zscore(data, threshold=3):
z_scores = np.abs(stats.zscore(data.dropna()))
return data[z_scores > threshold]
# Apply to each column
for col in ['age', 'income', 'experience']:
outliers = detect_outliers_zscore(df[col])
print(f"{col}: {len(outliers)} outliers (Z-score > 3)")
# 2. IQR Method
def detect_outliers_iqr(data, factor=1.5):
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - factor * IQR
upper_bound = Q3 + factor * IQR
return data[(data < lower_bound) | (data > upper_bound)]
print("\nIQR Method Outliers:")
for col in ['age', 'income', 'experience']:
outliers = detect_outliers_iqr(df[col])
print(f"{col}: {len(outliers)} outliers")
# 3. Visualization
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# Box plots
for i, col in enumerate(['age', 'income', 'experience']):
sns.boxplot(data=df, y=col, ax=axes[0, i])
axes[0, i].set_title(f'{col} - Box Plot')
# Histograms with outlier regions
for i, col in enumerate(['age', 'income', 'experience']):
axes[1, i].hist(df[col].dropna(), bins=30, edgecolor='black')
q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
iqr = q3 - q1
axes[1, i].axvline(q1 - 1.5*iqr, color='r', linestyle='--', label='Lower')
axes[1, i].axvline(q3 + 1.5*iqr, color='r', linestyle='--', label='Upper')
axes[1, i].set_title(f'{col} - Histogram')
axes[1, i].legend()
plt.tight_layout()
plt.show()
# 4. DBSCAN for multivariate outliers
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[['age', 'income', 'experience']].dropna())
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X_scaled)
n_outliers = (clusters == -1).sum()
print(f"\nDBSCAN Outliers: {n_outliers}")
Handling Outliers
# 1. Remove outliers
def remove_outliers(df, columns, method='iqr', factor=1.5):
df_clean = df.copy()
for col in columns:
if method == 'iqr':
Q1 = df_clean[col].quantile(0.25)
Q3 = df_clean[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - factor * IQR
upper = Q3 + factor * IQR
df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
elif method == 'zscore':
z_scores = np.abs(stats.zscore(df_clean[col].dropna()))
df_clean = df_clean[z_scores < 3]
return df_clean
df_no_outliers = remove_outliers(df, ['age', 'income', 'experience'])
print(f"Before: {len(df)} rows")
print(f"After: {len(df_no_outliers)} rows")
# 2. Cap outliers (winsorization)
from scipy.stats import mstats
def cap_outliers(df, columns, limits=(0.05, 0.95)):
df_capped = df.copy()
for col in columns:
lower = df_capped[col].quantile(limits[0])
upper = df_capped[col].quantile(limits[1])
df_capped[col] = df_capped[col].clip(lower, upper)
return df_capped
df_capped = cap_outliers(df, ['income', 'experience'])
print("\nCapped Data Stats:")
print(df_capped[['income', 'experience']].describe())
# 3. Log transformation (for right-skewed data)
df['log_income'] = np.log1p(df['income'].dropna())
print("\nLog Transform:")
print(df[['income', 'log_income']].describe())
# 4. Robust scaling
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
df[['income_robust']] = robust_scaler.fit_transform(df[['income']])
Data Type Conversions
# Check current types
print("Current dtypes:")
print(df.dtypes)
# Common conversions
# String to datetime
df_dates = pd.DataFrame({
'date_str': ['2024-01-15', '2024-02-20', '2024-03-25'],
'time_str': ['14:30:00', '09:15:00', '18:45:00']
})
df_dates['date'] = pd.to_datetime(df_dates['date_str'])
df_dates['datetime'] = pd.to_datetime(df_dates['date_str'] + ' ' + df_dates['time_str'])
print("Datetime Conversion:")
print(df_dates)
# String to numeric
df_str = pd.DataFrame({
'price': ['100.50', '200.75', 'N/A', '300.25'],
'quantity': ['10', '20', '30', 'forty']
})
# Coerce errors to NaN
df_str['price_numeric'] = pd.to_numeric(df_str['price'], errors='coerce')
df_str['quantity_numeric'] = pd.to_numeric(df_str['quantity'], errors='coerce')
print("\nString to Numeric:")
print(df_str)
# Category type (memory efficient for repeated strings)
df['education'] = df['education'].astype('category')
print(f"\nCategory dtype: {df['education'].dtype}")
print(f"Memory usage: {df['education'].memory_usage(deep=True)} bytes")
# Bool conversion
df['high_income'] = df['income'] > 60000
df['high_income'] = df['high_income'].astype(int) # Convert to 0/1
# String cleaning
df_text = pd.DataFrame({
'name': [' Alice ', 'BOB', 'charlie ', ' DIANA '],
'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com', 'diana@email.com']
})
df_text['name_clean'] = df_text['name'].str.strip().str.lower().str.title()
df_text['domain'] = df_text['email'].str.split('@').str[1]
print("\nString Cleaning:")
print(df_text)
Data Quality Checks
def data_quality_report(df):
"""Generate comprehensive data quality report"""
report = pd.DataFrame({
'dtype': df.dtypes,
'non_null': df.count(),
'null_count': df.isnull().sum(),
'null_pct': (df.isnull().sum() / len(df) * 100).round(2),
'unique': df.nunique(),
'unique_pct': (df.nunique() / len(df) * 100).round(2),
'sample': df.iloc[0].values
})
# Add statistics for numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
report.loc[numeric_cols, 'mean'] = df[numeric_cols].mean()
report.loc[numeric_cols, 'std'] = df[numeric_cols].std()
report.loc[numeric_cols, 'min'] = df[numeric_cols].min()
report.loc[numeric_cols, 'max'] = df[numeric_cols].max()
report.loc[numeric_cols, 'skew'] = df[numeric_cols].skew()
# Add value counts for categorical columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
report.loc[col, 'top_values'] = str(df[col].value_counts().head(3).to_dict())
return report
# Generate report
quality_report = data_quality_report(df)
print("Data Quality Report:")
print(quality_report)
# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")
print(f"Duplicate columns: {df.columns.duplicated().sum()}")
# Check for constant columns
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
print(f"\nConstant columns: {constant_cols}")
# Check for low variance columns
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
low_var_cols = [col for col in df.select_dtypes(include=[np.number]).columns
if df[col].var() < 0.01]
print(f"Low variance columns: {low_var_cols}")
Complete Cleaning Pipeline
def clean_dataframe(df, config=None):
"""
Complete data cleaning pipeline
Parameters:
-----------
df : pd.DataFrame
Raw dataframe to clean
config : dict
Configuration for cleaning steps
"""
if config is None:
config = {
'remove_duplicates': True,
'handle_missing': 'auto', # 'auto', 'drop', 'impute'
'handle_outliers': 'iqr',
'convert_types': True,
'validate': True
}
df_clean = df.copy()
# 1. Remove duplicates
if config['remove_duplicates']:
n_before = len(df_clean)
df_clean = df_clean.drop_duplicates()
print(f"Removed {n_before - len(df_clean)} duplicate rows")
# 2. Handle missing values
if config['handle_missing'] == 'auto':
# Auto-select strategy based on missing percentage
for col in df_clean.columns:
missing_pct = df_clean[col].isnull().sum() / len(df_clean)
if missing_pct == 0:
continue
elif missing_pct < 0.05:
# Drop rows for small amounts
df_clean = df_clean.dropna(subset=[col])
elif missing_pct < 0.2:
# Impute based on dtype
if df_clean[col].dtype in ['int64', 'float64']:
df_clean[col] = df_clean[col].fillna(df_clean[col].median())
else:
df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
else:
# Drop column if too much missing
df_clean = df_clean.drop(columns=[col])
print(f"Dropped column {col} ({missing_pct*100:.1f}% missing)")
elif config['handle_missing'] == 'drop':
df_clean = df_clean.dropna()
elif config['handle_missing'] == 'impute':
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns
# Numeric: median
for col in numeric_cols:
df_clean[col] = df_clean[col].fillna(df_clean[col].median())
# Categorical: mode
for col in cat_cols:
df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
# 3. Handle outliers
if config['handle_outliers'] == 'iqr':
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
Q1 = df_clean[col].quantile(0.25)
Q3 = df_clean[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df_clean[col] = df_clean[col].clip(lower, upper)
# 4. Validate
if config['validate']:
print("\nFinal validation:")
print(f"Shape: {df_clean.shape}")
print(f"Missing values: {df_clean.isnull().sum().sum()}")
print(f"Duplicates: {df_clean.duplicated().sum()}")
return df_clean
# Apply pipeline
df_cleaned = clean_dataframe(df)
Key Takeaways
📋Summary: Data Cleaning
- Identify missing data type (MCAR, MAR, MNAR) before choosing strategy
- Visualize missing patterns to understand relationships
- Multiple imputation (KNN, Iterative) often outperforms simple methods
- Outliers can be legitimate data points — investigate before removing
- Data types affect memory usage and available operations
- Always validate after cleaning
Practice Exercise
- Create a dirty dataset with mixed missing patterns
- Detect and handle missing values using 3 different methods
- Identify outliers using Z-score and IQR methods
- Compare results of different imputation strategies
- Write a complete cleaning pipeline function