Data Cleaning: Missing Values, Outliers, Dtypes

Why Data Cleaning Matters

DfData Cleaning

Data cleaning (also called data cleansing or data wrangling) is the process of detecting and correcting (or removing) corrupt, inaccurate, irrelevant, or incomplete records from a dataset. It is widely estimated that data scientists spend 60-80% of their time on data cleaning activities (CrowdFlower, 2016). Clean data is the foundation of reliable analysis and modeling — Garbage In, Garbage Out (GIGO).

Architecture Diagram

Raw Data → Dirty → Clean → Reliable → Insights
   │         │        │        │          │
   └─────────┴────────┴────────┴──────────┘
              Data Cleaning Pipeline

Handling Missing Values

Types of Missing Data

Type	Description	Example
MCAR (Missing Completely at Random)	Missingness has no pattern	Random survey non-responses
MAR (Missing at Random)	Missingness depends on observed data	Young people less likely to report income
MNAR (Missing Not at Random)	Missingness depends on unobserved data	High earners hide income

import pandas as pd
import numpy as np

# Create dataset with missing values
np.random.seed(42)
n = 1000

df = pd.DataFrame({
    'age': np.random.randint(18, 70, n).astype(float),
    'income': np.random.normal(50000, 15000, n),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n),
    'experience': np.random.randint(0, 40, n).astype(float),
    'satisfaction': np.random.randint(1, 11, n).astype(float)
})

# Introduce missing values (realistic pattern)
# Age: 5% missing (random)
df.loc[np.random.choice(df.index, int(n*0.05), replace=False), 'age'] = np.nan

# Income: 10% missing (higher for younger people - MAR)
young_mask = df['age'] < 30
df.loc[young_mask & (np.random.random(n) < 0.2), 'income'] = np.nan

# Experience: 3% missing (MCAR)
df.loc[np.random.choice(df.index, int(n*0.03), replace=False), 'experience'] = np.nan

# Satisfaction: 8% missing (higher for lower income - MAR)
low_income = df['income'] < 40000
df.loc[low_income & (np.random.random(n) < 0.15), 'satisfaction'] = np.nan

print("Missing Values Summary:")
print(df.isnull().sum())
print(f"\nPercentage missing:")
print((df.isnull().sum() / len(df) * 100).round(2))

Detection Methods

# Check missing values
print("Missing values by column:")
print(df.isnull().sum())

print("\nMissing values by row:")
print(df.isnull().sum(axis=1).value_counts().sort_index())

# Visualize missing patterns
import matplotlib.pyplot as plt
import seaborn as sns

# Missing value heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=True, yticklabels=False, cmap='viridis')
plt.title('Missing Value Patterns')
plt.tight_layout()
plt.show()

# Missing value correlation
missing_corr = df.isnull().corr()
print("\nMissing Value Correlation:")
print(missing_corr)

# Identify patterns
print("\nRows with multiple missing values:")
multi_missing = df[df.isnull().sum(axis=1) > 1]
print(f"Count: {len(multi_missing)}")
print(multi_missing.head())

Deletion Strategies

# 1. Listwise deletion (drop rows with any missing)
df_complete = df.dropna()
print(f"Original: {len(df)} rows")
print(f"After dropping: {len(df_complete)} rows")
print(f"Rows removed: {len(df) - len(df_complete)}")

# 2. Drop rows with all missing values
df_drop_all = df.dropna(how='all')

# 3. Drop rows with less than threshold non-missing values
df_thresh = df.dropna(thresh=4)  # Keep rows with at least 4 non-missing values

# 4. Listwise deletion for specific columns
df_subset = df.dropna(subset=['age', 'income'])

# 5. Column deletion (if too many missing)
missing_pct = df.isnull().sum() / len(df)
cols_to_drop = missing_pct[missing_pct > 0.3].index
df_dropped_cols = df.drop(columns=cols_to_drop)

print("\nDeletion Results:")
print(f"Listwise: {len(df_complete)} rows")
print(f"Threshold (4): {len(df_thresh)} rows")
print(f"Specific columns: {len(df_subset)} rows")

Imputation Strategies

# 1. Simple imputation
from sklearn.impute import SimpleImputer

# Mean imputation
imputer_mean = SimpleImputer(strategy='mean')
df['age_mean'] = imputer_mean.fit_transform(df[['age']])

# Median imputation (better for skewed data)
imputer_median = SimpleImputer(strategy='median')
df['income_median'] = imputer_median.fit_transform(df[['income']])

# Mode imputation (for categorical)
imputer_mode = SimpleImputer(strategy='most_frequent')
df['satisfaction_mode'] = imputer_mode.fit_transform(df[['satisfaction']])

# Constant imputation
imputer_constant = SimpleImputer(strategy='constant', fill_value=0)
df['experience_filled'] = imputer_constant.fit_transform(df[['experience']])

print("Simple Imputation Results:")
print(df[['age', 'age_mean', 'income', 'income_median']].head(10))

# 2. KNN Imputation
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=5)
df_knn = pd.DataFrame(
    knn_imputer.fit_transform(df[['age', 'income', 'experience', 'satisfaction']]),
    columns=['age', 'income', 'experience', 'satisfaction']
)
print("\nKNN Imputation Results:")
print(df_knn.head(10))

# 3. Iterative Imputation (MICE-like)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iter_imputer = IterativeImputer(max_iter=10, random_state=42)
df_iter = pd.DataFrame(
    iter_imputer.fit_transform(df[['age', 'income', 'experience', 'satisfaction']]),
    columns=['age', 'income', 'experience', 'satisfaction']
)
print("\nIterative Imputation Results:")
print(df_iter.head(10))

# 4. Time-series specific imputation
df_ts = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=30),
    'value': np.random.randn(30).cumsum()
})
df_ts.loc[5:8, 'value'] = np.nan  # Create gap

# Forward fill
df_ts['ffill'] = df_ts['value'].fillna(method='ffill')

# Backward fill
df_ts['bfill'] = df_ts['value'].fillna(method='bfill')

# Interpolation
df_ts['interpolate'] = df_ts['value'].interpolate()

print("\nTime Series Imputation:")
print(df_ts)

Outlier Detection

Statistical Methods

# 1. Z-Score Method
from scipy import stats

def detect_outliers_zscore(data, threshold=3):
    z_scores = np.abs(stats.zscore(data.dropna()))
    return data[z_scores > threshold]

# Apply to each column
for col in ['age', 'income', 'experience']:
    outliers = detect_outliers_zscore(df[col])
    print(f"{col}: {len(outliers)} outliers (Z-score > 3)")

# 2. IQR Method
def detect_outliers_iqr(data, factor=1.5):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return data[(data < lower_bound) | (data > upper_bound)]

print("\nIQR Method Outliers:")
for col in ['age', 'income', 'experience']:
    outliers = detect_outliers_iqr(df[col])
    print(f"{col}: {len(outliers)} outliers")

# 3. Visualization
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Box plots
for i, col in enumerate(['age', 'income', 'experience']):
    sns.boxplot(data=df, y=col, ax=axes[0, i])
    axes[0, i].set_title(f'{col} - Box Plot')

# Histograms with outlier regions
for i, col in enumerate(['age', 'income', 'experience']):
    axes[1, i].hist(df[col].dropna(), bins=30, edgecolor='black')
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    axes[1, i].axvline(q1 - 1.5*iqr, color='r', linestyle='--', label='Lower')
    axes[1, i].axvline(q3 + 1.5*iqr, color='r', linestyle='--', label='Upper')
    axes[1, i].set_title(f'{col} - Histogram')
    axes[1, i].legend()

plt.tight_layout()
plt.show()

# 4. DBSCAN for multivariate outliers
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[['age', 'income', 'experience']].dropna())

dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X_scaled)

n_outliers = (clusters == -1).sum()
print(f"\nDBSCAN Outliers: {n_outliers}")

Handling Outliers

# 1. Remove outliers
def remove_outliers(df, columns, method='iqr', factor=1.5):
    df_clean = df.copy()
    for col in columns:
        if method == 'iqr':
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - factor * IQR
            upper = Q3 + factor * IQR
            df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
        elif method == 'zscore':
            z_scores = np.abs(stats.zscore(df_clean[col].dropna()))
            df_clean = df_clean[z_scores < 3]
    return df_clean

df_no_outliers = remove_outliers(df, ['age', 'income', 'experience'])
print(f"Before: {len(df)} rows")
print(f"After: {len(df_no_outliers)} rows")

# 2. Cap outliers (winsorization)
from scipy.stats import mstats

def cap_outliers(df, columns, limits=(0.05, 0.95)):
    df_capped = df.copy()
    for col in columns:
        lower = df_capped[col].quantile(limits[0])
        upper = df_capped[col].quantile(limits[1])
        df_capped[col] = df_capped[col].clip(lower, upper)
    return df_capped

df_capped = cap_outliers(df, ['income', 'experience'])
print("\nCapped Data Stats:")
print(df_capped[['income', 'experience']].describe())

# 3. Log transformation (for right-skewed data)
df['log_income'] = np.log1p(df['income'].dropna())
print("\nLog Transform:")
print(df[['income', 'log_income']].describe())

# 4. Robust scaling
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()
df[['income_robust']] = robust_scaler.fit_transform(df[['income']])

Data Type Conversions

# Check current types
print("Current dtypes:")
print(df.dtypes)

# Common conversions
# String to datetime
df_dates = pd.DataFrame({
    'date_str': ['2024-01-15', '2024-02-20', '2024-03-25'],
    'time_str': ['14:30:00', '09:15:00', '18:45:00']
})

df_dates['date'] = pd.to_datetime(df_dates['date_str'])
df_dates['datetime'] = pd.to_datetime(df_dates['date_str'] + ' ' + df_dates['time_str'])
print("Datetime Conversion:")
print(df_dates)

# String to numeric
df_str = pd.DataFrame({
    'price': ['100.50', '200.75', 'N/A', '300.25'],
    'quantity': ['10', '20', '30', 'forty']
})

# Coerce errors to NaN
df_str['price_numeric'] = pd.to_numeric(df_str['price'], errors='coerce')
df_str['quantity_numeric'] = pd.to_numeric(df_str['quantity'], errors='coerce')
print("\nString to Numeric:")
print(df_str)

# Category type (memory efficient for repeated strings)
df['education'] = df['education'].astype('category')
print(f"\nCategory dtype: {df['education'].dtype}")
print(f"Memory usage: {df['education'].memory_usage(deep=True)} bytes")

# Bool conversion
df['high_income'] = df['income'] > 60000
df['high_income'] = df['high_income'].astype(int)  # Convert to 0/1

# String cleaning
df_text = pd.DataFrame({
    'name': ['  Alice  ', 'BOB', 'charlie  ', '  DIANA  '],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com', 'diana@email.com']
})

df_text['name_clean'] = df_text['name'].str.strip().str.lower().str.title()
df_text['domain'] = df_text['email'].str.split('@').str[1]
print("\nString Cleaning:")
print(df_text)

Data Quality Checks

def data_quality_report(df):
    """Generate comprehensive data quality report"""
    
    report = pd.DataFrame({
        'dtype': df.dtypes,
        'non_null': df.count(),
        'null_count': df.isnull().sum(),
        'null_pct': (df.isnull().sum() / len(df) * 100).round(2),
        'unique': df.nunique(),
        'unique_pct': (df.nunique() / len(df) * 100).round(2),
        'sample': df.iloc[0].values
    })
    
    # Add statistics for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    report.loc[numeric_cols, 'mean'] = df[numeric_cols].mean()
    report.loc[numeric_cols, 'std'] = df[numeric_cols].std()
    report.loc[numeric_cols, 'min'] = df[numeric_cols].min()
    report.loc[numeric_cols, 'max'] = df[numeric_cols].max()
    report.loc[numeric_cols, 'skew'] = df[numeric_cols].skew()
    
    # Add value counts for categorical columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        report.loc[col, 'top_values'] = str(df[col].value_counts().head(3).to_dict())
    
    return report

# Generate report
quality_report = data_quality_report(df)
print("Data Quality Report:")
print(quality_report)

# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")
print(f"Duplicate columns: {df.columns.duplicated().sum()}")

# Check for constant columns
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
print(f"\nConstant columns: {constant_cols}")

# Check for low variance columns
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
low_var_cols = [col for col in df.select_dtypes(include=[np.number]).columns 
                if df[col].var() < 0.01]
print(f"Low variance columns: {low_var_cols}")

Complete Cleaning Pipeline

def clean_dataframe(df, config=None):
    """
    Complete data cleaning pipeline
    
    Parameters:
    -----------
    df : pd.DataFrame
        Raw dataframe to clean
    config : dict
        Configuration for cleaning steps
    """
    if config is None:
        config = {
            'remove_duplicates': True,
            'handle_missing': 'auto',  # 'auto', 'drop', 'impute'
            'handle_outliers': 'iqr',
            'convert_types': True,
            'validate': True
        }
    
    df_clean = df.copy()
    
    # 1. Remove duplicates
    if config['remove_duplicates']:
        n_before = len(df_clean)
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {n_before - len(df_clean)} duplicate rows")
    
    # 2. Handle missing values
    if config['handle_missing'] == 'auto':
        # Auto-select strategy based on missing percentage
        for col in df_clean.columns:
            missing_pct = df_clean[col].isnull().sum() / len(df_clean)
            
            if missing_pct == 0:
                continue
            elif missing_pct < 0.05:
                # Drop rows for small amounts
                df_clean = df_clean.dropna(subset=[col])
            elif missing_pct < 0.2:
                # Impute based on dtype
                if df_clean[col].dtype in ['int64', 'float64']:
                    df_clean[col] = df_clean[col].fillna(df_clean[col].median())
                else:
                    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
            else:
                # Drop column if too much missing
                df_clean = df_clean.drop(columns=[col])
                print(f"Dropped column {col} ({missing_pct*100:.1f}% missing)")
    
    elif config['handle_missing'] == 'drop':
        df_clean = df_clean.dropna()
    
    elif config['handle_missing'] == 'impute':
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns
        
        # Numeric: median
        for col in numeric_cols:
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())
        
        # Categorical: mode
        for col in cat_cols:
            df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])
    
    # 3. Handle outliers
    if config['handle_outliers'] == 'iqr':
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            df_clean[col] = df_clean[col].clip(lower, upper)
    
    # 4. Validate
    if config['validate']:
        print("\nFinal validation:")
        print(f"Shape: {df_clean.shape}")
        print(f"Missing values: {df_clean.isnull().sum().sum()}")
        print(f"Duplicates: {df_clean.duplicated().sum()}")
    
    return df_clean

# Apply pipeline
df_cleaned = clean_dataframe(df)

Key Takeaways

📋Summary: Data Cleaning

Identify missing data type (MCAR, MAR, MNAR) before choosing strategy
Visualize missing patterns to understand relationships
Multiple imputation (KNN, Iterative) often outperforms simple methods
Outliers can be legitimate data points — investigate before removing
Data types affect memory usage and available operations
Always validate after cleaning

Practice Exercise

Create a dirty dataset with mixed missing patterns
Detect and handle missing values using 3 different methods
Identify outliers using Z-score and IQR methods
Compare results of different imputation strategies
Write a complete cleaning pipeline function