Introduction to Data Cleaning
Data cleaning is the process of detecting and correcting (or removing) corrupt, inaccurate, or irrelevant data from a dataset. It's often said that data scientists spend 60-80% of their time on data cleaning.
Common Data Quality Issues
| Issue Type | Description | Example |
|---|
| Missing Values | Empty or null entries | NaN, NULL, blank |
| Duplicates | Repeated records | Same customer appears twice |
| Outliers | Unusual values | Age = 200 |
| Inconsistent | Non-standardized | "NYC", "New York", "new york" |
| Invalid | Wrong format | Phone: "abc123" |
| Range Errors | Values outside expected | Temperature: -100°C |
Handling Missing Values
import pandas as pd
import numpy as np
# Create sample dataset with missing values
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
'age': [25, 30, np.nan, 28, 35],
'salary': [50000, np.nan, 75000, 55000, 60000],
'department': ['IT', 'HR', 'Finance', np.nan, 'Marketing'],
'score': [85, 90, 78, 92, np.nan]
})
# Check missing values
print(df.isnull()) # Boolean DataFrame
print(df.isnull().sum()) # Count per column
print(df.isnull().sum().sum()) # Total missing
# Drop missing values
df_dropped = df.dropna() # Drop any row with missing
df_dropped_cols = df.dropna(axis=1) # Drop any column with missing
df_dropped_thresh = df.dropna(thresh=4) # Keep rows with at least 4 non-null
# Fill missing values
df_filled_zero = df.fillna(0) # Fill with 0
df_filled_mean = df.fillna(df.mean()) # Fill with column mean
df_filled_median = df.fillna(df.median()) # Fill with median
df_filled_mode = df.fillna(df.mode().iloc[0]) # Fill with mode
# Forward and backward fill
df_ffill = df.fillna(method='ffill') # Use previous value
df_bfill = df.fillna(method='bfill') # Use next value
# Interpolate for numerical data
df_interpolated = df.interpolate(method='linear')
# Conditional fill
df['age'] = df['age'].fillna(df['age'].median())
df['department'] = df['department'].fillna('Unknown')
Removing Duplicates
# Create dataset with duplicates
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
'age': [25, 30, 25, 35, 30],
'salary': [50000, 60000, 50000, 75000, 60000]
})
# Check for duplicates
print(df.duplicated()) # Boolean Series
print(df.duplicated().sum()) # Count of duplicates
# Remove duplicates
df_unique = df.drop_duplicates() # Keep first occurrence
df_unique_last = df.drop_duplicates(keep='last') # Keep last occurrence
df_unique_subset = df.drop_duplicates(subset=['name']) # Based on specific columns
# Find which rows are duplicates
duplicates = df[df.duplicated(keep=False)]
print(duplicates)
Handling Outliers
# Create sample data with outliers
data = pd.DataFrame({
'value': [10, 12, 14, 15, 16, 18, 20, 22, 100, 25, 28, 30]
})
# Statistical methods for outlier detection
mean = data['value'].mean()
std = data['value'].std()
z_scores = (data['value'] - mean) / std
# Z-score method (values beyond 3 std are outliers)
outliers_zscore = data[abs(z_scores) > 3]
# IQR method
Q1 = data['value'].quantile(0.25)
Q3 = data['value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_iqr = data[(data['value'] < lower_bound) | (data['value'] > upper_bound)]
# Handle outliers - remove, cap, or transform
# Remove outliers
data_cleaned = data[(data['value'] >= lower_bound) & (data['value'] <= upper_bound)]
# Cap outliers (winsorization)
data_capped = data['value'].clip(lower=lower_bound, upper=upper_bound)
# Log transform (reduces impact of extreme values)
data_log = np.log1p(data['value'].clip(lower=0))
Data Type Conversion
# Incorrect data types
df = pd.DataFrame({
'age': ['25', '30', '35', '40'],
'salary': ['50000', '60000', '75000', '80000'],
'date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04'],
'is_active': [1, 0, 1, 0]
})
# Convert to appropriate types
df['age'] = df['age'].astype(int)
df['salary'] = df['salary'].astype(float)
df['date'] = pd.to_datetime(df['date'])
df['is_active'] = df['is_active'].astype(bool)
# Handle errors during conversion
df['age'] = pd.to_numeric(df['age'], errors='coerce') # Convert errors to NaN
# Check data types
print(df.dtypes)
# Categorical data type for memory efficiency
df['department'] = df['department'].astype('category')
String Data Cleaning
# String cleaning operations
df = pd.DataFrame({
'name': [' Alice ', 'BOB', 'charlie', 'Diana Eve'],
'email': ['alice@email.com', 'bob@email.com ', 'CHarlie@Email.com', 'diana@email.com'],
'phone': ['123-456-7890', '2345678901', '(345) 678-9012', '456-789-0123']
})
# Trim whitespace
df['name'] = df['name'].str.strip()
df['email'] = df['email'].str.strip()
# Change case
df['name'] = df['name'].str.title()
df['email'] = df['email'].str.lower()
# Remove special characters
df['phone'] = df['phone'].str.replace(r'\D', '', regex=True) # Keep only digits
# Standardize values
df['name'] = df['name'].replace({
'Bob': 'Robert',
'Charlie': 'Charles'
})
# Parse complex strings
df['first_name'] = df['name'].str.split().str[0]
df['last_name'] = df['name'].str.split().str[-1]
Date Time Cleaning
# Date parsing and cleaning
df = pd.DataFrame({
'date': ['2024-01-01', '01/02/2024', '03-01-2024', '2024/04/01']
})
# Parse dates with mixed formats
df['date'] = pd.to_datetime(df['date'], errors='coerce', infer_datetime_format=True)
# Handle timezone
df['date'] = df['date'].dt.tz_localize('UTC')
# Extract date components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
# Calculate time differences
df['days_since'] = (pd.Timestamp('2024-01-15') - df['date']).dt.days
Complete Data Cleaning Pipeline
def clean_dataset(df):
"""Complete data cleaning pipeline."""
df = df.copy()
# 1. Remove duplicates
df = df.drop_duplicates()
# 2. Handle missing values
for col in df.columns:
if df[col].dtype in ['int64', 'float64']:
df[col].fillna(df[col].median(), inplace=True)
else:
df[col].fillna(df[col].mode()[0] if len(df[col].mode()) > 0 else 'Unknown', inplace=True)
# 3. Handle outliers in numerical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df[col] = df[col].clip(lower, upper)
# 4. Standardize string columns
for col in df.select_dtypes(include=['object']).columns:
df[col] = df[col].str.strip().str.title()
# 5. Convert data types
# (Add specific conversions based on data)
return df
# Apply cleaning pipeline
df_cleaned = clean_dataset(df)
print(df_cleaned.info())
Key Takeaways
- Always explore first - Understand your data before cleaning
- Document decisions - Keep track of cleaning steps
- Preserve original data - Work on copies
- Validate results - Check data after each step
Data Cleaning Checklist
| Step | Action |
|---|
| 1 | Check for missing values |
| 2 | Identify and remove duplicates |
| 3 | Detect and handle outliers |
| 4 | Standardize inconsistent values |
| 5 | Convert data types |
| 6 | Clean string variables |
| 7 | Parse date/time fields |
| 8 | Validate cleaned data |
Need Expert Data Science Help?
Get personalized tutoring, project support, or professional consulting.