Outlier Detection Algorithms

Outliers are data points that deviate significantly from the majority. Detecting them is critical for fraud detection, quality control, data cleaning, and security. Different algorithms excel in different scenarios.

Anomaly Detection Methods

Why Outlier Detection Matters

A single fraudulent transaction, a manufacturing defect, or a corrupted sensor reading can have outsized impact. Automated outlier detection catches anomalies before they cause damage.

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

Generate Multimodal Data with Outliers

# Seed NumPy's global generator so every run draws the same points.
np.random.seed(42)

# Inlier population: three Gaussian blobs, each given a per-axis center (loc)
# and spread (scale). The draw order is kept so the seeded stream is unchanged.
cluster1 = np.random.normal(loc=[0, 0], scale=[1, 1], size=(200, 2))
cluster2 = np.random.normal(loc=[5, 5], scale=[0.8, 0.8], size=(200, 2))
cluster3 = np.random.normal(loc=[0, 5], scale=[0.5, 1.5], size=(100, 2))

normal_data = np.concatenate((cluster1, cluster2, cluster3), axis=0)

# Outliers are scattered uniformly over a box that also covers the blobs,
# so some of them can land inside a cluster.
n_outliers = 30
outliers = np.random.uniform(low=-3, high=8, size=(n_outliers, 2))

# Full dataset: inliers first, then outliers; labels use 0 = normal, 1 = outlier.
X = np.concatenate((normal_data, outliers), axis=0)
labels_true = np.repeat([0, 1], [len(normal_data), n_outliers])

print(f"Dataset: {len(X)} points, {n_outliers} outliers ({n_outliers/len(X):.1%})")

Isolation Forest

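Isolation Forest isolates anomalies by randomly partitioning the data. Because anomalies are few and lie far from the bulk of the data, they need fewer partitions to isolate, so a short average path length across the trees signals an outlier.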

# Configure the forest: 200 trees, 5% of points expected to be anomalous,
# fixed seed for reproducibility, all CPU cores.
iso_forest = IsolationForest(n_estimators=200, contamination=0.05, random_state=42, n_jobs=-1)

# Fit once, then read both the hard labels and the per-sample scores.
predictions_if = iso_forest.fit_predict(X)
scores_if = iso_forest.score_samples(X)

# A prediction of -1 marks a point the forest considers an outlier.
flagged_if = predictions_if == -1
n_detected = flagged_if.sum()
print(f"Isolation Forest detected {n_detected} outliers")
print(f"True outliers: {n_outliers}")

# Score the boolean outlier mask against the ground-truth labels.
from sklearn.metrics import precision_score, recall_score, f1_score
precision, recall, f1 = (
    metric(labels_true, flagged_if)
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# Compare the score distribution of true inliers and true outliers.
print(f"\nScore distribution:")
print(f"  Normal: {scores_if[labels_true == 0].mean():.3f} ± {scores_if[labels_true == 0].std():.3f}")
print(f"  Outlier: {scores_if[labels_true == 1].mean():.3f} ± {scores_if[labels_true == 1].std():.3f}")

Local Outlier Factor (LOF)

LOF compares local density of a point to its neighbors. Points in sparse regions relative to their neighbors are outliers.

# Configure LOF for outlier detection on the training set itself
# (novelty=False), using 20 neighbours and an expected 5% contamination.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05, novelty=False)

# fit_predict labels the points; the fitted factors are read from the estimator.
predictions_lof = lof.fit_predict(X)
scores_lof = lof.negative_outlier_factor_

# A prediction of -1 marks a point LOF considers an outlier.
flagged_lof = predictions_lof == -1
n_detected = flagged_lof.sum()
precision, recall = (
    metric(labels_true, flagged_lof)
    for metric in (precision_score, recall_score)
)
print(f"LOF detected {n_detected} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# LOF scores are relative – lower = more abnormal
print(f"\nLOF score ranges:")
print(f"  Normal: {scores_lof[labels_true == 0].min():.3f} to {scores_lof[labels_true == 0].max():.3f}")
print(f"  Outlier: {scores_lof[labels_true == 1].min():.3f} to {scores_lof[labels_true == 1].max():.3f}")

DBSCAN for Outlier Detection

DBSCAN marks points in low-density regions as noise (label = -1).

# Cluster with DBSCAN; points it cannot assign to a cluster get label -1.
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X)

# Tally the label frequencies once and derive the summary from them.
unique, counts = np.unique(clusters, return_counts=True)
is_noise = clusters == -1
n_noise = is_noise.sum()
n_clusters = len(unique) - (1 if -1 in unique else 0)
print(f"DBSCAN: {n_clusters} clusters, {n_noise} noise points")

# Treat the noise points as the detected outliers and score them.
precision = precision_score(labels_true, is_noise)
recall = recall_score(labels_true, is_noise)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# Per-label membership counts, with the noise label spelled out.
for u, c in zip(unique, counts):
    label = f"cluster {u}" if u != -1 else "noise"
    print(f"  {label}: {c} points")

Mahalanobis Distance

Mahalanobis distance accounts for correlations between features – ideal for multivariate normal data.

def mahalanobis_outlier_detection(X, threshold_percentile=97):
    """Detect outliers using Mahalanobis distance.

    Each row's distance to the sample mean is measured under the sample
    covariance, and rows whose distance exceeds the requested percentile of
    all distances are flagged. The cutoff is empirical (taken from the data
    itself), not a chi-squared quantile, so roughly
    ``100 - threshold_percentile`` percent of the rows are flagged regardless
    of how anomalous they are.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        threshold_percentile: Percentile (0-100) of the computed distances
            used as the outlier cutoff.

    Returns:
        Tuple ``(distances, outliers)`` where ``distances`` is a float array
        of per-row Mahalanobis distances and ``outliers`` is a boolean array
        that is True for rows strictly above the cutoff.

    Raises:
        ValueError: If ``X`` is not 2-dimensional.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features), got {X.ndim} dimension(s)"
        )

    centered = X - X.mean(axis=0)

    # np.cov collapses a single-feature input to a 0-d array; force a square
    # matrix so the regularization and inversion below work for any width.
    cov = np.atleast_2d(np.cov(X, rowvar=False))

    # Regularize covariance matrix so it stays invertible when features are
    # (nearly) collinear.
    cov = cov + np.eye(cov.shape[0]) * 1e-6
    inv_cov = np.linalg.inv(cov)

    # Row-wise quadratic form d_i^2 = c_i^T * inv_cov * c_i, computed for all
    # rows at once instead of one Python-level call per row.
    squared = np.einsum('ij,jk,ik->i', centered, inv_cov, centered)
    # Clip tiny negative values caused by floating-point round-off.
    distances = np.sqrt(np.clip(squared, 0.0, None))

    # Empirical cutoff: the requested percentile of the observed distances.
    threshold = np.percentile(distances, threshold_percentile)
    outliers = distances > threshold

    return distances, outliers

# Flag the points whose Mahalanobis distance lies above the function's
# default percentile cutoff, then score them against the true labels.
distances, outliers_mahal = mahalanobis_outlier_detection(X)
precision = precision_score(labels_true, outliers_mahal)
recall = recall_score(labels_true, outliers_mahal)
print(f"Mahalanobis: {outliers_mahal.sum()} outliers detected")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# Chi-squared threshold
# For Gaussian data the SQUARED Mahalanobis distance follows a chi-squared
# distribution with one degree of freedom per feature, so the quantile must
# be compared against distances ** 2, not the raw distances.
from scipy.stats import chi2
chi2_threshold = chi2.ppf(0.97, df=X.shape[1])
print(f"Chi-squared threshold (97%): {chi2_threshold:.2f}")
print(f"Points beyond threshold: {(distances ** 2 > chi2_threshold).sum()}")

Robust Statistics

Methods that are resistant to the influence of outliers.

# MAD (Median Absolute Deviation) – robust alternative to std
def mad_outlier_detection(X, threshold=3.5):
    """Flag rows whose MAD-based modified Z-score is extreme in any feature.

    Args:
        X: Array of shape (n_samples, n_features).
        threshold: Absolute modified Z-score above which a feature value
            counts as outlying.

    Returns:
        Tuple ``(modified_z, outliers)``: the per-feature modified Z-scores
        and a boolean array that is True for rows with at least one feature
        beyond the threshold.
    """
    center = np.median(X, axis=0)
    deviation = X - center

    # Per-feature median of the absolute deviations from the median.
    spread = np.median(np.abs(deviation), axis=0)

    # The small constant in the denominator avoids division by zero when a
    # feature's MAD is exactly zero.
    modified_z = 0.6745 * deviation / (spread + 1e-10)

    exceeds = np.abs(modified_z) > threshold
    outliers = exceeds.any(axis=1)
    return modified_z, outliers

# Run the MAD detector and score its flags against the true labels.
z_scores, outliers_mad = mad_outlier_detection(X)
precision, recall = (
    metric(labels_true, outliers_mad)
    for metric in (precision_score, recall_score)
)
print(f"MAD detection: {outliers_mad.sum()} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# Robust Z-score (using median and MAD), this time with SciPy's MAD helper.
feature_median = np.median(X, axis=0)
feature_mad = stats.median_abs_deviation(X, axis=0)
robust_z = 0.6745 * (X - feature_median) / (feature_mad + 1e-10)
print(f"\nRobust Z-score range: {np.abs(robust_z).max():.2f}")

Elliptic Envelope (Minimum Covariance Determinant)

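Fits a robust ellipse around the normal data; points outside it are outliers. The method assumes the inliers form a single, roughly Gaussian cluster, so expect it to struggle on multimodal data such as the three-cluster example used here.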

# Robust covariance fit: 5% expected contamination, 80% of the points used
# for the support of the estimate, fixed seed for reproducibility.
ee = EllipticEnvelope(contamination=0.05, support_fraction=0.8, random_state=42)

# A prediction of -1 marks a point the envelope treats as an outlier.
predictions_ee = ee.fit_predict(X)
flagged_ee = predictions_ee == -1
n_detected = flagged_ee.sum()

precision, recall = (
    metric(labels_true, flagged_ee)
    for metric in (precision_score, recall_score)
)
print(f"Elliptic Envelope: {n_detected} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

Ensemble Methods

Combining multiple detectors for robust detection.

def ensemble_outlier_detection(X, contamination=0.05):
    """Flag outliers by majority vote across several detectors.

    Fits an Isolation Forest, a Local Outlier Factor model and an Elliptic
    Envelope on ``X``, counts how many of them label each row as an outlier
    (prediction ``-1``), and flags the rows chosen by more than half of the
    detectors.

    Args:
        X: Array of shape (n_samples, n_features).
        contamination: Expected fraction of outliers, passed to every
            detector.

    Returns:
        Tuple ``(ensemble_pred, votes)``: an integer array with 1 for rows
        flagged by the majority and 0 otherwise, and a float array holding
        the number of detectors that flagged each row.
    """
    detectors = {
        'iso_forest': IsolationForest(contamination=contamination, random_state=42),
        # novelty must be False here: LocalOutlierFactor only offers
        # fit_predict in outlier-detection mode, and with novelty=True the
        # call below raises AttributeError.
        'lof': LocalOutlierFactor(contamination=contamination, novelty=False),
        'ee': EllipticEnvelope(contamination=contamination, random_state=42),
    }

    votes = np.zeros(len(X))

    for detector in detectors.values():
        pred = detector.fit_predict(X)
        votes += (pred == -1).astype(int)

    # Majority vote: strictly more than half of the detectors must agree
    # (2 of the 3 detectors above).
    ensemble_pred = (votes > len(detectors) / 2).astype(int)
    return ensemble_pred, votes

# Run the voting ensemble and score its 0/1 predictions against the truth.
ensemble_pred, votes = ensemble_outlier_detection(X)
precision, recall, f1 = (
    metric(labels_true, ensemble_pred)
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Ensemble: {ensemble_pred.sum()} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
# How many points received 0, 1, 2, ... votes from the detectors.
print(f"Vote distribution: {np.bincount(votes.astype(int))}")

Comparison Summary

# Collect one row of metrics per detector, each computed from that detector's
# boolean outlier mask.
results = {}
for name, pred in [
    ('Isolation Forest', predictions_if == -1),
    ('LOF', predictions_lof == -1),
    ('DBSCAN', clusters == -1),
    ('Mahalanobis', outliers_mahal),
    ('MAD', outliers_mad),
    ('Elliptic Envelope', predictions_ee == -1),
    ('Ensemble', ensemble_pred == 1)
]:
    results[name] = {
        'Detected': int(pred.sum()),
        'Precision': precision_score(labels_true, pred),
        'Recall': recall_score(labels_true, pred),
        'F1': f1_score(labels_true, pred)
    }

# Build the table with detectors as rows directly. Constructing it as columns
# and transposing would mix the integer counts with the float metrics in one
# column and print 'Detected' as floats (e.g. 27.0).
results_df = pd.DataFrame.from_dict(results, orient='index').round(3)
print(results_df.to_string())

Best Practices

- Choose based on data distribution – Mahalanobis for Gaussian, Isolation Forest for complex
- Tune contamination – know what fraction of data is expected to be anomalous
- Use ensemble methods – combine detectors for robustness
- Visualize – 2D/3D plots reveal structure that metrics miss
- Domain knowledge – some "outliers" are legitimate rare events
- Evaluate carefully – precision/recall tradeoff depends on cost of false positives

Summary

Outlier detection requires matching the algorithm to the data structure. Isolation Forest handles high-dimensional data, LOF finds local anomalies, DBSCAN identifies noise, and Mahalanobis works for multivariate normal data. Ensemble methods, which combine several detectors, are usually more robust than relying on any single one.