Outlier Detection Algorithms
Outliers are data points that deviate significantly from the majority. Detecting them is critical for fraud detection, quality control, data cleaning, and security. Different algorithms excel in different scenarios.
Anomaly Detection Methods
Why Outlier Detection Matters
A single fraudulent transaction, a manufacturing defect, or a corrupted sensor reading can have outsized impact. Automated outlier detection catches anomalies before they cause damage.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
Generate Multimodal Data with Outliers
# Seed NumPy's global generator so the synthetic dataset is reproducible.
np.random.seed(42)

# Three Gaussian blobs make up the "normal" population; each call gives
# the per-axis mean, the per-axis standard deviation and the sample shape.
cluster1 = np.random.normal(loc=[0, 0], scale=[1, 1], size=(200, 2))
cluster2 = np.random.normal(loc=[5, 5], scale=[0.8, 0.8], size=(200, 2))
cluster3 = np.random.normal(loc=[0, 5], scale=[0.5, 1.5], size=(100, 2))
normal_data = np.concatenate((cluster1, cluster2, cluster3), axis=0)

# Injected outliers are drawn uniformly from the square [-3, 8) x [-3, 8).
n_outliers = 30
outliers = np.random.uniform(low=-3, high=8, size=(n_outliers, 2))

# Inliers come first, outliers last; the ground truth marks outliers with 1.
X = np.concatenate((normal_data, outliers), axis=0)
labels_true = np.concatenate((
    np.zeros(len(normal_data), dtype=int),
    np.ones(n_outliers, dtype=int),
))

n_total = len(X)
outlier_share = n_outliers / n_total
print(f"Dataset: {n_total} points, {n_outliers} outliers ({outlier_share:.1%})")
Isolation Forest
Isolation Forest isolates anomalies by randomly partitioning data. Anomalies need fewer partitions to isolate.
# Fit Isolation Forest
iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42,
    n_jobs=-1,
)
predictions_if = iso_forest.fit_predict(X)
scores_if = iso_forest.score_samples(X)

# Points predicted as -1 are the ones the forest flags as outliers.
flagged_if = predictions_if == -1
n_detected = flagged_if.sum()
print(f"Isolation Forest detected {n_detected} outliers")
print(f"True outliers: {n_outliers}")

# Evaluation against the injected ground truth (1 = outlier)
from sklearn.metrics import precision_score, recall_score, f1_score
precision = precision_score(labels_true, flagged_if)
recall = recall_score(labels_true, flagged_if)
f1 = f1_score(labels_true, flagged_if)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# Anomaly scores, summarised separately for true inliers and true outliers
normal_scores = scores_if[labels_true == 0]
outlier_scores = scores_if[labels_true == 1]
print("\nScore distribution:")
print(f" Normal: {normal_scores.mean():.3f} ± {normal_scores.std():.3f}")
print(f" Outlier: {outlier_scores.mean():.3f} ± {outlier_scores.std():.3f}")
Local Outlier Factor (LOF)
LOF compares the local density of a point to that of its neighbors. Points in regions that are sparse relative to their neighbors are outliers.
# Fit LOF in outlier-detection mode (novelty=False) on the full dataset
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05, novelty=False)
predictions_lof = lof.fit_predict(X)
scores_lof = lof.negative_outlier_factor_

# Points predicted as -1 are the ones LOF flags as outliers.
lof_flags = predictions_lof == -1
n_detected = lof_flags.sum()
precision = precision_score(labels_true, lof_flags)
recall = recall_score(labels_true, lof_flags)
print(f"LOF detected {n_detected} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# LOF scores are relative — lower = more abnormal
lof_normal = scores_lof[labels_true == 0]
lof_outlier = scores_lof[labels_true == 1]
print("\nLOF score ranges:")
print(f" Normal: {lof_normal.min():.3f} to {lof_normal.max():.3f}")
print(f" Outlier: {lof_outlier.min():.3f} to {lof_outlier.max():.3f}")
DBSCAN for Outlier Detection
DBSCAN marks points in low-density regions as noise (label = -1).
# DBSCAN — points labelled -1 (noise) are treated as outliers
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X)

noise_mask = clusters == -1
n_noise = noise_mask.sum()

# Distinct labels and their sizes; the noise label does not count as a cluster.
unique, counts = np.unique(clusters, return_counts=True)
n_clusters = len(unique) - int(-1 in unique)
print(f"DBSCAN: {n_clusters} clusters, {n_noise} noise points")

# Score the noise points against the injected ground truth
precision = precision_score(labels_true, noise_mask)
recall = recall_score(labels_true, noise_mask)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# Cluster sizes
for cluster_id, size in zip(unique, counts):
    label = f"cluster {cluster_id}" if cluster_id != -1 else "noise"
    print(f" {label}: {size} points")
Mahalanobis Distance
Mahalanobis distance accounts for correlations between features — ideal for multivariate normal data.
def mahalanobis_outlier_detection(X, threshold_percentile=97):
    """Detect outliers using Mahalanobis distance.

    Each row's distance to the sample mean is measured under the sample
    covariance. Rows whose distance exceeds the given percentile of all
    computed distances are flagged.

    Args:
        X: Array-like of shape (n_samples, n_features). A 1-D input is
            treated as a single feature.
        threshold_percentile: Percentile (0-100) of the computed distances
            used as the cut-off.

    Returns:
        Tuple ``(distances, outliers)``: a float array of per-row distances
        and a boolean array that is True where the distance is strictly
        greater than the cut-off.

    Raises:
        ValueError: If ``X`` is not 1-D/2-D or has fewer than two samples
            (the covariance is undefined in that case).
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    if X.ndim != 2 or X.shape[0] < 2:
        raise ValueError(
            "X must have shape (n_samples, n_features) with at least 2 samples"
        )

    mean = X.mean(axis=0)
    # np.cov collapses to a 0-d array for a single feature; force a matrix.
    cov = np.atleast_2d(np.cov(X, rowvar=False))
    # Regularize covariance matrix so it stays invertible
    cov = cov + np.eye(cov.shape[0]) * 1e-6
    inv_cov = np.linalg.inv(cov)

    # Quadratic form (x - mean)^T inv_cov (x - mean) for every row at once.
    centered = X - mean
    squared = np.einsum("ij,jk,ik->i", centered, inv_cov, centered)
    # Clamp tiny negative round-off before taking the square root.
    distances = np.sqrt(np.maximum(squared, 0.0))

    # Empirical threshold: a percentile of the observed distances
    threshold = np.percentile(distances, threshold_percentile)
    outliers = distances > threshold
    return distances, outliers
distances, outliers_mahal = mahalanobis_outlier_detection(X)
precision = precision_score(labels_true, outliers_mahal)
recall = recall_score(labels_true, outliers_mahal)
print(f"Mahalanobis: {outliers_mahal.sum()} outliers detected")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# Chi-squared threshold
# For Gaussian data the *squared* Mahalanobis distance follows a chi-squared
# distribution with one degree of freedom per feature, so the squared
# distances are what must be compared against the chi-squared quantile.
from scipy.stats import chi2
chi2_threshold = chi2.ppf(0.97, df=X.shape[1])
print(f"Chi-squared threshold (97%): {chi2_threshold:.2f}")
print(f"Points beyond threshold: {(distances ** 2 > chi2_threshold).sum()}")
Robust Statistics
Methods that are resistant to the influence of outliers.
# MAD (Median Absolute Deviation) — robust alternative to std
def mad_outlier_detection(X, threshold=3.5):
    """Flag rows in which any feature has a large MAD-based modified Z-score.

    Args:
        X: Array of shape (n_samples, n_features).
        threshold: Absolute modified Z-score above which a feature value
            counts as outlying.

    Returns:
        Tuple ``(modified_z, outliers)``: the per-feature modified Z-scores
        (same shape as ``X``) and a boolean array with one entry per row.
    """
    center = np.median(X, axis=0)
    deviation = X - center
    spread = np.median(np.abs(deviation), axis=0)

    # Modified Z-score: 0.6745 is the usual scaling constant for MAD-based
    # scores; the 1e-10 term avoids division by zero when a feature's MAD is 0.
    modified_z = (0.6745 * deviation) / (spread + 1e-10)

    # A row is an outlier as soon as one of its features exceeds the threshold.
    exceeds = np.abs(modified_z) > threshold
    outliers = exceeds.any(axis=1)
    return modified_z, outliers
z_scores, outliers_mad = mad_outlier_detection(X)
n_flagged_mad = outliers_mad.sum()
precision = precision_score(labels_true, outliers_mad)
recall = recall_score(labels_true, outliers_mad)
print(f"MAD detection: {n_flagged_mad} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")

# Robust Z-score (using median and MAD), with the MAD taken from SciPy
feature_median = np.median(X, axis=0)
feature_mad = stats.median_abs_deviation(X, axis=0)
robust_z = 0.6745 * (X - feature_median) / (feature_mad + 1e-10)
largest_abs_z = np.abs(robust_z).max()
print(f"\nRobust Z-score range: {largest_abs_z:.2f}")
Elliptic Envelope (Minimum Covariance Determinant)
Fits a robust ellipse around normal data; points outside are outliers.
# Fit the envelope on the full dataset with the same 5% contamination
# setting used for the other detectors.
ee = EllipticEnvelope(contamination=0.05, support_fraction=0.8, random_state=42)
predictions_ee = ee.fit_predict(X)

# Points predicted as -1 fall outside the fitted envelope.
ee_flags = predictions_ee == -1
n_detected = ee_flags.sum()
precision = precision_score(labels_true, ee_flags)
recall = recall_score(labels_true, ee_flags)
print(f"Elliptic Envelope: {n_detected} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}")
Ensemble Methods
Combining multiple detectors for robust detection.
def ensemble_outlier_detection(X, contamination=0.05):
    """Majority vote from multiple detectors.

    Fits an Isolation Forest, a Local Outlier Factor and an Elliptic
    Envelope on ``X`` and flags a point when more than half of the
    detectors label it -1.

    Args:
        X: Array of shape (n_samples, n_features).
        contamination: Expected outlier fraction passed to every detector.

    Returns:
        Tuple ``(ensemble_pred, votes)``: an int array with 1 for points
        flagged by the majority, and a float array with the number of
        detectors that flagged each point.
    """
    detectors = {
        'iso_forest': IsolationForest(contamination=contamination, random_state=42),
        # novelty must be False here: LocalOutlierFactor only offers
        # fit_predict in outlier-detection mode, not in novelty mode.
        'lof': LocalOutlierFactor(contamination=contamination, novelty=False),
        'ee': EllipticEnvelope(contamination=contamination, random_state=42),
    }
    votes = np.zeros(len(X))
    for detector in detectors.values():
        pred = detector.fit_predict(X)
        votes += (pred == -1).astype(int)
    # Majority vote: strictly more than half of the detectors must agree.
    majority = len(detectors) // 2 + 1
    ensemble_pred = (votes >= majority).astype(int)
    return ensemble_pred, votes
ensemble_pred, votes = ensemble_outlier_detection(X)

# ensemble_pred already uses 1 for outliers, so it is scored directly.
n_flagged_ensemble = ensemble_pred.sum()
precision = precision_score(labels_true, ensemble_pred)
recall = recall_score(labels_true, ensemble_pred)
f1 = f1_score(labels_true, ensemble_pred)
print(f"Ensemble: {n_flagged_ensemble} outliers")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# How many points received 0, 1, 2, ... votes
vote_histogram = np.bincount(votes.astype(int))
print(f"Vote distribution: {vote_histogram}")
Comparison Summary
# Boolean outlier flags per method, in the order they should be reported.
method_flags = {
    'Isolation Forest': predictions_if == -1,
    'LOF': predictions_lof == -1,
    'DBSCAN': clusters == -1,
    'Mahalanobis': outliers_mahal,
    'MAD': outliers_mad,
    'Elliptic Envelope': predictions_ee == -1,
    'Ensemble': ensemble_pred == 1,
}

# Score every method against the same ground truth.
results = {}
for name, pred in method_flags.items():
    results[name] = dict(
        Detected=pred.sum(),
        Precision=precision_score(labels_true, pred),
        Recall=recall_score(labels_true, pred),
        F1=f1_score(labels_true, pred),
    )

# One row per method, one column per metric.
results_df = pd.DataFrame(results).T.round(3)
print(results_df.to_string())
Best Practices
- Choose based on data distribution — Mahalanobis for Gaussian, Isolation Forest for complex
- Tune contamination — know what fraction of data is expected to be anomalous
- Use ensemble methods — combine detectors for robustness
- Visualize — 2D/3D plots reveal structure that metrics miss
- Domain knowledge — some "outliers" are legitimate rare events
- Evaluate carefully — precision/recall tradeoff depends on cost of false positives
Summary
Outlier detection requires matching the algorithm to the data structure. Isolation Forest handles high-dimensional data, LOF finds local anomalies, DBSCAN identifies noise, and Mahalanobis works for multivariate normal data. Ensemble methods provide the most robust detection.