OOP for Data Science
Object-oriented programming enables clean, modular, and reusable code. It is essential for building ML pipelines, custom transformers, and data structures.
Classes and Objects
class DataPoint:
    """Represents a single data point with features and label.

    Attributes:
        dataset_name: Class attribute shared by all instances.
        features: The feature values passed to the constructor (stored as-is,
            not copied).
        label: The label passed to the constructor.
    """

    # Class attribute (shared across instances)
    dataset_name = "default"

    def __init__(self, features: list[float], label: str):
        # Instance attributes
        self.features = features
        self.label = label
        self._id = id(self)  # non-public by convention (single underscore)

    def __repr__(self):
        return f"DataPoint(features={self.features}, label={self.label})"

    def distance_to(self, other: "DataPoint") -> float:
        """Return the Euclidean distance between this point and *other*.

        Raises:
            ValueError: If the two points have a different number of features.
        """
        # zip() stops at the shorter sequence, which would silently ignore the
        # extra dimensions of the longer point; reject the mismatch instead.
        if len(self.features) != len(other.features):
            raise ValueError(
                "Cannot compute distance between points with "
                f"{len(self.features)} and {len(other.features)} features"
            )
        squared = sum((a - b) ** 2 for a, b in zip(self.features, other.features))
        return squared ** 0.5
# Usage: build two labelled points and measure the distance between them.
dp1 = DataPoint([1.0, 2.0, 3.0], "positive")
dp2 = DataPoint([4.0, 5.0, 6.0], "negative")
dp1.distance_to(dp2) # sqrt(27) = 5.196...; the result is not stored or printed here
Inheritance and Polymorphism
from abc import ABC, abstractmethod
import numpy as np
class BaseModel(ABC):
    """Abstract base class for ML models.

    Subclasses must implement ``fit`` and ``predict``; ``score`` is provided
    here and is built on top of ``predict``.

    Attributes:
        is_fitted: Initialised to False; subclasses are expected to set it to
            True once ``fit`` has run.
    """

    def __init__(self):
        self.is_fitted = False

    @abstractmethod
    def fit(self, X: np.ndarray, y: np.ndarray) -> "BaseModel":
        """Train the model on ``X`` and ``y``; implementations return the model."""

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for ``X``."""

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return the R² score of ``predict(X)`` against ``y``.

        R² is ``1 - SS_res / SS_tot``. When ``y`` is constant ``SS_tot`` is
        zero and the ratio is undefined; in that case 1.0 is returned for a
        perfect prediction and 0.0 otherwise, instead of nan/-inf.
        """
        predictions = self.predict(X)
        ss_res = np.sum((y - predictions) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        if ss_tot == 0:
            # Constant target: avoid dividing by zero.
            return 1.0 if ss_res == 0 else 0.0
        return 1 - (ss_res / ss_tot)
class LinearRegression(BaseModel):
    """Least-squares linear model solved with ``np.linalg.lstsq``.

    Attributes:
        coefficients: Fitted weights (None until ``fit`` is called).
        intercept: Fitted bias term (None until ``fit`` is called).
    """

    def __init__(self):
        super().__init__()
        self.coefficients = None
        self.intercept = None

    def fit(self, X, y):
        """Solve for intercept and coefficients, mark the model fitted, return self.

        NOTE(review): ``X.shape[0]`` is read, so X is presumably a 2-D NumPy
        array of shape (n_samples, n_features) — confirm against callers.
        """
        # Prepend a column of ones so the first solved parameter is the intercept.
        bias_column = np.ones((X.shape[0], 1))
        design_matrix = np.c_[bias_column, X]
        solution, *_ = np.linalg.lstsq(design_matrix, y, rcond=None)
        self.intercept, self.coefficients = solution[0], solution[1:]
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return ``X @ coefficients + intercept``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.is_fitted:
            return X @ self.coefficients + self.intercept
        raise RuntimeError("Model not fitted")
# Usage: fit a least-squares line to five points and compute its R² score.
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 4, 5, 4, 5])
model = LinearRegression()
model.fit(X, y)
model.score(X, y) # ≈ 0.6 (fitted line y = 0.6x + 2.2; SS_res = 2.4, SS_tot = 6.0)
Encapsulation
class DataProcessor:
    """Holds a numeric data set and exposes cached summary statistics."""

    def __init__(self, data: list):
        # Keep a private copy: the statistics are cached, so later mutation of
        # the caller's list must not be able to make the cache stale.
        self._data = list(data)  # protected (convention)
        # Double underscore triggers name mangling (_DataProcessor__cache);
        # that discourages outside access but is not truly private.
        self.__cache = {}

    @property
    def data(self):
        """Read-only access to data (a fresh copy on every access)."""
        return self._data.copy()

    @property
    def statistics(self):
        """Return mean, min, max and population standard deviation as a dict.

        The dict is computed on first access and cached afterwards.

        Raises:
            ZeroDivisionError: If the data set is empty.
        """
        if "stats" not in self.__cache:
            values = self._data
            count = len(values)
            # Compute the mean once; it is reused for every squared deviation.
            mean = sum(values) / count
            variance = sum((x - mean) ** 2 for x in values) / count
            self.__cache["stats"] = {
                "mean": mean,
                "min": min(values),
                "max": max(values),
                "std": variance ** 0.5,
            }
        return self.__cache["stats"]

    def __validate(self, value):
        """Private validation method.

        Raises TypeError unless *value* is an int or float. It is not called
        anywhere in this class as shown.
        """
        if not isinstance(value, (int, float)):
            raise TypeError("Data must be numeric")
# Usage: summary statistics are exposed as a property, so no call parentheses.
processor = DataProcessor([1, 2, 3, 4, 5])
processor.statistics # {'mean': 3.0, 'min': 1, 'max': 5, 'std': 1.414...}; std divides by n (population form)
processor._data # Accessible but convention says don't
Dunder (Magic) Methods
import numpy as np
class Vector:
    """2D Vector with mathematical operations.

    Supports ``+``/``-`` with other vectors, ``*`` with a scalar on either
    side, ``abs()`` for the magnitude, equality/hashing, indexing (0 and 1),
    and calling one vector with another for the dot product.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    # String representations
    def __repr__(self):
        return f"Vector({self.x}, {self.y})"

    def __str__(self):
        return f"({self.x}, {self.y})"

    # Arithmetic operators
    # Returning NotImplemented for unsupported operands lets Python try the
    # other operand's reflected method and finally raise a clear TypeError.
    def __add__(self, other):
        if not isinstance(other, Vector):
            return NotImplemented
        return Vector(self.x + other.x, self.y + other.y)

    def __sub__(self, other):
        if not isinstance(other, Vector):
            return NotImplemented
        return Vector(self.x - other.x, self.y - other.y)

    def __mul__(self, scalar):
        """Scale both components by *scalar*."""
        if isinstance(scalar, Vector):
            # Vector * Vector is not scaling; the dot product is v1(v2).
            return NotImplemented
        return Vector(self.x * scalar, self.y * scalar)

    def __rmul__(self, scalar):
        return self.__mul__(scalar)

    def __abs__(self):
        """Return the Euclidean length of the vector."""
        return (self.x**2 + self.y**2) ** 0.5

    def __eq__(self, other):
        if not isinstance(other, Vector):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __hash__(self):
        # Consistent with __eq__: equal components give equal hashes.
        return hash((self.x, self.y))

    # Container protocol
    def __len__(self):
        return 2

    def __getitem__(self, index):
        if index == 0:
            return self.x
        if index == 1:
            return self.y
        raise IndexError("Vector index out of range")

    # Callable
    def __call__(self, other):
        return self.x * other.x + self.y * other.y  # dot product
# Usage: each line below exercises one of Vector's dunder methods.
v1 = Vector(1, 2)
v2 = Vector(3, 4)
print(v1 + v2) # (4, 6)    __add__, printed via __str__
print(v1 * 3) # (3, 6)    __mul__
print(abs(v1)) # 2.236...  __abs__ (sqrt(5))
print(v1(v2)) # 11 (dot product) via __call__
Dataclasses
from dataclasses import dataclass, field
from typing import List
@dataclass
class Student:
    """Student record with a derived grade-point average.

    ``grades`` uses ``default_factory`` so every instance gets its own list.
    """

    name: str
    age: int
    grades: List[float] = field(default_factory=list)

    def __post_init__(self):
        # Runs right after the generated __init__: reject negative ages.
        if self.age < 0:
            raise ValueError("Age cannot be negative")

    @property
    def gpa(self):
        """Arithmetic mean of ``grades``, or 0.0 when there are none."""
        if not self.grades:
            return 0.0
        return sum(self.grades) / len(self.grades)
@dataclass(frozen=True)
class Hyperparameters:
    """Immutable bundle of training settings.

    ``frozen=True`` makes attribute assignment on an instance raise
    ``dataclasses.FrozenInstanceError``.
    """

    learning_rate: float = 0.01
    epochs: int = 100
    batch_size: int = 32
    dropout: float = 0.2
# Auto-generated methods: @dataclass supplies __init__ and __repr__ for Student.
s = Student("Alice", 20, [3.5, 3.8, 4.0])
print(s) # Student(name='Alice', age=20, grades=[3.5, 3.8, 4.0])
print(s.gpa) # 3.766... (mean of the three grades)
# Frozen dataclass: unspecified fields keep their defaults.
hp = Hyperparameters(learning_rate=0.001)
# Left commented out on purpose; assigning to a frozen instance raises:
# hp.learning_rate = 0.1 # FrozenInstanceError
Context Managers
import time
from contextlib import contextmanager
class Timer:
    """Context manager that times the enclosed block with ``time.perf_counter``.

    Attributes:
        start: Counter value captured on entry.
        elapsed: Seconds between entry and exit; set (and printed) on exit.
    """

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        finished = time.perf_counter()
        self.elapsed = finished - self.start
        print(f"Elapsed: {self.elapsed:.4f}s")
        # A falsy return value lets any exception from the block propagate.
        return False
# Usage: `elapsed` is assigned in __exit__, so read it after the with-block.
with Timer() as t:
    sum(range(1_000_000))
print(f"Time: {t.elapsed:.4f}s")
# Context manager as function
@contextmanager
def managed_resource(name):
    """Acquire a named dummy resource for the duration of a ``with`` block.

    Yields:
        A dict ``{"name": name, "active": True}``.

    If the ``with`` body raises, the error is printed, the resource is flagged
    inactive, and the exception is re-raised so the caller still sees it. The
    release message is printed in every case.
    """
    print(f"Acquiring {name}")
    resource = {"name": name, "active": True}
    try:
        yield resource
    except Exception as e:
        print(f"Error: {e}")
        resource["active"] = False
        # Without this re-raise the generator would swallow the exception and
        # the code after the with-block would run as if nothing had failed.
        raise
    finally:
        print(f"Releasing {name}")
# The yielded dict is bound to `res` for the duration of the block.
with managed_resource("database") as res:
    print(f"Using {res['name']}")
Design Patterns for Data Science
from abc import ABC, abstractmethod
from typing import List
# Strategy Pattern
class PreprocessingStrategy(ABC):
    """Interface for interchangeable preprocessing algorithms."""

    @abstractmethod
    def transform(self, data: List[float]) -> List[float]:
        """Transform *data* and return the result; subclasses must override."""
class NormalizeStrategy(PreprocessingStrategy):
    """Min-max scaling: maps the smallest value to 0.0 and the largest to 1.0."""

    def transform(self, data):
        """Return *data* rescaled to the range [0, 1].

        Empty input yields an empty list. When every value is identical the
        range is zero, so each value maps to 0.0 instead of dividing by zero.
        """
        if len(data) == 0:
            return []
        min_val, max_val = min(data), max(data)
        span = max_val - min_val
        if span == 0:
            return [0.0 for _ in data]
        return [(x - min_val) / span for x in data]
class StandardizeStrategy(PreprocessingStrategy):
    """Z-score scaling using the population standard deviation (divides by n)."""

    def transform(self, data):
        """Return *data* shifted to zero mean and scaled to unit deviation.

        Empty input yields an empty list. When every value is identical the
        standard deviation is zero, so each value maps to 0.0 instead of
        dividing by zero.
        """
        if len(data) == 0:
            return []
        mean = sum(data) / len(data)
        std = (sum((x - mean) ** 2 for x in data) / len(data)) ** 0.5
        if std == 0:
            return [0.0 for _ in data]
        return [(x - mean) / std for x in data]
class DataProcessor:
    """Applies a pluggable preprocessing strategy to data.

    The strategy can be replaced at any time through the ``strategy``
    property, so callers do not need to touch the ``_strategy`` attribute.
    """

    def __init__(self, strategy: "PreprocessingStrategy"):
        self._strategy = strategy

    @property
    def strategy(self):
        """The strategy object currently used by ``process``."""
        return self._strategy

    @strategy.setter
    def strategy(self, new_strategy):
        self._strategy = new_strategy

    def process(self, data):
        """Return ``strategy.transform(data)`` for the current strategy."""
        return self._strategy.transform(data)
# Swap strategies at runtime by replacing the object held in `_strategy`.
# NOTE: `_strategy` is a non-public attribute, so this reaches past the
# class's public interface.
processor = DataProcessor(NormalizeStrategy())
processor.process([1, 2, 3, 4, 5]) # [0.0, 0.25, 0.5, 0.75, 1.0]
processor._strategy = StandardizeStrategy()
processor.process([1, 2, 3, 4, 5]) # [-1.414, -0.707, 0, 0.707, 1.414]
# Observer Pattern
class TrainingCallback:
    """Base observer for training events; every hook is a no-op here."""

    def on_epoch_start(self, epoch):
        """Hook for the start of an epoch; does nothing in the base class."""

    def on_epoch_end(self, epoch, loss):
        """Hook for the end of an epoch; does nothing in the base class."""

    def on_training_end(self, metrics):
        """Hook for the end of training; does nothing in the base class."""
class EarlyStopping(TrainingCallback):
    """Callback that signals a stop after ``patience`` epochs without improvement.

    Attributes:
        patience: Number of consecutive non-improving epochs tolerated.
        best_loss: Lowest loss seen so far (starts at +inf).
        counter: Consecutive epochs without a new best loss.
    """

    def __init__(self, patience=5):
        self.patience = patience
        self.best_loss = float("inf")
        self.counter = 0

    def on_epoch_end(self, epoch, loss):
        """Update the tracker with *loss*; return True when training should stop.

        NOTE(review): the original indentation was lost; the patience check is
        placed in the no-improvement branch — confirm this nesting.
        """
        if loss < self.best_loss:
            # New best: remember it and restart the patience count.
            self.best_loss = loss
            self.counter = 0
            return False

        self.counter += 1
        if self.counter < self.patience:
            return False

        print(f"Early stopping at epoch {epoch}")
        return True
# Factory Pattern
class ModelFactory:
    """Name-to-class registry that builds model instances on request."""

    # One dict defined on the class; both classmethods read and write it.
    _registry = {}

    @classmethod
    def register(cls, name, model_class):
        """Store *model_class* under *name*, replacing any existing entry."""
        cls._registry[name] = model_class

    @classmethod
    def create(cls, name, **kwargs):
        """Instantiate the class registered under *name* with ``**kwargs``.

        Raises:
            ValueError: If *name* has not been registered.
        """
        registry = cls._registry
        if name in registry:
            return registry[name](**kwargs)
        raise ValueError(f"Unknown model: {name}")
# Register models
# DecisionTreeClassifier is not imported anywhere else in this file, so bring
# it into scope here before registering it.
from sklearn.tree import DecisionTreeClassifier

ModelFactory.register("linear", LinearRegression)
ModelFactory.register("tree", DecisionTreeClassifier)
# Create dynamically. Keyword arguments are forwarded unchanged to the model's
# constructor, so only pass ones it accepts: the LinearRegression defined in
# this file takes none (the previous learning_rate=0.01 raised TypeError).
model = ModelFactory.create("linear")
Practical Example: ML Pipeline with OOP
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Custom transformer to remove outliers using IQR.

    ``fit`` learns per-column bounds ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; ``transform`` keeps only the rows whose every
    column lies within those bounds.
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        """Compute and store ``lower_`` / ``upper_`` from the quartiles of X."""
        first_quartile, third_quartile = np.percentile(X, [25, 75], axis=0)
        margin = self.factor * (third_quartile - first_quartile)
        self.lower_ = first_quartile - margin
        self.upper_ = third_quartile + margin
        return self

    def transform(self, X, y=None):
        """Return the rows of X that fall inside the fitted bounds.

        NOTE: ``y`` is accepted but never used, so labels are not filtered
        along with X; the returned array can have fewer rows than the input.
        """
        not_too_low = X >= self.lower_
        not_too_high = X <= self.upper_
        keep_rows = np.all(not_too_low & not_too_high, axis=1)
        return X[keep_rows]
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Custom transformer for feature engineering.

    Appends polynomial product features to the original columns: for every
    degree from 2 up to ``poly_degree``, one column per combination (with
    repetition) of input features. With the default ``poly_degree=2`` this is
    the squares and pairwise products, in the order (0,0), (0,1), ..., (1,1), ...
    A ``poly_degree`` below 2 adds no columns.
    """

    def __init__(self, poly_degree=2):
        self.poly_degree = poly_degree

    def fit(self, X, y=None):
        """Record the number of input columns as ``n_features_``."""
        self.n_features_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        """Return a new array holding X followed by the product features."""
        from itertools import combinations_with_replacement

        # Collect all columns first and stack once, rather than re-allocating
        # the growing array for every added feature.
        columns = [X]
        feature_indices = range(self.n_features_)
        for degree in range(2, self.poly_degree + 1):
            for combo in combinations_with_replacement(feature_indices, degree):
                term = X[:, combo[0]]
                for index in combo[1:]:
                    term = term * X[:, index]
                columns.append(term)
        return np.column_stack(columns)
# Usage in pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# NOTE(review): X_train, y_train, X_test and y_test are not defined in this
# snippet; they are assumed to be NumPy arrays prepared beforehand.
#
# OutlierRemover.transform drops rows from X only. Inside a Pipeline the
# labels are not filtered with it, so the sample counts of X and y would stop
# matching at fit and score time. Filter X and y together up front and keep
# only row-preserving steps in the pipeline.
remover = OutlierRemover(factor=2.0).fit(X_train)
inlier_mask = np.all(
    (X_train >= remover.lower_) & (X_train <= remover.upper_), axis=1
)
X_train_clean, y_train_clean = X_train[inlier_mask], y_train[inlier_mask]

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("feature_engineer", FeatureEngineer(poly_degree=2)),
    ("classifier", LogisticRegression()),
])
pipe.fit(X_train_clean, y_train_clean)
pipe.score(X_test, y_test)
Summary
- Use abstract base classes for interface contracts
- Apply inheritance for model hierarchies
- Leverage dataclasses for clean data containers
- Implement magic methods for intuitive object behavior
- Use context managers for resource management
- Apply design patterns (Strategy, Observer, Factory) for scalable ML systems
- Extend sklearn with custom transformers using OOP