CW

OOP for Data Science

Module 1: Introduction & Python Basics — Free Lesson

Advertisement

OOP for Data Science

Object-oriented programming enables clean, modular, and reusable code. Essential for building ML pipelines, custom transformers, and data structures.

Classes and Objects

class DataPoint:
    """Represents a single data point with features and label.

    Attributes:
        features: Feature values; the list is stored by reference, not copied.
        label: Label attached to this point.
    """

    # Class attribute (shared across instances)
    dataset_name = "default"

    def __init__(self, features: list[float], label: str):
        # Instance attributes
        self.features = features
        self.label = label
        self._id = id(self)  # private by convention

    def __repr__(self):
        return f"DataPoint(features={self.features}, label={self.label})"

    def distance_to(self, other: "DataPoint") -> float:
        """Euclidean distance between two data points.

        Args:
            other: The point to measure against.

        Returns:
            The square root of the summed squared feature differences.

        Raises:
            ValueError: If the two points have a different number of features.
        """
        # zip() stops at the shorter sequence, so without this check a
        # dimension mismatch would silently yield a wrong (too small) distance.
        if len(self.features) != len(other.features):
            raise ValueError(
                "Cannot compute distance between points with "
                f"{len(self.features)} and {len(other.features)} features"
            )
        return sum((a - b) ** 2 for a, b in zip(self.features, other.features)) ** 0.5

# Usage: build two labelled 3-feature points and measure the distance between them
dp1 = DataPoint([1.0, 2.0, 3.0], "positive")
dp2 = DataPoint([4.0, 5.0, 6.0], "negative")
dp1.distance_to(dp2)  # sqrt(27) = 5.196...

Inheritance and Polymorphism

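Class diagram — BaseModel and the model classes derived from it:

  • BaseModel: + fit(X, y), + predict(X), + score(X, y)
  • LinearModel: + fit(X, y), + predict(X), + coefficients
  • TreeModel: + fit(X, y), + predict(X), + feature_importance
  • EnsembleModel: + fit(X, y), + predict(X), + base_models
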
from abc import ABC, abstractmethod
import numpy as np

class BaseModel(ABC):
    """Abstract base class for ML models.

    Subclasses must implement ``fit`` and ``predict``; ``score`` is provided
    here and is built on top of ``predict``.
    """

    def __init__(self):
        # Flag available to subclasses to record that fit() has run.
        self.is_fitted = False

    @abstractmethod
    def fit(self, X: np.ndarray, y: np.ndarray) -> "BaseModel":
        """Train the model on features X and targets y."""

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for features X."""

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """R² score.

        Args:
            X: Features passed to ``predict``.
            y: True target values.

        Returns:
            ``1 - SS_res / SS_tot``. When ``y`` is constant (``SS_tot == 0``)
            the ratio is undefined, so 1.0 is returned for perfect predictions
            and 0.0 otherwise.
        """
        predictions = self.predict(X)
        ss_res = np.sum((y - predictions) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        if ss_tot == 0:
            # Avoid 0/0 (nan) or x/0 (-inf) for a constant target.
            return 1.0 if ss_res == 0 else 0.0
        return 1 - (ss_res / ss_tot)

class LinearRegression(BaseModel):
    """Least-squares linear model with an intercept, solved via np.linalg.lstsq."""

    def __init__(self):
        super().__init__()
        # Both stay None until fit() has been called.
        self.coefficients = None
        self.intercept = None

    def fit(self, X, y):
        """Estimate intercept and coefficients from X and y; returns self."""
        n_samples = X.shape[0]
        bias_column = np.ones((n_samples, 1))
        # Prepending a column of ones lets the solver estimate the intercept
        # as the first entry of the solution vector.
        design = np.c_[bias_column, X]
        solution = np.linalg.lstsq(design, y, rcond=None)[0]
        self.intercept, self.coefficients = solution[0], solution[1:]
        self.is_fitted = True
        return self

    def predict(self, X):
        """Return X @ coefficients + intercept; raises RuntimeError before fit()."""
        if self.is_fitted:
            return X @ self.coefficients + self.intercept
        raise RuntimeError("Model not fitted")

# Usage: fit a one-feature model on five points and score it on the same data
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 4, 5, 4, 5])
model = LinearRegression()
model.fit(X, y)
model.score(X, y)  # ≈ 0.6 (fit is y = 2.2 + 0.6x; SS_res = 2.4, SS_tot = 6)

Encapsulation

class DataProcessor:
    """Wraps a list of numbers and exposes cached summary statistics."""

    def __init__(self, data: list):
        self._data = data          # protected (convention)
        # Name-mangled to _DataProcessor__cache: this discourages outside
        # access but does not prevent it.
        self.__cache = {}

    @property
    def data(self):
        """Read-only access to data (returns a copy of the underlying list)."""
        return self._data.copy()

    @property
    def statistics(self):
        """Computed property.

        Returns:
            A dict with keys "mean", "min", "max" and "std" (population
            standard deviation). The dict is computed on first access and
            cached, so later accesses return the same object.

        Raises:
            ZeroDivisionError: If the data list is empty.
        """
        if "stats" not in self.__cache:
            values = self._data
            count = len(values)
            # Compute the mean once; recomputing it inside the generator
            # below would make the std calculation quadratic.
            mean = sum(values) / count
            variance = sum((x - mean) ** 2 for x in values) / count
            self.__cache["stats"] = {
                "mean": mean,
                "min": min(values),
                "max": max(values),
                "std": variance ** 0.5,
            }
        return self.__cache["stats"]

    def __validate(self, value):
        """Private validation method (not called anywhere in this class)."""
        if not isinstance(value, (int, float)):
            raise TypeError("Data must be numeric")

# Usage: statistics are computed on first access to the property
processor = DataProcessor([1, 2, 3, 4, 5])
processor.statistics  # {'mean': 3.0, 'min': 1, 'max': 5, 'std': 1.414...}
processor._data       # Accessible but convention says don't

Dunder (Magic) Methods

import numpy as np

class Vector:
    """2D Vector with mathematical operations.

    Supports +, -, scalar * (either side), abs(), ==, hashing, len(),
    indexing with 0/1, and calling with another Vector for the dot product.
    Operators return NotImplemented for unsupported operand types so Python
    can raise TypeError (or fall back for ==) instead of AttributeError.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    # String representations
    def __repr__(self):
        return f"Vector({self.x}, {self.y})"

    def __str__(self):
        return f"({self.x}, {self.y})"

    # Arithmetic operators
    def __add__(self, other):
        """Component-wise sum with another Vector."""
        if not isinstance(other, Vector):
            return NotImplemented
        return Vector(self.x + other.x, self.y + other.y)

    def __sub__(self, other):
        """Component-wise difference with another Vector."""
        if not isinstance(other, Vector):
            return NotImplemented
        return Vector(self.x - other.x, self.y - other.y)

    def __mul__(self, scalar):
        """Scale both components by a scalar."""
        # Vector * Vector would otherwise build a Vector of Vectors through
        # the reflected multiply; reject it (use v1(v2) for the dot product).
        if isinstance(scalar, Vector):
            return NotImplemented
        return Vector(self.x * scalar, self.y * scalar)

    def __rmul__(self, scalar):
        """Support ``scalar * vector``."""
        return self.__mul__(scalar)

    def __abs__(self):
        """Euclidean length."""
        return (self.x**2 + self.y**2) ** 0.5

    def __eq__(self, other):
        """Equal when both components match; non-Vectors never compare equal."""
        if not isinstance(other, Vector):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __hash__(self):
        # Kept consistent with __eq__: equal vectors hash equally.
        return hash((self.x, self.y))

    # Container protocol
    def __len__(self):
        return 2

    def __getitem__(self, index):
        """Return x for index 0, y for index 1; otherwise raise IndexError."""
        if index == 0: return self.x
        if index == 1: return self.y
        raise IndexError("Vector index out of range")

    # Callable
    def __call__(self, other):
        return self.x * other.x + self.y * other.y  # dot product

# Usage: print() uses __str__, so vectors display as "(x, y)"
v1 = Vector(1, 2)
v2 = Vector(3, 4)
print(v1 + v2)     # (4, 6)
print(v1 * 3)      # (3, 6)
print(abs(v1))     # 2.236... (sqrt(5))
print(v1(v2))      # 11 (dot product: 1*3 + 2*4)

Dataclasses

from dataclasses import dataclass, field
from typing import List

@dataclass
class Student:
    """Student record with a name, an age and a list of grades."""

    name: str
    age: int
    # default_factory gives every instance its own fresh list.
    grades: List[float] = field(default_factory=list)

    def __post_init__(self):
        """Validate the age right after the generated __init__ has run."""
        if self.age < 0:
            raise ValueError("Age cannot be negative")

    @property
    def gpa(self):
        """Mean of the grades, or 0.0 when there are no grades."""
        if not self.grades:
            return 0.0
        return sum(self.grades) / len(self.grades)

@dataclass(frozen=True)  # Immutable
class Hyperparameters:
    """Immutable bundle of training settings; every field has a default value."""

    learning_rate: float = 0.01
    epochs: int = 100
    batch_size: int = 32
    dropout: float = 0.2

# Auto-generated methods: @dataclass supplies __init__ and __repr__
s = Student("Alice", 20, [3.5, 3.8, 4.0])
print(s)           # Student(name='Alice', age=20, grades=[3.5, 3.8, 4.0])
print(s.gpa)       # 3.766...

# Frozen dataclass: fields cannot be reassigned after construction
hp = Hyperparameters(learning_rate=0.001)
# hp.learning_rate = 0.1  # FrozenInstanceError

Context Managers

import time
from contextlib import contextmanager

class Timer:
    """Context manager that times a with-block using time.perf_counter().

    After the block exits, the duration in seconds is available as
    ``elapsed`` and is also printed.
    """

    def __enter__(self):
        # Record the starting counter value and hand the timer to `as`.
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        finished = time.perf_counter()
        self.elapsed = finished - self.start
        print(f"Elapsed: {self.elapsed:.4f}s")
        # A falsy return value lets any exception from the block propagate.
        return False

# Usage: the timer prints on exit and keeps the duration in t.elapsed
with Timer() as t:
    sum(range(1000000))
print(f"Time: {t.elapsed:.4f}s")

# Context manager as function
@contextmanager
def managed_resource(name, *, suppress_errors=True):
    """Context manager that yields a resource dict and always reports release.

    Args:
        name: Name stored in the resource dict and used in the printed messages.
        suppress_errors: When True (the default, matching the original
            behaviour) an ``Exception`` raised inside the with-block is
            printed and swallowed. Pass False to have it re-raised after the
            resource has been marked inactive.

    Yields:
        A dict ``{"name": name, "active": True}``; ``"active"`` is set to
        False if the with-block raises an ``Exception``.
    """
    print(f"Acquiring {name}")
    resource = {"name": name, "active": True}
    try:
        yield resource
    except Exception as e:
        print(f"Error: {e}")
        resource["active"] = False
        if not suppress_errors:
            # Propagate to the caller instead of hiding the failure.
            raise
    finally:
        print(f"Releasing {name}")

# Usage: prints "Acquiring database", "Using database", then "Releasing database"
with managed_resource("database") as res:
    print(f"Using {res['name']}")

Design Patterns for Data Science

Strategy Pattern
  • Swap algorithms at runtime
  • ML model selection
  • Different preprocessing

    class DataCleaner:
        def __init__(self, strategy):
            self.strategy = strategy

        def clean(self, data):
            return self.strategy(data)

Observer Pattern
  • Event-driven updates
  • Training callbacks
  • Logging/monitoring

    class Callback:
        def on_epoch_end(self, epoch):
            pass

    class Logger(Callback):
        def on_epoch_end(self, ep):
            print(f"Epoch {ep}")

Factory Pattern
  • Create objects dynamically
  • Model factories
  • Pipeline construction

    class ModelFactory:
        @staticmethod
        def create(model_type):
            if model_type == "rf":
                return RandomForest()
            elif model_type == "nn":
                ...  # (the rest of this snippet is cut off in the source)

from abc import ABC, abstractmethod
from typing import List

# Strategy Pattern
class PreprocessingStrategy(ABC):
    """Interface for interchangeable preprocessing algorithms over a list of numbers."""

    @abstractmethod
    def transform(self, data: List[float]) -> List[float]:
        """Return the transformed data; concrete strategies must override this."""

class NormalizeStrategy(PreprocessingStrategy):
    """Min-max scaling: maps the smallest value to 0.0 and the largest to 1.0."""

    def transform(self, data):
        """Rescale data linearly using its own minimum and maximum.

        Args:
            data: Non-empty list of numbers.

        Returns:
            A new list of ``(x - min) / (max - min)``. When every value is
            identical the range is zero, so a list of 0.0 is returned instead
            of dividing by zero.

        Raises:
            ValueError: If data is empty (raised by ``min``).
        """
        min_val, max_val = min(data), max(data)
        value_range = max_val - min_val
        if value_range == 0:
            # Constant input: every value sits at the minimum.
            return [0.0 for _ in data]
        return [(x - min_val) / value_range for x in data]

class StandardizeStrategy(PreprocessingStrategy):
    """Z-score scaling using the population standard deviation."""

    def transform(self, data):
        """Centre data on its mean and divide by its standard deviation.

        Args:
            data: Non-empty list of numbers.

        Returns:
            A new list of ``(x - mean) / std``. When the standard deviation is
            zero (all values identical) a list of 0.0 is returned instead of
            dividing by zero.

        Raises:
            ZeroDivisionError: If data is empty.
        """
        mean = sum(data) / len(data)
        std = (sum((x - mean)**2 for x in data) / len(data)) ** 0.5
        if std == 0:
            # Constant input: every value equals the mean.
            return [0.0 for _ in data]
        return [(x - mean) / std for x in data]

class DataProcessor:
    """Applies a swappable PreprocessingStrategy to data."""

    def __init__(self, strategy: PreprocessingStrategy):
        self._strategy = strategy

    @property
    def strategy(self):
        """The strategy currently used by ``process``."""
        return self._strategy

    @strategy.setter
    def strategy(self, strategy):
        # Public way to swap algorithms at runtime without reaching into
        # the underscore-prefixed attribute.
        self._strategy = strategy

    def process(self, data):
        """Return ``data`` transformed by the current strategy."""
        return self._strategy.transform(data)

# Swap strategies: the same processor yields different output per strategy
processor = DataProcessor(NormalizeStrategy())
processor.process([1, 2, 3, 4, 5])  # [0.0, 0.25, 0.5, 0.75, 1.0]

# Replace the strategy on the existing processor, then process again
processor._strategy = StandardizeStrategy()
processor.process([1, 2, 3, 4, 5])  # [-1.414, -0.707, 0, 0.707, 1.414]

# Observer Pattern
class TrainingCallback:
    """Base observer with no-op hooks; subclasses override the ones they need."""

    def on_epoch_start(self, epoch):
        """Hook taking the epoch; does nothing by default."""
        return None

    def on_epoch_end(self, epoch, loss):
        """Hook taking the epoch and its loss; does nothing by default."""
        return None

    def on_training_end(self, metrics):
        """Hook taking the final metrics; does nothing by default."""
        return None

class EarlyStopping(TrainingCallback):
    """Callback that signals a stop once the loss has not improved for `patience` epochs."""

    def __init__(self, patience=5):
        self.patience = patience
        self.best_loss = float("inf")  # lowest loss seen so far
        self.counter = 0               # consecutive epochs without improvement

    def on_epoch_end(self, epoch, loss):
        """Update the tracker with this epoch's loss.

        Returns:
            True when the non-improving streak has reached ``patience``
            (a message is printed as well), otherwise False.
        """
        improved = loss < self.best_loss
        if improved:
            # New best: remember it and restart the streak.
            self.best_loss = loss
            self.counter = 0
            return False

        self.counter += 1
        if self.counter < self.patience:
            return False

        print(f"Early stopping at epoch {epoch}")
        return True

# Factory Pattern
class ModelFactory:
    """Name-to-class registry that builds model instances on request."""

    # Shared mapping from registered name to the class (or callable) to build.
    _registry = {}

    @classmethod
    def register(cls, name, model_class):
        """Associate ``name`` with ``model_class``, replacing any earlier entry."""
        cls._registry[name] = model_class

    @classmethod
    def create(cls, name, **kwargs):
        """Instantiate the class registered under ``name`` with ``kwargs``.

        Raises:
            ValueError: If nothing has been registered under ``name``.
        """
        registry = cls._registry
        if name in registry:
            builder = registry[name]
            return builder(**kwargs)
        raise ValueError(f"Unknown model: {name}")

# Register models
# DecisionTreeClassifier is not defined in this file, so import it from
# scikit-learn before registering it.
from sklearn.tree import DecisionTreeClassifier

ModelFactory.register("linear", LinearRegression)
ModelFactory.register("tree", DecisionTreeClassifier)

# Create dynamically
# Keyword arguments are forwarded to the registered class's constructor.
# LinearRegression.__init__ takes none, so passing learning_rate here would
# raise TypeError.
model = ModelFactory.create("linear")

Practical Example: ML Pipeline with OOP

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Custom transformer to remove outliers using IQR.

    ``fit`` learns per-column lower/upper bounds; ``transform`` keeps only the
    rows whose every column lies within those bounds, so its output can have
    fewer rows than its input.
    """

    def __init__(self, factor=1.5):
        # Multiplier applied to the IQR when placing the bounds.
        self.factor = factor

    def fit(self, X, y=None):
        """Compute ``lower_`` and ``upper_`` from the quartiles of X; returns self."""
        q1, q3 = np.percentile(X, [25, 75], axis=0)
        margin = self.factor * (q3 - q1)
        self.lower_ = q1 - margin
        self.upper_ = q3 + margin
        return self

    def transform(self, X, y=None):
        """Return the rows of X that are within bounds in every column."""
        not_too_low = X >= self.lower_
        not_too_high = X <= self.upper_
        keep = np.all(not_too_low & not_too_high, axis=1)
        return X[keep]

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Custom transformer for feature engineering.

    Appends products of input columns (with repetition) for every degree from
    2 up to ``poly_degree``. With the default ``poly_degree=2`` the output is
    the original columns followed by ``x_i * x_j`` for all ``i <= j``.
    """

    def __init__(self, poly_degree=2):
        self.poly_degree = poly_degree

    def fit(self, X, y=None):
        """Record the number of input columns; returns self."""
        self.n_features_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        """Return X with the polynomial product columns appended.

        Args:
            X: 2-D array with at least ``n_features_`` columns.
            y: Ignored.

        Returns:
            A new array: the original columns, then for each degree
            ``2..poly_degree`` one column per combination (with replacement)
            of feature indices. ``poly_degree < 2`` adds no columns.
        """
        from itertools import combinations_with_replacement

        X = np.asarray(X)
        # Collect all blocks first and stack once, rather than re-copying
        # the growing array on every added column.
        blocks = [X]
        for degree in range(2, self.poly_degree + 1):
            for combo in combinations_with_replacement(range(self.n_features_), degree):
                blocks.append(np.prod(X[:, list(combo)], axis=1, keepdims=True))
        return np.hstack(blocks)

# Usage in pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# X_train, y_train, X_test and y_test are assumed to be NumPy arrays
# prepared by the reader; they are not defined in this snippet.

# OutlierRemover.transform drops rows from X only. Inside a Pipeline the
# labels would not be filtered to match, so the row filter is applied to
# X and y together before the pipeline is fitted.
remover = OutlierRemover(factor=2.0).fit(X_train)
keep = np.all((X_train >= remover.lower_) & (X_train <= remover.upper_), axis=1)
X_train_clean, y_train_clean = X_train[keep], y_train[keep]

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("feature_engineer", FeatureEngineer(poly_degree=2)),
    ("classifier", LogisticRegression())
])

pipe.fit(X_train_clean, y_train_clean)
pipe.score(X_test, y_test)

Summary

  • Use abstract base classes for interface contracts
  • Apply inheritance for model hierarchies
  • Leverage dataclasses for clean data containers
  • Implement magic methods for intuitive object behavior
  • Use context managers for resource management
  • Apply design patterns (Strategy, Observer, Factory) for scalable ML systems
  • Extend sklearn with custom transformers using OOP

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement