Dataclasses vs Pydantic vs NamedTuple: Performance
Comparing Python's data structure options for modern applications
Interview Question
"Compare dataclasses, Pydantic models, and NamedTuples in Python. What are the performance differences? When would you use each one? How do they handle validation, immutability, and serialization?"
Difficulty: Medium | Frequently asked at Google, Meta, Amazon
Theoretical Foundation
Overview
from dataclasses import dataclass, field
from typing import NamedTuple, Optional
from pydantic import BaseModel, Field, validator
# 1. NamedTuple (Immutable, lightweight)
class PointNT(NamedTuple):
x: float
y: float
label: str = "origin"
# 2. Dataclass (Mutable by default, Pythonic)
@dataclass
class PointDC:
x: float
y: float
label: str = "origin"
# 3. Pydantic Model (Validation, serialization)
class PointPydantic(BaseModel):
x: float
y: float
label: str = "origin"
ℹ️
Key Concept: Each approach serves different use cases: NamedTuple for lightweight immutable data, dataclasses for general-purpose classes, and Pydantic for validated data.
NamedTuple
Features
from typing import NamedTuple, List
class UserNT(NamedTuple):
    """Immutable user record."""

    id: int
    name: str
    email: str
    # NOTE: NamedTuple does not copy defaults, so this one list object is
    # shared by every instance created without an explicit ``tags`` value.
    # Never mutate it in place; pass a fresh list when tags are needed.
    tags: List[str] = []

    def full_name(self) -> str:
        """Return the display name in the form ``User <name>``."""
        return f"User {self.name}"


# Usage
user = UserNT(id=1, name="Alice", email="alice@example.com")
print(f"User: {user}")
print(f"Name: {user.name}")
print(f"Full name: {user.full_name()}")

# Immutability
try:
    user.name = "Bob"  # AttributeError
except AttributeError as e:
    print(f"Error: {e}")

# Tuple operations
print(f"Length: {len(user)}")
# Unpack like a plain tuple: one target per field, in declaration order.
# (``user_id`` rather than ``id`` so the builtin is not shadowed.)
user_id, name, email, tags = user
print(f"Unpack: id={user_id}, name={name}, email={email}")

# Conversion
print(f"Dict: {user._asdict()}")
print(f"List: {list(user)}")
Output:
User: User(id=1, name='Alice', email='alice@example.com', tags=[])
Name: Alice
Full name: User Alice
Error: can't set attribute
Length: 4
Dict: {'id': 1, 'name': 'Alice', 'email': 'alice@example.com', 'tags': []}
List: [1, 'Alice', 'alice@example.com', []]
Advanced NamedTuple
from typing import NamedTuple, ClassVar
class Vector(NamedTuple):
    """Immutable 3D vector with basic arithmetic helpers."""

    x: float
    y: float
    z: float = 0.0

    # Class-level constant. Deliberately left unannotated: NamedTuple treats
    # every annotated name in the class body as a tuple field (ClassVar is not
    # special-cased), whereas plain assignments become ordinary class attributes.
    DIMENSIONS = 3

    def magnitude(self) -> float:
        """Return the Euclidean length of the vector."""
        return (self.x**2 + self.y**2 + self.z**2) ** 0.5

    def normalize(self) -> "Vector":
        """Return a unit-length copy, or ``self`` for the zero vector."""
        mag = self.magnitude()
        if mag == 0:
            # A zero vector has no direction; return it unchanged instead of
            # dividing by zero.
            return self
        return Vector(self.x / mag, self.y / mag, self.z / mag)

    def __add__(self, other: "Vector") -> "Vector":
        """Return the component-wise sum (overrides tuple concatenation)."""
        return Vector(self.x + other.x, self.y + other.y, self.z + other.z)

    def __mul__(self, scalar: float) -> "Vector":
        """Return the vector scaled by ``scalar`` (overrides tuple repetition)."""
        return Vector(self.x * scalar, self.y * scalar, self.z * scalar)
# Usage
v1 = Vector(1, 2, 3)
v2 = Vector(4, 5, 6)
print(f"v1: {v1}")
print(f"Magnitude: {v1.magnitude():.2f}")
print(f"Normalized: {v1.normalize()}")
print(f"Sum: {v1 + v2}")
print(f"Scaled: {v1 * 2}")
print(f"Dimensions: {Vector.DIMENSIONS}")
Output:
v1: Vector(x=1, y=2, z=3)
Magnitude: 3.74
Normalized: Vector(x=0.2672612419124244, y=0.5345224838248488, z=0.8017837257372732)
Sum: Vector(x=5, y=7, z=9)
Scaled: Vector(x=2, y=4, z=6)
Dimensions: 3
Dataclasses
Features
from dataclasses import dataclass, field
from typing import List, ClassVar
@dataclass
class UserDC:
"""Mutable user with dataclass features."""
id: int
name: str
email: str
tags: List[str] = field(default_factory=list)
# Class variable (not a field)
COUNT: ClassVar[int] = 0
def __post_init__(self):
UserDC.COUNT += 1
def full_name(self):
return f"User {self.name}"
# Usage
user = UserDC(id=1, name="Alice", email="alice@example.com")
print(f"User: {user}")
print(f"Name: {user.name}")
print(f"Full name: {user.full_name()}")
# Mutability works
user.name = "Bob"
print(f"Modified: {user}")
# Comparison
user1 = UserDC(id=1, name="Alice", email="alice@example.com")
user2 = UserDC(id=1, name="Alice", email="alice@example.com")
print(f"Equal: {user1 == user2}")
# Instance count
print(f"Total users: {UserDC.COUNT}")
Output:
User: UserDC(id=1, name='Alice', email='alice@example.com', tags=[])
Name: Alice
Full name: User Alice
Modified: UserDC(id=1, name='Bob', email='alice@example.com', tags=[])
Equal: True
Total users: 3
Advanced Dataclass Features
from dataclasses import dataclass, field
from typing import List
from datetime import datetime
@dataclass
class AdvancedUser:
"""Dataclass with advanced features."""
# Basic fields
id: int
name: str
email: str
# Default values
created_at: datetime = field(default_factory=datetime.now)
tags: List[str] = field(default_factory=list)
# Computed field (excluded from __init__)
_login_count: int = field(default=0, init=False, repr=False)
# Validation in __post_init__
def __post_init__(self):
if not self.email or '@' not in self.email:
raise ValueError("Invalid email")
if self.id < 0:
raise ValueError("ID must be non-negative")
# Property
@property
def login_count(self):
return self._login_count
def login(self):
self._login_count += 1
# Method
def to_dict(self):
return {
'id': self.id,
'name': self.name,
'email': self.email,
'created_at': self.created_at.isoformat(),
'tags': self.tags
}
# Usage
user = AdvancedUser(id=1, name="Alice", email="alice@example.com")
print(f"User: {user}")
user.login()
user.login()
print(f"Logins: {user.login_count}")
# As dict
print(f"Dict: {user.to_dict()}")
Output:
User: AdvancedUser(id=1, name='Alice', email='alice@example.com', created_at=datetime.datetime(2024, 1, 15, 10, 30, 0, 123456), tags=[])
Logins: 2
Dict: {'id': 1, 'name': 'Alice', 'email': 'alice@example.com', 'created_at': '2024-01-15T10:30:00.123456', 'tags': []}
Frozen Dataclass (Immutable)
from dataclasses import dataclass, field
@dataclass(frozen=True)
class Point:
"""Immutable point."""
x: float
y: float
def distance_to(self, other):
return ((self.x - other.x)**2 + (self.y - other.y)**2) ** 0.5
# Usage
p1 = Point(1, 2)
p2 = Point(4, 6)
print(f"Distance: {p1.distance_to(p2):.2f}")
# Immutability
try:
p1.x = 10 # FrozenInstanceError
except Exception as e:
print(f"Error: {type(e).__name__}")
# Can be used as dict key
distances = {p1: "start", p2: "end"}
print(f"Distances: {distances}")
Output:
Distance: 5.00
Error: FrozenInstanceError
Distances: {Point(x=1, y=2): 'start', Point(x=4, y=6): 'end'}
💡
Interview Tip: Use frozen=True for dataclasses that need to be hashable (dict keys, sets).
Pydantic Models
Features
from pydantic import BaseModel, Field, validator
from typing import List, Optional
from datetime import datetime
from email_validator import validate_email
class UserPydantic(BaseModel):
"""Pydantic model with validation."""
id: int
name: str = Field(..., min_length=1, max_length=100)
email: str
age: Optional[int] = Field(None, ge=0, le=150)
tags: List[str] = []
created_at: datetime = Field(default_factory=datetime.now)
@validator('email')
def validate_email(cls, v):
try:
validate_email(v)
return v.lower()
except Exception as e:
raise ValueError(f"Invalid email: {e}")
@validator('name')
def validate_name(cls, v):
return v.strip().title()
class Config:
# Enable ORM mode for SQLAlchemy compatibility
orm_mode = True
# Usage
user = UserPydantic(id=1, name=" alice ", email="alice@EXAMPLE.com", age=30)
print(f"User: {user}")
print(f"Name: {user.name}")
print(f"Email: {user.email}")
# Validation errors
try:
bad_user = UserPydantic(id=1, name="", email="invalid")
except Exception as e:
print(f"Validation error: {e}")
Output:
User: id=1 name='Alice' email='alice@example.com' age=30 tags=[] created_at=datetime.datetime(2024, 1, 15, 10, 30, 0, 123456)
Name: Alice
Email: alice@example.com
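Validation error: 2 validation errors for UserPydantic
name
ensure this value has at least 1 characters (type=value_error.min_length; limit_value=1)
email
Invalid email: The email address is not valid. It must have exactly one @-sign. (type=value_error)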
Advanced Pydantic Features
from pydantic import BaseModel, Field, validator, root_validator
from typing import List, Optional, Dict, Any
from datetime import datetime
from enum import Enum
class UserRole(str, Enum):
ADMIN = "admin"
USER = "user"
GUEST = "guest"
class AdvancedUserPydantic(BaseModel):
"""Advanced Pydantic model."""
id: int
username: str = Field(..., min_length=3, max_length=50)
email: str
password: str = Field(..., min_length=8)
role: UserRole = UserRole.USER
profile: Dict[str, Any] = {}
tags: List[str] = []
# Computed field
@validator('username')
def validate_username(cls, v):
if not v.isalnum():
raise ValueError("Username must be alphanumeric")
return v.lower()
@root_validator
def check_admin_permissions(cls, values):
if values.get('role') == UserRole.ADMIN:
if 'admin' not in values.get('tags', []):
raise ValueError("Admin must have 'admin' tag")
return values
# Serialization
def to_dict(self):
return self.dict()
def to_json(self):
return self.json()
class Config:
use_enum_values = True
# Usage
user = AdvancedUserPydantic(
id=1,
username="alice123",
email="alice@example.com",
password="securepassword123",
role=UserRole.ADMIN,
tags=["admin", "user"]
)
print(f"User: {user}")
print(f"Dict: {user.to_dict()}")
# JSON serialization
print(f"JSON: {user.to_json()[:100]}...")
Output:
User: id=1 username='alice123' email='alice@example.com' password='securepassword123' role='admin' profile={} tags=['admin', 'user']
Dict: {'id': 1, 'username': 'alice123', 'email': 'alice@example.com', 'password': 'securepassword123', 'role': 'admin', 'profile': {}, 'tags': ['admin', 'user']}
JSON: {"id": 1, "username": "alice123", "email": "alice@example.com", "password": "securepassword123", "ro...
ℹ️
Pydantic Advantage: Built-in validation, serialization, and JSON schema generation make Pydantic ideal for APIs.
Performance Comparison
Memory Usage
import sys
from dataclasses import dataclass
from typing import NamedTuple
class PointNT(NamedTuple):
    """3D point stored as a tuple (no per-instance ``__dict__``)."""

    x: float
    y: float
    z: float


@dataclass
class PointDC:
    """3D point stored as a regular object (fields live in ``__dict__``)."""

    x: float
    y: float
    z: float


# Pydantic not included due to installation requirement
# But would be: PointPydantic(BaseModel) with same fields

# Create instances
point_nt = PointNT(1.0, 2.0, 3.0)
point_dc = PointDC(1.0, 2.0, 3.0)

# Memory comparison
# sys.getsizeof is shallow: it reports only the container object itself.
print(f"NamedTuple: {sys.getsizeof(point_nt)} bytes")
print(f"Dataclass: {sys.getsizeof(point_dc)} bytes")

# Total memory with overhead
# For the tuple, add the size of each stored value. For the dataclass, also
# add the instance __dict__ that holds the fields; leaving it out would make
# the dataclass look smaller than it really is. These are approximations:
# exact numbers depend on the interpreter version.
nt_total = sys.getsizeof(point_nt) + sum(sys.getsizeof(v) for v in point_nt)
dc_fields = vars(point_dc)
dc_total = (
    sys.getsizeof(point_dc)
    + sys.getsizeof(dc_fields)
    + sum(sys.getsizeof(v) for v in dc_fields.values())
)
print(f"\nNamedTuple total: {nt_total} bytes")
print(f"Dataclass total: {dc_total} bytes")
Creation Time
import timeit
from dataclasses import dataclass
from typing import NamedTuple
class PointNT(NamedTuple):
x: float
y: float
z: float
@dataclass
class PointDC:
x: float
y: float
z: float
# Benchmark creation
nt_time = timeit.timeit(lambda: PointNT(1.0, 2.0, 3.0), number=1000000)
dc_time = timeit.timeit(lambda: PointDC(1.0, 2.0, 3.0), number=1000000)
print(f"NamedTuple creation: {nt_time:.3f}s for 1M instances")
print(f"Dataclass creation: {dc_time:.3f}s for 1M instances")
print(f"NamedTuple is {dc_time/nt_time:.1f}x faster")
Expected Output:
NamedTuple creation: 0.123s for 1M instances
Dataclass creation: 0.234s for 1M instances
NamedTuple is 1.9x faster
Access Time
import timeit
from dataclasses import dataclass
from typing import NamedTuple
class PointNT(NamedTuple):
x: float
y: float
z: float
@dataclass
class PointDC:
x: float
y: float
z: float
point_nt = PointNT(1.0, 2.0, 3.0)
point_dc = PointDC(1.0, 2.0, 3.0)
# Benchmark access
nt_time = timeit.timeit(lambda: point_nt.x, number=1000000)
dc_time = timeit.timeit(lambda: point_dc.x, number=1000000)
print(f"NamedTuple access: {nt_time:.3f}s for 1M accesses")
print(f"Dataclass access: {dc_time:.3f}s for 1M accesses")
print(f"NamedTuple is {dc_time/nt_time:.1f}x faster")
Expected Output:
NamedTuple access: 0.045s for 1M accesses
Dataclass access: 0.067s for 1M accesses
NamedTuple is 1.5x faster
⚠️
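Performance Note: NamedTuple is usually the lightest option in memory, but relative creation and attribute-access speed varies between Python versions, so treat the numbers above as illustrative and benchmark on your target interpreter. NamedTuple also lacks mutability and validation.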
When to Use What?
Decision Matrix
| Feature | NamedTuple | Dataclass | Pydantic |
|---|---|---|---|
| Immutability | Yes | No (unless frozen) | No (unless configured as frozen) |
| Validation | No | Manual | Built-in |
| Serialization | Manual | Manual | Built-in |
| Memory | Lowest | Medium | Highest |
| Creation Speed | Fastest | Fast | Slowest |
| Use Case | Lightweight tuples | General-purpose | API data |
| JSON Schema | No | No | Yes |
Use Cases
# 1. NamedTuple: Lightweight, immutable records
from typing import NamedTuple
class Color(NamedTuple):
r: int
g: int
b: int
# Use for: coordinates, RGB values, database rows (read-only)
# 2. Dataclass: General-purpose classes
from dataclasses import dataclass
@dataclass
class User:
id: int
name: str
email: str
# Use for: domain objects, value objects, configuration
# 3. Pydantic: Validated data, APIs
from pydantic import BaseModel
class APIRequest(BaseModel):
query: str
limit: int = 10
offset: int = 0
# Use for: API payloads, configuration files, data validation
💡
Interview Tip: Explain the tradeoffs: NamedTuple for performance, dataclass for flexibility, Pydantic for validation.
Advanced Patterns
Mixing Approaches
from dataclasses import dataclass, field
from typing import NamedTuple, List
# NamedTuple for immutable data
class Coordinates(NamedTuple):
lat: float
lon: float
@dataclass
class Location:
"""Dataclass using NamedTuple."""
name: str
coordinates: Coordinates
@property
def lat(self):
return self.coordinates.lat
@property
def lon(self):
return self.coordinates.lon
# Usage
loc = Location("Office", Coordinates(40.7128, -74.0060))
print(f"Location: {loc.name}")
print(f"Lat: {loc.lat}, Lon: {loc.lon}")
Pydantic with Dataclass
from pydantic import BaseModel, Field, validator
from dataclasses import dataclass
from typing import List
# Pydantic model for validation
class UserInput(BaseModel):
name: str = Field(..., min_length=1)
email: str
age: int = Field(..., ge=0)
# Dataclass for internal representation
@dataclass
class UserInternal:
id: int
name: str
email: str
age: int
@classmethod
def from_input(cls, user_input: UserInput, id: int):
return cls(
id=id,
name=user_input.name,
email=user_input.email.lower(),
age=user_input.age
)
# Usage
user_input = UserInput(name="Alice", email="alice@EXAMPLE.com", age=30)
user_internal = UserInternal.from_input(user_input, id=1)
print(f"Internal user: {user_internal}")
Custom Serialization
from dataclasses import dataclass, asdict
from typing import Any, Dict, Optional
import json


@dataclass
class SerializableUser:
    """User record that round-trips through plain dicts and JSON strings."""

    id: int
    name: str
    email: str
    # Optional because the default is None (meaning "no metadata supplied").
    metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict (nested containers are copied)."""
        return asdict(self)

    def to_json(self) -> str:
        """Return the fields as a JSON document indented by two spaces."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "SerializableUser":
        """Build an instance from a dict whose keys match the field names.

        Raises:
            TypeError: if ``data`` has unknown keys or lacks a required field.
        """
        return cls(**data)

    @classmethod
    def from_json(cls, json_str: str) -> "SerializableUser":
        """Build an instance from a JSON string produced by ``to_json``."""
        return cls.from_dict(json.loads(json_str))
# Usage
user = SerializableUser(id=1, name="Alice", email="alice@example.com", metadata={"role": "admin"})
json_str = user.to_json()
print(f"JSON:\n{json_str}")
# Deserialize
user2 = SerializableUser.from_json(json_str)
print(f"\nDeserialized: {user2}")
print(f"Equal: {user == user2}")
Output:
JSON:
{
"id": 1,
"name": "Alice",
"email": "alice@example.com",
"metadata": {
"role": "admin"
}
}
Deserialized: SerializableUser(id=1, name='Alice', email='alice@example.com', metadata={'role': 'admin'})
Equal: True
Interview Tips
Common Follow-up Questions
- "What's the difference between `__init__` and `__post_init__`?"
  - `__init__` is auto-generated by dataclass
  - `__post_init__` is called after `__init__` for validation/setup
  - Use `__post_init__` for computed fields and validation
- "How do you make a dataclass immutable?"
  - Use `@dataclass(frozen=True)`
  - Or use `__slots__` with `frozen=True`
  - Or use NamedTuple
- "What's the performance difference?"
  - NamedTuple: Fastest, lowest memory
  - Dataclass: Fast, moderate memory
  - Pydantic: Slowest, highest memory (but has validation)
Code Review Tips
# BAD: Mutable default (dataclass rejects this as soon as the class is defined)
@dataclass
class BadUser:
tags: List[str] = [] # ValueError: mutable default is not allowed, use default_factory
# GOOD: Factory default
@dataclass
class GoodUser:
tags: List[str] = field(default_factory=list)
# BAD: No validation
@dataclass
class BadEmail:
email: str # No validation
# GOOD: Validation in __post_init__
@dataclass
class GoodEmail:
email: str
def __post_init__(self):
if '@' not in self.email:
raise ValueError("Invalid email")
# BAD: Pydantic for simple data
from pydantic import BaseModel
class SimplePoint(BaseModel): # Overkill!
x: float
y: float
# GOOD: NamedTuple for simple data
class Point(NamedTuple):
x: float
y: float
⚠️
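Common Mistake: Using mutable defaults. Dataclasses reject list, dict, and set defaults with a ValueError at class definition time, but NamedTuple fields and plain function arguments accept them silently, which causes shared-state bugs. Use field(default_factory=...) in dataclasses.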
Summary
| Approach | Speed | Memory | Validation | Immutability | Use Case |
|---|---|---|---|---|---|
| NamedTuple | Fastest | Lowest | No | Yes | Lightweight tuples |
| Dataclass | Fast | Medium | Manual | Optional | General-purpose |
| Pydantic | Slowest | Highest | Built-in | Optional (frozen config) | APIs, validation |
Best Practices
- Use NamedTuple for simple, immutable records
- Use Dataclass for general-purpose classes
- Use Pydantic for API payloads and validation
- Avoid mutable defaults in dataclasses
- Use `__post_init__` for validation and computed fields
- Use `frozen=True` for hashable dataclasses
ℹ️
Key Takeaway: Choose based on your needs: performance (NamedTuple), flexibility (dataclass), or validation (Pydantic).
Practice Problems
- Performance Benchmark: Create a benchmark comparing all three approaches
- Custom Dataclass: Implement a dataclass with custom serialization
- Pydantic Schema: Generate JSON schema from Pydantic model
- Immutable Dataclass: Create a frozen dataclass with computed properties
- Mixed Approach: Design a system using all three approaches appropriately
Further Reading
- PEP 557: Data Classes
- PEP 526: Syntax for Variable Annotations (the basis of the class-based NamedTuple syntax)
- Pydantic Docs: https://docs.pydantic.dev/
- Python Docs: `dataclasses` module
Remember: Each approach has its strengths. Choose based on your specific requirements for performance, validation, and immutability.