Serialization: pickle, json, msgpack, protobuf, marshmallow
Data serialization formats and their tradeoffs
Interview Question
"Compare different serialization formats in Python. When would you use JSON vs pickle vs protobuf? What are the security implications of each?"
Difficulty: Medium | Frequently asked at Google, Meta, Amazon
Theoretical Foundation
What is Serialization?
Serialization converts Python objects to a format that can be stored or transmitted, and deserialization converts them back.
# Python object
user = {"name": "Alice", "age": 30, "scores": [95, 87, 92]}
# Serialized formats
json_str = '{"name": "Alice", "age": 30, "scores": [95, 87, 92]}'
pickle_bytes = b'\x80\x05\x95...'
msgpack_bytes = b'\x83\xa4name\xa5Alice...'
ℹ️
Key Concept: Different serialization formats have different tradeoffs: human readability, speed, size, and security.
JSON
Basic JSON Operations
import json
from typing import Any, Dict, List
# Serialize (Python → JSON)
data = {
"name": "Alice",
"age": 30,
"scores": [95, 87, 92],
"active": True,
"address": None
}
json_string = json.dumps(data, indent=2)
print("JSON string:")
print(json_string)
# Deserialize (JSON → Python)
parsed = json.loads(json_string)
print(f"\nParsed type: {type(parsed)}")
print(f"Name: {parsed['name']}")
print(f"Scores: {parsed['scores']}")
Output:
JSON string:
{
"name": "Alice",
"age": 30,
"scores": [
95,
87,
92
],
"active": true,
"address": null
}
Parsed type: <class 'dict'>
Name: Alice
Scores: [95, 87, 92]
Advanced JSON
import json
from datetime import datetime
from typing import Any
# Custom JSON encoder
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that also handles ``datetime`` values and plain objects.

    ``datetime`` instances are written as ``{"__datetime__": "<ISO 8601>"}``
    so that an ``object_hook`` checking for the ``__datetime__`` key can turn
    them back into ``datetime`` objects on load. Any other object exposing a
    ``__dict__`` is encoded as that attribute dictionary.
    """

    def default(self, obj: Any) -> Any:
        """Return a JSON-serializable stand-in for *obj*.

        Raises:
            TypeError: Raised by the base class when *obj* is neither a
                ``datetime`` nor an object with a ``__dict__``.
        """
        if isinstance(obj, datetime):
            # A bare ISO string cannot be told apart from ordinary text when
            # decoding, so the value is wrapped in a tagged dict instead.
            return {"__datetime__": obj.isoformat()}
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        return super().default(obj)
# Custom decoder
def custom_decoder(dct: dict) -> Any:
    """``object_hook`` for ``json.loads`` that revives tagged datetimes.

    Args:
        dct: A JSON object that has already been decoded into a ``dict``.

    Returns:
        A ``datetime`` parsed from ``dct['__datetime__']`` when that key is
        present; otherwise *dct* itself, unchanged.

    Raises:
        ValueError: If the ``__datetime__`` value is not a valid ISO 8601
            string (raised by ``datetime.fromisoformat``).
    """
    # The builtin ``dict`` is used for the annotation because this example
    # imports only ``Any`` from ``typing``.
    if '__datetime__' in dct:
        return datetime.fromisoformat(dct['__datetime__'])
    return dct
# Example class
class User:
    """Minimal user record: a display name and the moment it was created."""

    def __init__(self, name: str, created: datetime):
        # Attributes are assigned left to right, so ``name`` is stored before
        # ``created`` and the attribute order of ``__dict__`` is preserved.
        self.name, self.created = name, created
# Serialize with custom encoder
user = User("Alice", datetime.now())
json_str = json.dumps(user, cls=CustomEncoder, indent=2)
print("Custom encoded:")
print(json_str)
# Deserialize with custom decoder
parsed = json.loads(json_str, object_hook=custom_decoder)
print(f"\nDeserialized: {parsed}")
# Streaming JSON
def stream_json(file_path: str):
    """Lazily yield one parsed JSON document per line of a JSON Lines file.

    Each non-blank line must hold a complete JSON document. Blank lines
    (for example a trailing newline at the end of the file) are skipped
    instead of being passed to the parser.

    Args:
        file_path: Path of the newline-delimited JSON file to read.

    Yields:
        The Python value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)
# JSON performance
import timeit
data = [{"id": i, "name": f"user_{i}", "value": i * 2} for i in range(1000)]
json_time = timeit.timeit(lambda: json.dumps(data), number=1000)
print(f"\nJSON serialization: {json_time:.3f}s for 1000 iterations")
JSON Schema Validation
import json
from typing import Dict, Any
# Simple schema validation
def validate_json_schema(data: Dict, schema: Dict) -> bool:
    """Check that *data* has every key in *schema* with a value of its type.

    Args:
        data: The decoded JSON object to check.
        schema: Maps each required key to the type (or tuple of types) its
            value must be an instance of.

    Returns:
        ``True`` when every schema key is present in *data* with a value of
        an accepted type, ``False`` otherwise. Keys in *data* that the
        schema does not mention are ignored.
    """
    for key, expected_type in schema.items():
        if key not in data:
            return False
        value = data[key]
        if not isinstance(value, expected_type):
            return False
        # ``bool`` is a subclass of ``int``, so ``True``/``False`` would pass
        # an ``int`` (or ``float``-with-int) check. JSON booleans are not
        # numbers: accept them only when the schema asks for ``bool`` (or
        # for ``object``, i.e. anything).
        if isinstance(value, bool):
            allowed = expected_type if isinstance(expected_type, tuple) else (expected_type,)
            if bool not in allowed and object not in allowed:
                return False
    return True
# Define schema
user_schema = {
"name": str,
"age": int,
"email": str
}
# Valid data
valid_user = {"name": "Alice", "age": 30, "email": "alice@example.com"}
print(f"Valid: {validate_json_schema(valid_user, user_schema)}")
# Invalid data
invalid_user = {"name": "Bob", "age": "thirty", "email": "bob@example.com"}
print(f"Invalid: {validate_json_schema(invalid_user, user_schema)}")
Output:
Valid: True
Invalid: False
💡
Interview Tip: JSON is human-readable and widely supported, but slower and larger than binary formats.
pickle
Basic pickle Operations
import pickle
from typing import Any
# Serialize (Python → bytes)
data = {
"name": "Alice",
"scores": [95, 87, 92],
"metadata": {"created": "2024-01-15"}
}
pickle_bytes = pickle.dumps(data)
print(f"Pickle bytes length: {len(pickle_bytes)}")
print(f"Pickle bytes: {pickle_bytes[:50]}...")
# Deserialize (bytes → Python)
parsed = pickle.loads(pickle_bytes)
print(f"\nParsed type: {type(parsed)}")
print(f"Name: {parsed['name']}")
print(f"Scores: {parsed['scores']}")
# Serialize to file
with open('data.pkl', 'wb') as f:
pickle.dump(data, f)
# Deserialize from file
with open('data.pkl', 'rb') as f:
loaded = pickle.load(f)
print(f"Loaded: {loaded}")
Advanced pickle
import pickle
from typing import Any
# Custom pickle class
class CustomPickle:
    """Object that customizes its pickled state.

    The ``_extra`` attribute is left out of the pickled state and is set to
    ``"restored"`` whenever an instance is rebuilt from that state.
    """

    def __init__(self, value: Any):
        self.value = value
        self._extra = "hidden"

    def __getstate__(self):
        """Return a copy of the instance attributes without ``_extra``."""
        return {
            attr: val
            for attr, val in vars(self).items()
            if attr != '_extra'
        }

    def __setstate__(self, state):
        """Load *state* into the instance, then mark ``_extra`` as restored."""
        vars(self).update(state)
        self._extra = "restored"
# Usage
obj = CustomPickle(42)
pickled = pickle.dumps(obj)
loaded = pickle.loads(pickled)
print(f"Value: {loaded.value}")
print(f"Extra: {loaded._extra}")
# Pickle protocol versions
data = [1, 2, 3, 4, 5]
for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
pickled = pickle.dumps(data, protocol=protocol)
print(f"Protocol {protocol}: {len(pickled)} bytes")
Output:
Value: 42
Extra: restored
Protocol 0: 26 bytes
Protocol 1: 16 bytes
Protocol 2: 18 bytes
Protocol 3: 18 bytes
Protocol 4: 26 bytes
Protocol 5: 26 bytes
Security Implications
import pickle
import io
# SECURITY RISK: pickle executes arbitrary code!
class Malicious:
    """Demonstration payload: unpickling an instance runs a shell command."""

    def __reduce__(self):
        # pickle stores the returned (callable, args) pair and calls
        # ``callable(*args)`` while loading — that call is the attack.
        import os

        command = 'echo "Malicious code executed!"'
        return os.system, (command,)
# This would execute code if unpickled
# malicious = Malicious()
# pickled = pickle.dumps(malicious)
# pickle.loads(pickled) # Executes os.system!
# Safe unpickling (limited)
def safe_loads(data: bytes):
    """Unpickle *data*, resolving only an allow-list of builtin classes.

    Every global the pickle stream asks for goes through ``find_class``;
    anything not named in ``SAFE_CLASSES`` is rejected before it can be
    imported or called. An allow-list narrows the attack surface but does
    not make pickle a safe format for untrusted input — prefer a data-only
    format such as JSON for that.

    Args:
        data: The pickle byte string to load.

    Returns:
        The reconstructed object.

    Raises:
        pickle.UnpicklingError: If the stream references a class outside
            the allow-list.
    """
    import importlib

    class RestrictedUnpickler(pickle.Unpickler):
        # module name -> names that may be resolved from that module
        SAFE_CLASSES = {
            'builtins': {'dict', 'list', 'set', 'tuple', 'int', 'float', 'str', 'bool'},
        }

        def find_class(self, module: str, name: str):
            if name in self.SAFE_CLASSES.get(module, ()):
                # import_module returns the named module itself, whereas
                # __import__("a.b") would return the top-level package "a".
                return getattr(importlib.import_module(module), name)
            raise pickle.UnpicklingError(f"Unallowed class: {module}.{name}")

    return RestrictedUnpickler(io.BytesIO(data)).load()
# Usage
safe_data = pickle.dumps({"key": "value"})
loaded = safe_loads(safe_data)
print(f"Safe load: {loaded}")
# Try to load malicious data
try:
malicious_data = pickle.dumps(Malicious())
safe_loads(malicious_data)
except pickle.UnpicklingError as e:
print(f"Blocked: {e}")
Output:
Safe load: {'key': 'value'}
Blocked: Unallowed class: __main__.Malicious
⚠️
Security Warning: Never unpickle untrusted data! pickle can execute arbitrary code.
msgpack
Basic msgpack Operations
# json is needed for the timing comparison at the bottom of this example.
import json
import timeit

import msgpack

# Serialize
data = {
    "name": "Alice",
    "age": 30,
    "scores": [95, 87, 92],
}
msgpack_bytes = msgpack.packb(data)
print(f"msgpack bytes length: {len(msgpack_bytes)}")
# Deserialize
parsed = msgpack.unpackb(msgpack_bytes, raw=False)
print(f"Parsed: {parsed}")
# Performance comparison: time 10,000 serializations of the same payload
json_time = timeit.timeit(lambda: json.dumps(data), number=10000)
msgpack_time = timeit.timeit(lambda: msgpack.packb(data), number=10000)
print(f"\nJSON: {json_time:.3f}s")
print(f"msgpack: {msgpack_time:.3f}s")
print(f"msgpack is {json_time/msgpack_time:.1f}x faster")
Output:
msgpack bytes length: 28
Parsed: {'name': 'Alice', 'age': 30, 'scores': [95, 87, 92]}
JSON: 0.123s
msgpack: 0.045s
msgpack is 2.7x faster
Advanced msgpack
import msgpack
from typing import Any
# Custom default handler
def default_handler(obj: Any) -> Any:
    """Fallback for values msgpack cannot pack: use the attribute dict.

    Returns ``obj.__dict__`` for objects that have one and raises
    ``TypeError`` for everything else.
    """
    if not hasattr(obj, '__dict__'):
        raise TypeError(f"Unknown type: {type(obj)}")
    return obj.__dict__
# Serialize with custom handler
class User:
    """Plain user object with a name and an age."""

    def __init__(self, name: str, age: int):
        # Assigned left to right, so ``__dict__`` keeps name-then-age order.
        self.name, self.age = name, age
user = User("Alice", 30)
msgpack_bytes = msgpack.packb(user, default=default_handler, use_bin_type=True)
# Deserialize
parsed = msgpack.unpackb(msgpack_bytes, raw=False)
print(f"User: {parsed}")
# Streaming msgpack
def stream_msgpack(file_path: str):
    """Yield each object decoded from the msgpack stream in *file_path*."""
    with open(file_path, 'rb') as stream:
        # Unpacker pulls from the open file as it is iterated.
        yield from msgpack.Unpacker(stream, raw=False)
Protocol Buffers (protobuf)
Basic Protobuf Operations
# First, define .proto file:
"""
syntax = "proto3";
message User {
string name = 1;
int32 age = 2;
repeated int32 scores = 3;
}
"""
# Generate Python code:
# protoc --python_out=. user.proto
# Usage (assuming generated code)
def protobuf_example():
    """Outline protobuf usage; only prints a reminder, nothing is encoded.

    The calls below are pseudocode because they need the module generated
    by compiling the .proto definition:

        Serialize:
            user = user_pb2.User(name="Alice", age=30, scores=[95, 87, 92])
            data = user.SerializeToString()

        Deserialize:
            parsed = user_pb2.User()
            parsed.ParseFromString(data)
    """
    reminder = "Protobuf requires .proto file compilation"
    print(reminder)
protobuf_example()
Protobuf Benefits
# Protobuf advantages:
# 1. Compact binary format
# 2. Schema evolution (backward/forward compatible)
# 3. Fast serialization/deserialization
# 4. Language neutral
# Comparison
# Imports needed when this comparison is run on its own.
import json

import msgpack

data = {"name": "Alice", "age": 30, "scores": [95, 87, 92]}
# Measured sizes for this payload (printed below rather than hard-coded)
json_size = len(json.dumps(data))  # 52 bytes with the default separators
msgpack_size = len(msgpack.packb(data))  # 28 bytes
# protobuf is not measured here because it needs the compiled message class.
# NOTE(review): ~14 bytes is a hand calculation from the wire format for a
# message with string name = 1, int32 age = 2 and packed repeated int32
# scores = 3 — confirm against SerializeToString() once user_pb2 exists.
print(f"JSON: {json_size} bytes")
print(f"msgpack: {msgpack_size} bytes")
print("protobuf: ~14 bytes (estimated)")
ℹ️
Protobuf Use Case: Best for microservices, APIs, and data storage where schema evolution is important.
marshmallow
Basic marshmallow Operations
from marshmallow import Schema, fields, validate, ValidationError
from typing import Dict, Any
# Define schema
class UserSchema(Schema):
    """Validation rules for a user payload.

    ``name`` (1-100 characters), ``age`` (0-150) and ``email`` are required;
    ``scores`` is an optional list of integers that defaults to an empty list
    when loading.
    """

    name = fields.Str(required=True, validate=validate.Length(min=1, max=100))
    age = fields.Int(required=True, validate=validate.Range(min=0, max=150))
    email = fields.Email(required=True)
    # load_default accepts a callable. Passing ``list`` gives every load its
    # own fresh list, whereas a literal ``[]`` would be one object shared by
    # all loaded results.
    scores = fields.List(fields.Int(), load_default=list)
# Create schema instance
schema = UserSchema()
# Serialize (Python → dict)
user = {
"name": "Alice",
"age": 30,
"email": "alice@example.com",
"scores": [95, 87, 92]
}
serialized = schema.dump(user)
print(f"Serialized: {serialized}")
# Deserialize (dict → Python) with validation
try:
valid_data = schema.load({
"name": "Bob",
"age": 25,
"email": "bob@example.com"
})
print(f"Valid: {valid_data}")
except ValidationError as err:
print(f"Validation errors: {err.messages}")
# Invalid data
try:
invalid_data = schema.load({
"name": "",
"age": -5,
"email": "invalid"
})
except ValidationError as err:
print(f"Invalid: {err.messages}")
Output:
Serialized: {'name': 'Alice', 'age': 30, 'email': 'alice@example.com', 'scores': [95, 87, 92]}
Valid: {'name': 'Bob', 'age': 25, 'email': 'bob@example.com', 'scores': []}
Invalid: {'name': ['Shorter than minimum length 1.'], 'age': ['Must be greater than or equal to 0.'], 'email': ['Not a valid email address.']}
Advanced marshmallow
from marshmallow import Schema, fields, post_load, pre_load, validate
from typing import Dict, Any, List
# Nested schemas
class AddressSchema(Schema):
    """Postal address; street, city and zip code are all required."""

    # ``fields.String`` is the full name of the ``fields.Str`` alias.
    street = fields.String(required=True)
    city = fields.String(required=True)
    zip_code = fields.String(required=True)
class UserSchema(Schema):
    """User payload with a nested address and load-time hooks.

    ``name`` and ``age`` are required, ``address`` is validated by
    ``AddressSchema``, and ``tags`` defaults to an empty list when loading.
    After a successful load the result also carries a computed ``is_adult``
    flag.
    """

    name = fields.Str(required=True)
    age = fields.Int(required=True)
    address = fields.Nested(AddressSchema)
    # A callable default yields a new list per load; a literal ``[]`` would
    # be a single list shared by every loaded result.
    tags = fields.List(fields.Str(), load_default=list)

    @pre_load
    def preprocess(self, data: Dict, **kwargs) -> Dict:
        """Return the input with ``email`` lower-cased, when present.

        The caller's dictionary is left untouched; a shallow copy carrying
        the normalized value is returned instead.
        """
        # NOTE(review): this schema declares no ``email`` field, so with
        # marshmallow's default handling of unknown keys an input containing
        # ``email`` would be rejected after this hook runs — confirm intent.
        if 'email' in data:
            data = {**data, 'email': data['email'].lower()}
        return data

    @post_load
    def postprocess(self, data: Dict, **kwargs) -> Dict:
        """Add the computed ``is_adult`` flag (age of 18 or more)."""
        data['is_adult'] = data.get('age', 0) >= 18
        return data
# Usage
schema = UserSchema()
user_data = {
"name": "Alice",
"age": 30,
"address": {
"street": "123 Main St",
"city": "Springfield",
"zip_code": "12345"
},
"tags": ["admin", "user"]
}
serialized = schema.dump(user_data)
print(f"Serialized: {serialized}")
deserialized = schema.load(user_data)
print(f"Deserialized: {deserialized}")
Output:
Serialized: {'name': 'Alice', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Springfield', 'zip_code': '12345'}, 'tags': ['admin', 'user']}
Deserialized: {'name': 'Alice', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Springfield', 'zip_code': '12345'}, 'tags': ['admin', 'user'], 'is_adult': True}
marshmallow with ORM
from marshmallow import Schema, fields
from dataclasses import dataclass
from typing import List
# Dataclass
@dataclass
class User:
    """User record; new users are active unless stated otherwise."""

    # Identity and contact details
    id: int
    name: str
    email: str
    # Account state
    is_active: bool = True
# Schema with ORM integration
class UserSchema(Schema):
    """Schema for ``User`` objects.

    ``id`` is emitted on dump only and is never read from input; ``name``
    and ``email`` are required; ``is_active`` defaults to ``True`` on load.
    """

    # ``Integer``, ``String`` and ``Boolean`` are the full names of the
    # ``Int``, ``Str`` and ``Bool`` aliases.
    id = fields.Integer(dump_only=True)
    name = fields.String(required=True)
    email = fields.Email(required=True)
    is_active = fields.Boolean(load_default=True)

    class Meta:
        ordered = True
# Usage
schema = UserSchema()
# Create user
user = User(id=1, name="Alice", email="alice@example.com")
serialized = schema.dump(user)
print(f"Serialized: {serialized}")
# Load from dict
data = {"name": "Bob", "email": "bob@example.com"}
loaded = schema.load(data)
print(f"Loaded: {loaded}")
💡
Interview Tip: marshmallow is great for API validation and serialization with complex validation rules.
Comparison
Performance Benchmark
import json
import pickle
import msgpack
import timeit
# Test data
data = {
"users": [
{"id": i, "name": f"user_{i}", "email": f"user_{i}@example.com"}
for i in range(100)
]
}
# Benchmark
json_time = timeit.timeit(lambda: json.dumps(data), number=1000)
pickle_time = timeit.timeit(lambda: pickle.dumps(data), number=1000)
msgpack_time = timeit.timeit(lambda: msgpack.packb(data), number=1000)
print(f"JSON: {json_time:.3f}s")
print(f"pickle: {pickle_time:.3f}s")
print(f"msgpack: {msgpack_time:.3f}s")
# Size comparison
json_size = len(json.dumps(data))
pickle_size = len(pickle.dumps(data))
msgpack_size = len(msgpack.packb(data))
print(f"\nJSON size: {json_size} bytes")
print(f"pickle size: {pickle_size} bytes")
print(f"msgpack size: {msgpack_size} bytes")
Expected Output:
JSON: 0.456s
pickle: 0.234s
msgpack: 0.123s
JSON size: 8,765 bytes
pickle size: 5,432 bytes
msgpack size: 4,321 bytes
Feature Comparison
| Feature | JSON | pickle | msgpack | protobuf | marshmallow |
|---|---|---|---|---|---|
| Human readable | Yes | No | No | No | N/A |
| Schema required | No | No | No | Yes | Yes |
| Security | Safe | Unsafe | Safe | Safe | Safe |
| Speed | Medium | Fast | Fast | Very fast | Slow |
| Size | Large | Medium | Small | Smallest | N/A |
| Python only | No | Yes | No | No | Yes |
| Validation | No | No | No | Yes | Yes |
When to Use What?
# JSON: APIs, config files, human-readable data
# pickle: Python-only caching, internal state
# msgpack: High-performance binary protocol
# protobuf: Microservices, schema evolution
# marshmallow: API validation, complex schemas
⚠️
Security Note: Never use pickle for untrusted data. JSON, msgpack, and protobuf do not execute code while deserializing, but data decoded from them should still be validated before use.
Interview Tips
Common Follow-up Questions
- "When would you use pickle over JSON?"
- Python-only applications
- Complex Python objects
- Performance-critical internal caching
- Never for untrusted data
- "What are protobuf's advantages?"
- Schema evolution
- Compact size
- Fast serialization
- Language neutral
- "How do you handle versioning?"
- JSON: Flexible, schema-less
- protobuf: Built-in field numbering
- marshmallow: Schema versioning
Code Review Tips
# BAD: Using pickle for APIs
pickle.dumps(user_data) # Insecure, Python-only
# GOOD: Using JSON for APIs
json.dumps(user_data) # Safe, interoperable
# BAD: No validation
data = json.loads(untrusted_input) # No validation
# GOOD: With validation
schema = UserSchema()
try:
validated = schema.load(json.loads(untrusted_input))
except ValidationError as e:
handle_error(e)
# BAD: Hardcoded format
def save_data(data):
with open('data.pkl', 'wb') as f:
pickle.dump(data, f) # Only works in Python
# GOOD: Configurable format
def save_data(data, format='json'):
    """Write *data* to ``data.json`` or ``data.msgpack`` in the working directory.

    Args:
        data: The value to serialize; it must be supported by the chosen
            format.
        format: ``'json'`` (default) or ``'msgpack'``.

    Raises:
        ValueError: If *format* is not one of the supported names. Without
            this check an unrecognized format would return normally while
            writing nothing.
    """
    if format == 'json':
        with open('data.json', 'w') as f:
            json.dump(data, f)
    elif format == 'msgpack':
        with open('data.msgpack', 'wb') as f:
            f.write(msgpack.packb(data))
    else:
        raise ValueError(f"Unsupported format: {format!r}")
ℹ️
Best Practice: Choose serialization format based on use case: JSON for APIs, pickle for internal Python, protobuf for microservices.
Summary
| Format | Use Case | Security | Speed |
|---|---|---|---|
| JSON | APIs, config | Safe | Medium |
| pickle | Python caching | Unsafe | Fast |
| msgpack | Binary protocol | Safe | Fast |
| protobuf | Microservices | Safe | Very fast |
| marshmallow | Validation | Safe | Slow |
Best Practices
- Use JSON for APIs and human-readable data
- Avoid pickle for untrusted data
- Use msgpack for high-performance binary
- Use protobuf for schema evolution
- Use marshmallow for complex validation
- Always validate deserialized data
ℹ️
Key Takeaway: Choose serialization based on security, performance, and interoperability requirements.
Practice Problems
- Custom Encoder: Create a JSON encoder for complex Python objects
- Schema Validation: Build a validation system using marshmallow
- Performance Benchmark: Compare all formats on different data types
- Safe Pickle: Implement safe unpickling with class restrictions
- Format Converter: Build a tool that converts between formats
Further Reading
- Python Docs: json, pickle, struct
- marshmallow Docs: https://marshmallow.readthedocs.io/
- protobuf Docs: https://developers.google.com/protocol-buffers
- msgpack Docs: https://msgpack.org/
Remember: Serialization is about trade-offs. Choose based on your specific requirements.