File I/O and Data Import
Efficient file handling and data import are essential for data science workflows. Master these operations for robust data pipelines.
File Operations
# Basic file reading
with open("data.txt", "r", encoding="utf-8") as f:
content = f.read() # entire file as string
lines = f.readlines() # list of lines
# Writing
with open("output.txt", "w", encoding="utf-8") as f:
f.write("Hello, World!\n")
f.writelines(["Line 1\n", "Line 2\n"])
# Appending
with open("log.txt", "a", encoding="utf-8") as f:
f.write("New entry\n")
# Context manager ensures proper cleanup
CSV Operations
import csv
# Reading CSV
with open("data.csv", "r", encoding="utf-8") as f:
reader = csv.reader(f)
header = next(reader)
for row in reader:
print(row) # each row is a list of strings
# DictReader - maps to dictionaries
with open("data.csv", "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
print(row["name"], row["age"])
# Writing CSV
with open("output.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["Name", "Age", "City"])
writer.writerows([["Alice", 30, "NYC"], ["Bob", 25, "LA"]])
# DictWriter
with open("output.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=["Name", "Age"])
writer.writeheader()
writer.writerow({"Name": "Alice", "Age": 30})
JSON Operations
import json
# Reading JSON
with open("data.json", "r", encoding="utf-8") as f:
data = json.load(f) # parses to Python dict/list
# Writing JSON
with open("output.json", "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
# JSON strings
json_str = json.dumps({"name": "Alice", "age": 30}, indent=2)
parsed = json.loads(json_str)
# Handling complex types
def default_serializer(obj):
if hasattr(obj, "isoformat"):
return obj.isoformat()
raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
json.dumps({"date": datetime.now()}, default=default_serializer)
Excel Operations
import pandas as pd
# Reading Excel
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df = pd.read_excel("data.xlsx", sheet_name=None) # all sheets as dict
# Writing Excel
df.to_excel("output.xlsx", index=False)
# With openpyxl for formatting
from openpyxl import Workbook
wb = Workbook()
ws = wb.active
ws.title = "Data"
ws.append(["Name", "Age", "City"])
ws.append(["Alice", 30, "NYC"])
wb.save("formatted.xlsx")
Pandas Read/Write Functions
Common Pandas I/O Patterns
import pandas as pd
# CSV with options
df = pd.read_csv(
"data.csv",
sep=",",
encoding="utf-8",
parse_dates=["date_column"],
dtype={"id": str, "value": float},
na_values=["NA", "N/A", ""],
skiprows=range(1, 5), # skip rows 1-4
nrows=1000, # read only 1000 rows
usecols=["col1", "col2", "col3"]
)
# Write CSV with options
df.to_csv(
"output.csv",
index=False,
float_format="%.4f",
columns=["col1", "col2"]
)
# Large files: chunked reading
chunks = pd.read_csv("huge.csv", chunksize=10000)
result = pd.concat([chunk.groupby("category").sum() for chunk in chunks])
Database Connections
import sqlite3
import pandas as pd
from sqlalchemy import create_engine
# SQLite
conn = sqlite3.connect("database.db")
df = pd.read_sql("SELECT * FROM users WHERE age > 25", conn)
conn.close()
# SQLAlchemy (recommended for production)
engine = create_engine("sqlite:///database.db")
# engine = create_engine("postgresql://user:pass@host:5432/dbname")
# engine = create_engine("mysql://user:pass@host:3306/dbname")
df = pd.read_sql("SELECT * FROM users", engine)
df.to_sql("users_backup", engine, if_exists="replace", index=False)
# Connection pooling
from sqlalchemy.pool import QueuePool
engine = create_engine(
"postgresql://user:pass@host/db",
poolclass=QueuePool,
pool_size=5,
max_overflow=10
)
Pathlib (Modern Path Handling)
from pathlib import Path
# Create path objects
data_dir = Path("data") / "raw"
file_path = data_dir / "dataset.csv"
# File operations
file_path.exists() # True/False
file_path.is_file() # True
file_path.is_dir() # False
file_path.stat().st_size # file size in bytes
file_path.stem # "dataset" (filename without extension)
file_path.suffix # ".csv"
file_path.parent # Path("data/raw")
# Read/write with pathlib
content = file_path.read_text(encoding="utf-8")
file_path.write_text("new content", encoding="utf-8")
# Glob patterns
csv_files = list(Path("data").glob("**/*.csv")) # recursive
txt_files = list(Path(".").glob("*.txt")) # current dir only
# Create directories
Path("output/results").mkdir(parents=True, exist_ok=True)
File Formats Comparison
| Format | Speed | Size | Compression | Use Case |
|---|---|---|---|---|
| CSV | Slow | Large | None | Universal, human-readable |
| Parquet | Fast | Small | Built-in | Columnar analytics |
| Feather | Very Fast | Medium | None | DataFrame I/O |
| HDF5 | Fast | Medium | Built-in | Large numerical arrays |
| JSON | Medium | Large | None | APIs, nested data |
| Excel | Slow | Large | Built-in | Business reports |
Error Handling for File Operations
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
def safe_read_csv(filepath, **kwargs):
"""Robust CSV reader with error handling."""
path = Path(filepath)
if not path.exists():
logger.error(f"File not found: {filepath}")
raise FileNotFoundError(f"File not found: {filepath}")
if path.stat().st_size == 0:
logger.warning(f"File is empty: {filepath}")
return pd.DataFrame()
try:
df = pd.read_csv(filepath, **kwargs)
logger.info(f"Loaded {len(df)} rows from {filepath}")
return df
except pd.errors.ParserError as e:
logger.error(f"Parse error in {filepath}: {e}")
raise
except Exception as e:
logger.error(f"Error reading {filepath}: {e}")
raise
# Usage
df = safe_read_csv("data.csv", encoding="utf-8")
Summary
- Always use context managers (
withstatement) for file I/O - Specify encoding explicitly (prefer
utf-8) - Use pathlib for modern, cross-platform path handling
- Choose Parquet for columnar analytics, CSV for compatibility
- Use chunked reading for files larger than memory
- Implement error handling for production data pipelines