CW

File I/O and Data Import

Module 1: Introduction & Python BasicsFree Lesson

Advertisement

File I/O and Data Import

Efficient file handling and data import are essential for data science workflows. Master these operations for robust data pipelines.

File Operations

Read Modes"r" Read (default)"rb" Read binary"r+" Read + write"rt" Read textWrite Modes"w" Write (truncate)"wb" Write binary"w+" Write + read"a" AppendBest Practicesรขล“โ€œ Always use with()รขล“โ€œ Specify encodingรขล“โ€œ Handle exceptionsรขล“โ€œ Close resources
# Basic file reading
with open("data.txt", "r", encoding="utf-8") as f:
    content = f.read()          # entire file as string
    lines = f.readlines()       # list of lines

# Writing
with open("output.txt", "w", encoding="utf-8") as f:
    f.write("Hello, World!\n")
    f.writelines(["Line 1\n", "Line 2\n"])

# Appending
with open("log.txt", "a", encoding="utf-8") as f:
    f.write("New entry\n")

# Context manager ensures proper cleanup

CSV Operations

import csv

# Reading CSV
with open("data.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        print(row)  # each row is a list of strings

# DictReader - maps to dictionaries
with open("data.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row["name"], row["age"])

# Writing CSV
with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Name", "Age", "City"])
    writer.writerows([["Alice", 30, "NYC"], ["Bob", 25, "LA"]])

# DictWriter
with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["Name", "Age"])
    writer.writeheader()
    writer.writerow({"Name": "Alice", "Age": 30})

JSON Operations

import json

# Reading JSON
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)  # parses to Python dict/list

# Writing JSON
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

# JSON strings
json_str = json.dumps({"name": "Alice", "age": 30}, indent=2)
parsed = json.loads(json_str)

# Handling complex types
def default_serializer(obj):
    if hasattr(obj, "isoformat"):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

json.dumps({"date": datetime.now()}, default=default_serializer)

Excel Operations

import pandas as pd

# Reading Excel
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
df = pd.read_excel("data.xlsx", sheet_name=None)  # all sheets as dict

# Writing Excel
df.to_excel("output.xlsx", index=False)

# With openpyxl for formatting
from openpyxl import Workbook
wb = Workbook()
ws = wb.active
ws.title = "Data"
ws.append(["Name", "Age", "City"])
ws.append(["Alice", 30, "NYC"])
wb.save("formatted.xlsx")

Pandas Read/Write Functions

Read Functionspd.read_csv()CSV files, most flexiblepd.read_excel()Excel .xlsx/.xls filespd.read_json()JSON files and stringspd.read_parquet()Columnar format, fastpd.read_sql()SQL database queriespd.read_html()HTML tablespd.read_feather()Feather format (fast I/O)pd.read_hdf()HDF5 storesWrite Functionsdf.to_csv()CSV with optionsdf.to_excel()Excel with formattingdf.to_json()JSON formatdf.to_parquet()Columnar storagedf.to_sql()Write to databasedf.to_html()HTML table outputdf.to_feather()Fast binary formatdf.to_hdf()HDF5 store

Common Pandas I/O Patterns

import pandas as pd

# CSV with options
df = pd.read_csv(
    "data.csv",
    sep=",",
    encoding="utf-8",
    parse_dates=["date_column"],
    dtype={"id": str, "value": float},
    na_values=["NA", "N/A", ""],
    skiprows=range(1, 5),  # skip rows 1-4
    nrows=1000,            # read only 1000 rows
    usecols=["col1", "col2", "col3"]
)

# Write CSV with options
df.to_csv(
    "output.csv",
    index=False,
    float_format="%.4f",
    columns=["col1", "col2"]
)

# Large files: chunked reading
chunks = pd.read_csv("huge.csv", chunksize=10000)
result = pd.concat([chunk.groupby("category").sum() for chunk in chunks])

Database Connections

import sqlite3
import pandas as pd
from sqlalchemy import create_engine

# SQLite
conn = sqlite3.connect("database.db")
df = pd.read_sql("SELECT * FROM users WHERE age > 25", conn)
conn.close()

# SQLAlchemy (recommended for production)
engine = create_engine("sqlite:///database.db")
# engine = create_engine("postgresql://user:pass@host:5432/dbname")
# engine = create_engine("mysql://user:pass@host:3306/dbname")

df = pd.read_sql("SELECT * FROM users", engine)
df.to_sql("users_backup", engine, if_exists="replace", index=False)

# Connection pooling
from sqlalchemy.pool import QueuePool
engine = create_engine(
    "postgresql://user:pass@host/db",
    poolclass=QueuePool,
    pool_size=5,
    max_overflow=10
)

Pathlib (Modern Path Handling)

from pathlib import Path

# Create path objects
data_dir = Path("data") / "raw"
file_path = data_dir / "dataset.csv"

# File operations
file_path.exists()          # True/False
file_path.is_file()         # True
file_path.is_dir()          # False
file_path.stat().st_size    # file size in bytes
file_path.stem              # "dataset" (filename without extension)
file_path.suffix            # ".csv"
file_path.parent            # Path("data/raw")

# Read/write with pathlib
content = file_path.read_text(encoding="utf-8")
file_path.write_text("new content", encoding="utf-8")

# Glob patterns
csv_files = list(Path("data").glob("**/*.csv"))  # recursive
txt_files = list(Path(".").glob("*.txt"))         # current dir only

# Create directories
Path("output/results").mkdir(parents=True, exist_ok=True)

File Formats Comparison

FormatSpeedSizeCompressionUse Case
CSVSlowLargeNoneUniversal, human-readable
ParquetFastSmallBuilt-inColumnar analytics
FeatherVery FastMediumNoneDataFrame I/O
HDF5FastMediumBuilt-inLarge numerical arrays
JSONMediumLargeNoneAPIs, nested data
ExcelSlowLargeBuilt-inBusiness reports

Error Handling for File Operations

from pathlib import Path
import logging

logger = logging.getLogger(__name__)

def safe_read_csv(filepath, **kwargs):
    """Robust CSV reader with error handling."""
    path = Path(filepath)

    if not path.exists():
        logger.error(f"File not found: {filepath}")
        raise FileNotFoundError(f"File not found: {filepath}")

    if path.stat().st_size == 0:
        logger.warning(f"File is empty: {filepath}")
        return pd.DataFrame()

    try:
        df = pd.read_csv(filepath, **kwargs)
        logger.info(f"Loaded {len(df)} rows from {filepath}")
        return df
    except pd.errors.ParserError as e:
        logger.error(f"Parse error in {filepath}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error reading {filepath}: {e}")
        raise

# Usage
df = safe_read_csv("data.csv", encoding="utf-8")

Summary

  • Always use context managers (with statement) for file I/O
  • Specify encoding explicitly (prefer utf-8)
  • Use pathlib for modern, cross-platform path handling
  • Choose Parquet for columnar analytics, CSV for compatibility
  • Use chunked reading for files larger than memory
  • Implement error handling for production data pipelines

Advertisement

Need Expert Data Science Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement