Pandas — Complete Data Manipulation Guide

Free Lesson

Advertisement

Pandas: Python's Data Manipulation Powerhouse

Pandas provides two main data structures built on top of NumPy:

  • Series: 1D labeled array
  • DataFrame: 2D labeled table (think spreadsheet or SQL table)

It is the most-used tool for data cleaning, transformation, and analysis.


Series

import pandas as pd
import numpy as np

# Create Series
s1 = pd.Series([10, 20, 30, 40, 50])             # default integer index
s2 = pd.Series([10, 20, 30], index=['a','b','c']) # custom index
s3 = pd.Series({'mon': 100, 'tue': 200, 'wed': 150})  # from dict

print(s1)
# 0    10
# 1    20
# ...

# Access
print(s2['b'])          # 20 (label-based)
print(s1[2])            # 30 (positional)
print(s2[['a','c']])    # multiple labels

# Vectorized operations
print(s1 * 2)           # [20, 40, 60, 80, 100]
print(s1 + s2)          # aligns on index! NaN for mismatches

# Attributes
print(s1.values)        # numpy array
print(s1.index)         # RangeIndex
print(s1.dtype)         # int64
print(s1.shape)         # (5,)
print(s1.name)          # None (can be set)

# Statistical methods
print(s1.mean(), s1.std(), s1.min(), s1.max())
print(s1.describe())
print(s1.value_counts())

DataFrame Creation

# From dict of lists
df = pd.DataFrame({
    'name':    ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
    'age':     [28, 32, 25, 35, 29],
    'salary':  [75000, 85000, 65000, 95000, 70000],
    'dept':    ['DS', 'Eng', 'DS', 'Mgt', 'Eng'],
    'score':   [92, 87, 95, 78, 88],
})

# From list of dicts (JSON-like)
records = [
    {'id': 1, 'value': 42.5, 'label': 'A'},
    {'id': 2, 'value': 37.2, 'label': 'B'},
    {'id': 3, 'value': 55.1, 'label': 'A'},
]
df2 = pd.DataFrame(records)

# From NumPy array
arr = np.random.randn(100, 4)
df3 = pd.DataFrame(arr, columns=['feat_1', 'feat_2', 'feat_3', 'feat_4'])

# From CSV / Excel / SQL
df_csv   = pd.read_csv('data.csv')
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# df_sql = pd.read_sql('SELECT * FROM table', connection)

print(df.shape)         # (5, 5)
print(df.dtypes)
print(df.info())        # non-null counts, dtypes, memory
print(df.describe())    # statistical summary of numeric cols
print(df.head(3))       # first 3 rows
print(df.tail(2))       # last 2 rows

Selection and Filtering — The Most Important Section

# ── Column selection ───────────────────────────────────────
print(df['name'])               # Series (single column)
print(df[['name', 'salary']])   # DataFrame (multiple columns)
print(df.name)                  # dot notation — works if no spaces

# ── Row selection ──────────────────────────────────────────

# loc — label-based (INCLUSIVE of stop)
print(df.loc[0])                # row by label
print(df.loc[0:2])              # rows 0, 1, 2 (INCLUSIVE)
print(df.loc[0, 'name'])        # single value
print(df.loc[0:2, ['name','salary']])   # rows 0-2, specific cols

# Set custom index
df_idx = df.set_index('name')
print(df_idx.loc['Alice'])      # row for Alice
print(df_idx.loc['Alice', 'salary'])   # Alice's salary

# iloc — integer position-based (EXCLUSIVE of stop, like Python slicing)
print(df.iloc[0])               # first row
print(df.iloc[0:2])             # rows 0, 1 (NOT 2)
print(df.iloc[0, 2])            # row 0, col 2
print(df.iloc[0:3, [0,2]])      # rows 0-2, cols 0 and 2

# ── Boolean filtering ──────────────────────────────────────
# This is the CORE of data analysis filtering

print(df[df['salary'] >= 80000])           # high earners
print(df[df['dept'] == 'DS'])              # data scientists
print(df[(df['dept'] == 'DS') & (df['score'] >= 90)])  # DS with high score
print(df[(df['salary'] < 70000) | (df['score'] > 90)]) # cheap OR smart

# .query() — SQL-like string syntax
print(df.query("dept == 'DS' and score >= 90"))
print(df.query("salary > @threshold", threshold=80000))  # use Python variable

# isin — filter by list of values
print(df[df['dept'].isin(['DS', 'Eng'])])

# between — filter numeric range
print(df[df['salary'].between(70000, 90000)])

# str accessor for string filtering
print(df[df['name'].str.startswith('A')])
print(df[df['name'].str.contains('a', case=False)])
print(df[df['name'].str.len() > 4])

Data Cleaning

# Missing values
df_messy = pd.DataFrame({
    'A': [1, 2, np.nan, 4, np.nan],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': ['x', 'y', 'z', None, 'w']
})

print(df_messy.isnull())             # boolean mask of NaN
print(df_messy.isnull().sum())       # count NaN per column
print(df_messy.isnull().sum(axis=1)) # count NaN per row
print(f"Total missing: {df_messy.isnull().sum().sum()}")
print(f"% missing: {df_messy.isnull().mean() * 100}")

# Drop missing
df_messy.dropna()                    # drop any row with NaN
df_messy.dropna(axis=1)              # drop columns with NaN
df_messy.dropna(subset=['A'])        # drop rows where A is NaN
df_messy.dropna(thresh=2)            # keep rows with at least 2 non-NaN

# Fill missing
df_messy.fillna(0)                   # fill with constant
df_messy['A'].fillna(df_messy['A'].mean())   # fill with mean
df_messy.fillna(method='ffill')      # forward fill
df_messy.fillna(method='bfill')      # backward fill
df_messy['A'].interpolate()          # linear interpolation

# Duplicates
df.duplicated()                      # boolean: is row a duplicate?
df.duplicated(subset=['name'])       # duplicates on specific cols
df.drop_duplicates()                 # remove duplicate rows
df.drop_duplicates(subset=['dept'], keep='first')  # keep first per dept

# Data type conversion
df['age'] = df['age'].astype(float)
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')  # NaN for failures
df['dept'] = df['dept'].astype('category')   # memory-efficient for categoricals

# Rename columns
df.rename(columns={'name': 'full_name', 'score': 'performance_score'}, inplace=True)
df.columns = [col.lower().replace(' ', '_') for col in df.columns]  # clean all names

# Drop columns/rows
df.drop(columns=['score'])
df.drop(index=[0, 2])

Adding and Transforming Columns

# New column from expression
df['annual_salary'] = df['salary'] * 12
df['salary_grade'] = df['salary'].apply(lambda x: 'High' if x > 80000 else 'Standard')

# pd.cut — bin continuous into categories
df['age_group'] = pd.cut(df['age'],
                           bins=[0, 25, 35, 100],
                           labels=['<25', '25-35', '35+'])

# pd.qcut — quantile-based binning (equal frequency)
df['salary_quartile'] = pd.qcut(df['salary'], q=4, labels=['Q1','Q2','Q3','Q4'])

# map — replace values using a dict
dept_full = {'DS': 'Data Science', 'Eng': 'Engineering', 'Mgt': 'Management'}
df['dept_full'] = df['dept'].map(dept_full)

# apply — apply a function to each element, row, or column
df['name_len'] = df['name'].apply(len)

# apply row-wise (axis=1)
def compute_bonus(row):
    return row['salary'] * (0.10 if row['score'] >= 90 else 0.05)
df['bonus'] = df.apply(compute_bonus, axis=1)

# transform — apply function but keep same shape (useful in groupby)
df['salary_zscore'] = df['salary'].transform(
    lambda x: (x - x.mean()) / x.std()
)

GroupBy — Split-Apply-Combine

# Basic groupby
grouped = df.groupby('dept')

# Single aggregation
print(grouped['salary'].mean())     # mean salary per dept
print(grouped['salary'].median())
print(grouped.size())               # count per group

# Multiple aggregations with agg()
dept_stats = grouped['salary'].agg(['mean', 'std', 'min', 'max', 'count'])
print(dept_stats)

# Multiple columns, multiple functions
multi_stats = grouped.agg({
    'salary': ['mean', 'std'],
    'score':  ['mean', 'max'],
    'age':    'median'
})
print(multi_stats)

# Custom aggregation functions
def salary_range(series):
    return series.max() - series.min()

print(grouped['salary'].agg(salary_range))

# Transform — return same-length output
df['dept_avg_salary'] = df.groupby('dept')['salary'].transform('mean')
df['salary_vs_dept'] = df['salary'] - df['dept_avg_salary']

# filter — keep groups satisfying a condition
big_depts = df.groupby('dept').filter(lambda g: len(g) >= 2)

# Multiple groupby keys
df.groupby(['dept', 'age_group'])['salary'].mean()

Merging and Joining

employees = pd.DataFrame({
    'emp_id': [1, 2, 3, 4, 5],
    'name':   ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
    'dept_id': [10, 20, 10, 30, 20]
})

departments = pd.DataFrame({
    'dept_id':   [10, 20, 30, 40],
    'dept_name': ['Data Science', 'Engineering', 'Management', 'Marketing']
})

salaries = pd.DataFrame({
    'emp_id': [1, 2, 3, 4, 6],   # note: emp 5 missing, emp 6 extra
    'salary': [75000, 85000, 65000, 95000, 72000]
})

# merge (like SQL JOIN)
inner   = pd.merge(employees, salaries, on='emp_id', how='inner')   # rows in both
left    = pd.merge(employees, salaries, on='emp_id', how='left')    # all employees
right   = pd.merge(employees, salaries, on='emp_id', how='right')   # all salary records
outer   = pd.merge(employees, salaries, on='emp_id', how='outer')   # all rows

# Different column names
result  = pd.merge(employees, departments,
                   left_on='dept_id', right_on='dept_id', how='left')

# Multiple keys
pd.merge(df1, df2, on=['key1', 'key2'])

# concat — stack DataFrames vertically
all_data = pd.concat([df1, df2, df3], ignore_index=True)
all_data = pd.concat([df1, df2], axis=0)   # vertical
features = pd.concat([features1, features2], axis=1)  # horizontal (join by index)

Reshaping Data

# pivot_table — Excel-like pivot
pivot = df.pivot_table(
    values='salary',
    index='dept',
    columns='age_group',
    aggfunc='mean',
    fill_value=0,
    margins=True           # add row/column totals
)

# melt — wide to long format (unpivot)
wide = pd.DataFrame({
    'name':  ['Alice', 'Bob'],
    'math':  [90, 80],
    'english': [85, 92]
})

long = pd.melt(wide,
               id_vars='name',
               value_vars=['math', 'english'],
               var_name='subject',
               value_name='score')
#    name  subject  score
# 0  Alice    math     90
# 1    Bob    math     80
# 2  Alice english     85
# 3    Bob english     92

# stack/unstack — multi-level index operations
df_multi = df.set_index(['dept', 'name'])
stacked = df_multi.stack()       # column names become inner index level
unstacked = stacked.unstack()    # back to original

Time Series

# DateTime handling
df['date'] = pd.to_datetime(df['date'])        # parse strings to datetime
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# DatetimeIndex
dates = pd.date_range('2024-01-01', periods=100, freq='D')   # daily
ts = pd.Series(np.random.randn(100), index=dates)

# Resampling — aggregate to different frequency
monthly = ts.resample('M').mean()
weekly  = ts.resample('W').sum()
quarterly = ts.resample('Q').std()

# Rolling windows
ts.rolling(window=7).mean()    # 7-day moving average
ts.rolling(window=30).std()    # 30-day rolling std
ts.expanding().mean()          # expanding mean (from start)

# Time-based indexing
ts['2024-03']                  # all March data
ts['2024-01':'2024-03']        # Jan through March
ts.between_time('09:00','17:00')  # business hours only

# Shift and lag
ts.shift(1)      # shift forward 1 period (create lag)
ts.shift(-1)     # shift backward 1 period (create lead)
ts.diff(1)       # first difference
ts.pct_change(1) # percent change

Key Takeaways

  1. loc for labels, iloc for positions — know the difference to avoid bugs
  2. Boolean indexing is the Pandas way to filter — df[df['col'] > value]
  3. GroupBy follows split-apply-combine: group → aggregate → result
  4. merge is for joining on a key column; concat is for stacking
  5. Always check dtypes and missing values at the start of any analysis
  6. Method chaining (.filter().groupby().agg()) is idiomatic and readable
  7. Use .copy() when modifying a subset to avoid SettingWithCopyWarning

→ Next: Data Visualization → See Also: Data Cleaning Workflow

Advertisement

Need Expert Python Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement