Pandas: Python's Data Manipulation Powerhouse
Pandas provides two main data structures built on top of NumPy:
- Series: 1D labeled array
- DataFrame: 2D labeled table (think spreadsheet or SQL table)
It is the most-used tool for data cleaning, transformation, and analysis.
Series
import pandas as pd
import numpy as np
# Create Series
s1 = pd.Series([10, 20, 30, 40, 50]) # default integer index
s2 = pd.Series([10, 20, 30], index=['a','b','c']) # custom index
s3 = pd.Series({'mon': 100, 'tue': 200, 'wed': 150}) # from dict
print(s1)
# 0 10
# 1 20
# ...
# Access
print(s2['b']) # 20 (label-based)
print(s1[2]) # 30 (positional)
print(s2[['a','c']]) # multiple labels
# Vectorized operations
print(s1 * 2) # [20, 40, 60, 80, 100]
print(s1 + s2) # aligns on index! NaN for mismatches
# Attributes
print(s1.values) # numpy array
print(s1.index) # RangeIndex
print(s1.dtype) # int64
print(s1.shape) # (5,)
print(s1.name) # None (can be set)
# Statistical methods
print(s1.mean(), s1.std(), s1.min(), s1.max())
print(s1.describe())
print(s1.value_counts())
DataFrame Creation
# From dict of lists
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
'age': [28, 32, 25, 35, 29],
'salary': [75000, 85000, 65000, 95000, 70000],
'dept': ['DS', 'Eng', 'DS', 'Mgt', 'Eng'],
'score': [92, 87, 95, 78, 88],
})
# From list of dicts (JSON-like)
records = [
{'id': 1, 'value': 42.5, 'label': 'A'},
{'id': 2, 'value': 37.2, 'label': 'B'},
{'id': 3, 'value': 55.1, 'label': 'A'},
]
df2 = pd.DataFrame(records)
# From NumPy array
arr = np.random.randn(100, 4)
df3 = pd.DataFrame(arr, columns=['feat_1', 'feat_2', 'feat_3', 'feat_4'])
# From CSV / Excel / SQL
df_csv = pd.read_csv('data.csv')
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# df_sql = pd.read_sql('SELECT * FROM table', connection)
print(df.shape) # (5, 5)
print(df.dtypes)
print(df.info()) # non-null counts, dtypes, memory
print(df.describe()) # statistical summary of numeric cols
print(df.head(3)) # first 3 rows
print(df.tail(2)) # last 2 rows
Selection and Filtering — The Most Important Section
# ── Column selection ───────────────────────────────────────
print(df['name']) # Series (single column)
print(df[['name', 'salary']]) # DataFrame (multiple columns)
print(df.name) # dot notation — works if no spaces
# ── Row selection ──────────────────────────────────────────
# loc — label-based (INCLUSIVE of stop)
print(df.loc[0]) # row by label
print(df.loc[0:2]) # rows 0, 1, 2 (INCLUSIVE)
print(df.loc[0, 'name']) # single value
print(df.loc[0:2, ['name','salary']]) # rows 0-2, specific cols
# Set custom index
df_idx = df.set_index('name')
print(df_idx.loc['Alice']) # row for Alice
print(df_idx.loc['Alice', 'salary']) # Alice's salary
# iloc — integer position-based (EXCLUSIVE of stop, like Python slicing)
print(df.iloc[0]) # first row
print(df.iloc[0:2]) # rows 0, 1 (NOT 2)
print(df.iloc[0, 2]) # row 0, col 2
print(df.iloc[0:3, [0,2]]) # rows 0-2, cols 0 and 2
# ── Boolean filtering ──────────────────────────────────────
# This is the CORE of data analysis filtering
print(df[df['salary'] >= 80000]) # high earners
print(df[df['dept'] == 'DS']) # data scientists
print(df[(df['dept'] == 'DS') & (df['score'] >= 90)]) # DS with high score
print(df[(df['salary'] < 70000) | (df['score'] > 90)]) # cheap OR smart
# .query() — SQL-like string syntax
print(df.query("dept == 'DS' and score >= 90"))
print(df.query("salary > @threshold", threshold=80000)) # use Python variable
# isin — filter by list of values
print(df[df['dept'].isin(['DS', 'Eng'])])
# between — filter numeric range
print(df[df['salary'].between(70000, 90000)])
# str accessor for string filtering
print(df[df['name'].str.startswith('A')])
print(df[df['name'].str.contains('a', case=False)])
print(df[df['name'].str.len() > 4])
Data Cleaning
# Missing values
df_messy = pd.DataFrame({
'A': [1, 2, np.nan, 4, np.nan],
'B': [np.nan, 2, 3, np.nan, 5],
'C': ['x', 'y', 'z', None, 'w']
})
print(df_messy.isnull()) # boolean mask of NaN
print(df_messy.isnull().sum()) # count NaN per column
print(df_messy.isnull().sum(axis=1)) # count NaN per row
print(f"Total missing: {df_messy.isnull().sum().sum()}")
print(f"% missing: {df_messy.isnull().mean() * 100}")
# Drop missing
df_messy.dropna() # drop any row with NaN
df_messy.dropna(axis=1) # drop columns with NaN
df_messy.dropna(subset=['A']) # drop rows where A is NaN
df_messy.dropna(thresh=2) # keep rows with at least 2 non-NaN
# Fill missing
df_messy.fillna(0) # fill with constant
df_messy['A'].fillna(df_messy['A'].mean()) # fill with mean
df_messy.fillna(method='ffill') # forward fill
df_messy.fillna(method='bfill') # backward fill
df_messy['A'].interpolate() # linear interpolation
# Duplicates
df.duplicated() # boolean: is row a duplicate?
df.duplicated(subset=['name']) # duplicates on specific cols
df.drop_duplicates() # remove duplicate rows
df.drop_duplicates(subset=['dept'], keep='first') # keep first per dept
# Data type conversion
df['age'] = df['age'].astype(float)
df['salary'] = pd.to_numeric(df['salary'], errors='coerce') # NaN for failures
df['dept'] = df['dept'].astype('category') # memory-efficient for categoricals
# Rename columns
df.rename(columns={'name': 'full_name', 'score': 'performance_score'}, inplace=True)
df.columns = [col.lower().replace(' ', '_') for col in df.columns] # clean all names
# Drop columns/rows
df.drop(columns=['score'])
df.drop(index=[0, 2])
Adding and Transforming Columns
# New column from expression
df['annual_salary'] = df['salary'] * 12
df['salary_grade'] = df['salary'].apply(lambda x: 'High' if x > 80000 else 'Standard')
# pd.cut — bin continuous into categories
df['age_group'] = pd.cut(df['age'],
bins=[0, 25, 35, 100],
labels=['<25', '25-35', '35+'])
# pd.qcut — quantile-based binning (equal frequency)
df['salary_quartile'] = pd.qcut(df['salary'], q=4, labels=['Q1','Q2','Q3','Q4'])
# map — replace values using a dict
dept_full = {'DS': 'Data Science', 'Eng': 'Engineering', 'Mgt': 'Management'}
df['dept_full'] = df['dept'].map(dept_full)
# apply — apply a function to each element, row, or column
df['name_len'] = df['name'].apply(len)
# apply row-wise (axis=1)
def compute_bonus(row):
return row['salary'] * (0.10 if row['score'] >= 90 else 0.05)
df['bonus'] = df.apply(compute_bonus, axis=1)
# transform — apply function but keep same shape (useful in groupby)
df['salary_zscore'] = df['salary'].transform(
lambda x: (x - x.mean()) / x.std()
)
GroupBy — Split-Apply-Combine
# Basic groupby
grouped = df.groupby('dept')
# Single aggregation
print(grouped['salary'].mean()) # mean salary per dept
print(grouped['salary'].median())
print(grouped.size()) # count per group
# Multiple aggregations with agg()
dept_stats = grouped['salary'].agg(['mean', 'std', 'min', 'max', 'count'])
print(dept_stats)
# Multiple columns, multiple functions
multi_stats = grouped.agg({
'salary': ['mean', 'std'],
'score': ['mean', 'max'],
'age': 'median'
})
print(multi_stats)
# Custom aggregation functions
def salary_range(series):
return series.max() - series.min()
print(grouped['salary'].agg(salary_range))
# Transform — return same-length output
df['dept_avg_salary'] = df.groupby('dept')['salary'].transform('mean')
df['salary_vs_dept'] = df['salary'] - df['dept_avg_salary']
# filter — keep groups satisfying a condition
big_depts = df.groupby('dept').filter(lambda g: len(g) >= 2)
# Multiple groupby keys
df.groupby(['dept', 'age_group'])['salary'].mean()
Merging and Joining
employees = pd.DataFrame({
'emp_id': [1, 2, 3, 4, 5],
'name': ['Alice', 'Bob', 'Carol', 'Dave', 'Eve'],
'dept_id': [10, 20, 10, 30, 20]
})
departments = pd.DataFrame({
'dept_id': [10, 20, 30, 40],
'dept_name': ['Data Science', 'Engineering', 'Management', 'Marketing']
})
salaries = pd.DataFrame({
'emp_id': [1, 2, 3, 4, 6], # note: emp 5 missing, emp 6 extra
'salary': [75000, 85000, 65000, 95000, 72000]
})
# merge (like SQL JOIN)
inner = pd.merge(employees, salaries, on='emp_id', how='inner') # rows in both
left = pd.merge(employees, salaries, on='emp_id', how='left') # all employees
right = pd.merge(employees, salaries, on='emp_id', how='right') # all salary records
outer = pd.merge(employees, salaries, on='emp_id', how='outer') # all rows
# Different column names
result = pd.merge(employees, departments,
left_on='dept_id', right_on='dept_id', how='left')
# Multiple keys
pd.merge(df1, df2, on=['key1', 'key2'])
# concat — stack DataFrames vertically
all_data = pd.concat([df1, df2, df3], ignore_index=True)
all_data = pd.concat([df1, df2], axis=0) # vertical
features = pd.concat([features1, features2], axis=1) # horizontal (join by index)
Reshaping Data
# pivot_table — Excel-like pivot
pivot = df.pivot_table(
values='salary',
index='dept',
columns='age_group',
aggfunc='mean',
fill_value=0,
margins=True # add row/column totals
)
# melt — wide to long format (unpivot)
wide = pd.DataFrame({
'name': ['Alice', 'Bob'],
'math': [90, 80],
'english': [85, 92]
})
long = pd.melt(wide,
id_vars='name',
value_vars=['math', 'english'],
var_name='subject',
value_name='score')
# name subject score
# 0 Alice math 90
# 1 Bob math 80
# 2 Alice english 85
# 3 Bob english 92
# stack/unstack — multi-level index operations
df_multi = df.set_index(['dept', 'name'])
stacked = df_multi.stack() # column names become inner index level
unstacked = stacked.unstack() # back to original
Time Series
# DateTime handling
df['date'] = pd.to_datetime(df['date']) # parse strings to datetime
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
# DatetimeIndex
dates = pd.date_range('2024-01-01', periods=100, freq='D') # daily
ts = pd.Series(np.random.randn(100), index=dates)
# Resampling — aggregate to different frequency
monthly = ts.resample('M').mean()
weekly = ts.resample('W').sum()
quarterly = ts.resample('Q').std()
# Rolling windows
ts.rolling(window=7).mean() # 7-day moving average
ts.rolling(window=30).std() # 30-day rolling std
ts.expanding().mean() # expanding mean (from start)
# Time-based indexing
ts['2024-03'] # all March data
ts['2024-01':'2024-03'] # Jan through March
ts.between_time('09:00','17:00') # business hours only
# Shift and lag
ts.shift(1) # shift forward 1 period (create lag)
ts.shift(-1) # shift backward 1 period (create lead)
ts.diff(1) # first difference
ts.pct_change(1) # percent change
Key Takeaways
- loc for labels, iloc for positions — know the difference to avoid bugs
- Boolean indexing is the Pandas way to filter —
df[df['col'] > value] - GroupBy follows split-apply-combine: group → aggregate → result
- merge is for joining on a key column; concat is for stacking
- Always check dtypes and missing values at the start of any analysis
- Method chaining (
.filter().groupby().agg()) is idiomatic and readable - Use
.copy()when modifying a subset to avoid SettingWithCopyWarning
→ Next: Data Visualization → See Also: Data Cleaning Workflow