What is Pandas?
Pandas is the most important library for data manipulation and analysis in Python. It provides powerful data structures for working with structured (tabular) data.
"Pandas is the Python Data Analysis Library." — Wes McKinney (Creator)
Core Data Structures
Architecture Diagram
Series (1D)       DataFrame (2D)
┌──────────┐      ┌─────────┬─────┬─────────┐
│ Name     │      │ Name    │ Age │ City    │
├──────────┤      ├─────────┼─────┼─────────┤
│ Alice    │      │ Alice   │ 25  │ NYC     │
│ Bob      │      │ Bob     │ 30  │ LA      │
│ Charlie  │      │ Charlie │ 35  │ Chicago │
└──────────┘      └─────────┴─────┴─────────┘
Series: The 1D Building Block
import pandas as pd
import numpy as np
# --- Building a Series -------------------------------------------------
# From a plain list: pandas supplies a default integer index (0..n-1).
s = pd.Series([10, 20, 30, 40, 50])
print(s)
# 0    10
# 1    20
# 2    30
# 3    40
# 4    50
# dtype: int64

# With an explicit index: one label per value.
s_named = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s_named)
# a    10
# b    20
# c    30
# dtype: int64

# From a dictionary: keys become the index labels, values become the data.
s_dict = pd.Series({'math': 95, 'english': 88, 'science': 92})
print(s_dict)
# math       95
# english    88
# science    92
# dtype: int64

# --- Attributes and reductions ----------------------------------------
print(f"Values: {s.values}")  # [10 20 30 40 50]
print(f"Index: {s.index}")  # RangeIndex(start=0, stop=5, step=1)
print(f"Dtype: {s.dtype}")  # int64
print(f"Mean: {s.mean()}")  # 30.0
print(f"Sum: {s.sum()}")  # 150

# --- Boolean indexing --------------------------------------------------
# `s > 25` is itself a boolean Series; indexing with it keeps the True rows.
print(s[s > 25])
# 2    30
# 3    40
# 4    50
# dtype: int64

# --- String methods (the .str accessor) --------------------------------
# Each call returns a new Series (not a list); the values are shown inline.
names = pd.Series(['Alice', 'Bob', 'Charlie'])
print(names.str.lower())  # alice, bob, charlie
print(names.str.len())  # 5, 3, 7
# str.contains is case-sensitive by default: 'Alice' has only an uppercase
# 'A', so it does NOT match 'a'.
print(names.str.contains('a'))  # False, False, True
# Pass case=False for a case-insensitive match.
print(names.str.contains('a', case=False))  # True, False, True
DataFrames: The 2D Workhorse
# --- From a dict of columns -------------------------------------------
# Each key is a column label; each list holds that column's values, so all
# lists must have the same length.
employee_columns = dict(
    name=['Alice', 'Bob', 'Charlie', 'Diana'],
    age=[25, 30, 35, 28],
    city=['New York', 'Los Angeles', 'Chicago', 'Boston'],
    salary=[70000, 85000, 90000, 75000],
)
df = pd.DataFrame(employee_columns)
print(df)
#       name  age         city  salary
# 0    Alice   25     New York   70000
# 1      Bob   30  Los Angeles   85000
# 2  Charlie   35      Chicago   90000
# 3    Diana   28       Boston   75000

# --- From a 2D NumPy array --------------------------------------------
# The array only carries values, so row and column labels are passed in.
arr = np.arange(1, 10).reshape(3, 3)  # [[1 2 3] [4 5 6] [7 8 9]]
df_arr = pd.DataFrame(arr, index=list('xyz'), columns=list('ABC'))
print(df_arr)

# --- From a list of row dicts -----------------------------------------
# One dict per row; the dict keys become the column labels.
data = [
    dict(product='Laptop', price=999, quantity=50),
    dict(product='Phone', price=699, quantity=100),
    dict(product='Tablet', price=449, quantity=75),
]
df_products = pd.DataFrame(data)
print(df_products)
Essential DataFrame Operations
Inspecting Data
# Structural overview: dimensions, column labels and per-column dtypes.
frame_shape = df.shape
print(f"Shape: {frame_shape}")
column_labels = df.columns
print(f"\nColumns: {column_labels}")
column_dtypes = df.dtypes
print(f"\nData types:\n{column_dtypes}")

# info() writes its report straight to stdout, so it is called directly
# instead of being interpolated into a print().
print("\nInfo:")
df.info()

# Summary statistics (count, mean, std, quartiles, ...).
summary_stats = df.describe()
print(f"\nDescribe:\n{summary_stats}")

# Frequency of each distinct value in a categorical-style column.
city_frequencies = df['city'].value_counts()
print(f"\nValue counts:\n{city_frequencies}")

# Number of missing entries in every column.
nulls_per_column = df.isna().sum()
print(f"\nMissing values:\n{nulls_per_column}")
Selection Methods
# --- Columns -----------------------------------------------------------
name_column = df['name']  # a single label gives back a Series
print(name_column)
name_and_age = df[['name', 'age']]  # a list of labels gives back a DataFrame
print(name_and_age)

# --- Rows by label (loc): the slice end IS included --------------------
print(df.loc[0])
print(df.loc[0:2])

# --- Rows by integer position (iloc): the slice end is NOT included ----
print(df.iloc[0])
print(df.iloc[:2])

# --- Rows and columns together ----------------------------------------
wanted_columns = ['name', 'age']
print(df.loc[0:2, wanted_columns])  # labels 0..2, columns name and age
print(df.iloc[:2, :2])  # first two rows, first two columns

# --- Filtering with a boolean mask ------------------------------------
older_than_28 = df['age'] > 28
print(df[older_than_28])
#       name  age         city  salary
# 1      Bob   30  Los Angeles   85000
# 2  Charlie   35      Chicago   90000

# Masks combine element-wise with & (and) / | (or).
older_than_25 = df['age'].gt(25)
earns_over_80k = df['salary'].gt(80000)
print(df[older_than_25 & earns_over_80k])

# The same filter written as a query string.
print(df.query('age > 25 and salary > 80000'))
Adding and Removing Columns
def _bracket_for(salary):
    """Return 'high' for a salary above 80000, otherwise 'medium'."""
    if salary > 80000:
        return 'high'
    return 'medium'


# Assigning to a new label appends the column at the end of the frame.
df['bonus'] = df['salary'].mul(0.1)
df['tax_bracket'] = df['salary'].apply(_bracket_for)
print(df)

# insert() places the new column at a chosen position (here: third column)
# and modifies df in place.
df.insert(loc=2, column='department',
          value=['Engineering', 'Marketing', 'Engineering', 'Sales'])

# drop() returns a new frame; df itself is left untouched.
df_dropped = df.drop('bonus', axis='columns')
print(df_dropped)

# Dropping rows by index label (df_dropped is rebound to the new result).
df_dropped = df.drop([0, 2], axis='index')
print(df_dropped)

# rename() maps old column labels to new ones and also returns a copy.
new_labels = {'name': 'employee_name', 'age': 'employee_age'}
df_renamed = df.rename(columns=new_labels)
print(df_renamed.columns)
Indexing Deep Dive
Setting and Resetting Index
# Promote an existing column to the row index. set_index returns a new frame;
# df keeps its default integer index.
df_indexed = df.set_index('name')
print(df_indexed)
# Expected output when the preceding tutorial steps have been run in order
# (df then also carries 'department', 'bonus' and 'tax_bracket'; 'bonus' is
# float because it was computed as salary * 0.1):
#          age   department         city  salary   bonus tax_bracket
# name
# Alice     25  Engineering     New York   70000  7000.0      medium
# Bob       30    Marketing  Los Angeles   85000  8500.0        high
# Charlie   35  Engineering      Chicago   90000  9000.0        high
# Diana     28        Sales       Boston   75000  7500.0      medium

# With names as the index, loc looks rows up by name.
print(df_indexed.loc['Alice'])

# reset_index moves the index back into a regular 'name' column and restores
# the default 0..n-1 integer index.
df_reset = df_indexed.reset_index()
print(df_reset.head())
# MultiIndex (hierarchical indexing): every row is identified by a
# (group, number) pair built from the two parallel label lists below.
arrays = [['A', 'A', 'B', 'B'], ['one', 'two', 'one', 'two']]
multi_index = pd.MultiIndex.from_arrays(arrays, names=['group', 'number'])
df_multi = pd.DataFrame(np.random.randn(4, 2), index=multi_index, columns=['X', 'Y'])
print(df_multi)

# A single outer label selects every row of that group; the result is a
# DataFrame indexed by the remaining 'number' level.
print(df_multi.loc['A'])  # All rows where group = A

# Select one row with a tuple holding one label per index level.
# (Chaining df_multi.loc['A']['one'] would look for a COLUMN named 'one'
# in the sub-frame and raise KeyError.)
print(df_multi.loc[('A', 'one')])  # Row where group = A, number = one
Index Alignment
# Arithmetic between two Series matches values by index LABEL, not position.
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([10, 20, 30], index=['b', 'c', 'd'])

# Labels present in only one operand ('a' and 'd') have no partner, so the
# result is NaN there; the result becomes float to be able to hold NaN.
result = s1 + s2
print(result)
# a     NaN
# b    12.0    (2 + 10)
# c    23.0    (3 + 20)
# d     NaN
# dtype: float64

# add(..., fill_value=0) treats a missing partner as 0 instead of NaN.
result_filled = s1.add(s2, fill_value=0)
print(result_filled)
# a     1.0
# b    12.0
# c    23.0
# d    30.0
# dtype: float64
Data Types and Conversion
# Check data types
print(df.dtypes)

# Convert data types with astype().
df['age'] = df['age'].astype(float)
# NOTE: after this line 'salary' holds strings, so it no longer behaves as a
# number in later arithmetic or comparisons on df.
df['salary'] = df['salary'].astype(str)

# Convert to datetime: the result is a DatetimeIndex with date-aware helpers.
dates = pd.to_datetime(['2024-01-15', '2024-02-20', '2024-03-25'])
print(dates)
print(f"Day of week: {dates.day_name()}")

# Category type stores each distinct string once plus a small integer code
# per row. Measure the column before and after so the printed figure really
# is the saving (the original printed the total usage after conversion).
# The saving grows with the number of repeated values; on a tiny column whose
# values are all distinct it can be close to zero or even negative.
city_bytes_before = df['city'].memory_usage(deep=True)
df['city'] = df['city'].astype('category')
city_bytes_after = df['city'].memory_usage(deep=True)
print(f"Memory saved: {city_bytes_before - city_bytes_after} bytes")
Handling Missing Data
# Sample data with gaps: two numeric columns and one text column.
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': ['a', 'b', np.nan, 'd', 'e']
})

# Count the gaps per column and per row.
print(f"Missing values:\n{df_missing.isnull().sum()}")
print(f"\nMissing values per row:\n{df_missing.isnull().sum(axis=1)}")

# Drop missing values
print(f"\nDrop rows with any NaN:\n{df_missing.dropna()}")
print(f"\nDrop rows with all NaN:\n{df_missing.dropna(how='all')}")
# thresh=2 keeps only the rows that have at least 2 non-NaN values.
print(f"\nDrop rows with less than 2 non-NaN values:\n{df_missing.dropna(thresh=2)}")

# Fill missing values
# Zero is only a sensible placeholder for the numeric columns, so the fill is
# restricted to them; the text column 'C' keeps its gap rather than receiving
# an integer.
print(f"\nFill with 0:\n{df_missing.fillna({'A': 0, 'B': 0})}")
print(f"\nFill with mean:\n{df_missing['A'].fillna(df_missing['A'].mean())}")
# ffill()/bfill() replace the deprecated fillna(method='ffill'/'bfill').
# A leading gap has nothing before it to copy, so forward fill leaves it NaN.
print(f"\nForward fill:\n{df_missing.ffill()}")
print(f"\nBackward fill:\n{df_missing.bfill()}")

# Interpolation: estimate a gap from its neighbours (linear by default).
print(f"\nLinear interpolation:\n{df_missing['A'].interpolate()}")
Practical Example: Customer Analysis
# Simulate and analyze a customer dataset
import pandas as pd
import numpy as np

# Fixed seed so every run generates the same synthetic customers.
np.random.seed(42)
n_customers = 1000
df_customers = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    'age': np.random.randint(18, 70, n_customers),
    'gender': np.random.choice(['M', 'F', 'Other'], n_customers),
    # Truncate to whole currency units, then store as float: the column is
    # about to receive NaN, which an integer column cannot hold without an
    # implicit upcast (deprecated by pandas).
    'annual_income': np.random.normal(55000, 15000, n_customers).astype(int).astype(float),
    'spending_score': np.random.randint(1, 100, n_customers),
    'membership_years': np.random.randint(0, 15, n_customers)
})

# Add some missing values. replace=False guarantees 50 distinct rows; sampling
# with replacement could pick the same row twice and blank out fewer than 50.
missing_rows = np.random.choice(df_customers.index, 50, replace=False)
df_customers.loc[missing_rows, 'annual_income'] = np.nan

print("Dataset Overview:")
print(f"Shape: {df_customers.shape}")
print(f"\nData types:\n{df_customers.dtypes}")
print(f"\nMissing values:\n{df_customers.isnull().sum()}")
print(f"\nStatistical summary:\n{df_customers.describe()}")

# Quick EDA
print(f"\nGender distribution:\n{df_customers['gender'].value_counts()}")
print(f"\nAge distribution:\n{df_customers['age'].describe()}")

# Feature engineering: bucket income into labelled ranges. Rows whose income
# is NaN (or falls outside the bins) get a NaN bracket.
df_customers['income_bracket'] = pd.cut(
    df_customers['annual_income'],
    bins=[0, 30000, 60000, 100000, np.inf],
    labels=['Low', 'Medium', 'High', 'Very High']
)
print(f"\nIncome distribution:\n{df_customers['income_bracket'].value_counts()}")
Key Takeaways
Summary: Pandas DataFrames & Series
- Series = 1D labeled array, DataFrame = 2D labeled table
- Use loc for label-based indexing, iloc for position-based
- Always check dtypes and missing values first
- Pandas operations are vectorized — avoid loops when possible
- Index alignment is automatic but can produce NaN values
Practice Exercise
- Create a DataFrame with 100 rows and 5 columns
- Select rows where column A is greater than 50 and column B is less than column C
- Add a new column that's the ratio of column A to column B
- Introduce missing values into a random 10% of the entries, then handle them (drop, fill, or interpolate)
- Group by a categorical column and calculate statistics