R Data Frames — Tabular Data Masterclass
Learning Objectives
By the end of this tutorial, you will be able to:
- Create data frames with `data.frame()` and `tibble()`
- Subset data frames by rows, columns, or conditions
- Add, remove, and modify columns
- Merge and join data frames
- Reshape data between wide and long formats
- Use `str()`, `summary()`, and `glimpse()` to inspect data
What Is a Data Frame?
A data frame is R's most important data structure — a table where each column is a vector and columns can have different types. Think of it as a spreadsheet or SQL table.
# Build a four-row data frame; each named argument becomes one typed column
df <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana"),
  age = c(25, 30, 35, 28),
  score = c(95, 87, 92, 88),
  passed = c(TRUE, FALSE, TRUE, TRUE)
)
df
#      name age score passed
# 1   Alice  25    95   TRUE
# 2     Bob  30    87  FALSE
# 3 Charlie  35    92   TRUE
# 4   Diana  28    88   TRUE
Creating Data Frames
Using data.frame()
# Three columns of different types: integer, character, logical
df <- data.frame(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)
# Before R 4.0, data.frame() converted strings to factors by default.
# Passing stringsAsFactors = FALSE keeps them as character on any version
# (it is already the default from R 4.0 onwards).
df <- data.frame(
  x = 1:3,
  y = c("a", "b", "c"),
  stringsAsFactors = FALSE # Keep y as character
)
Using tibble (tidyverse)
library(tibble)
# tibble() builds a tidyverse-flavoured data frame
tb <- tibble(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)
# Printing a tibble shows its dimensions and the type of each column
tb
# # A tibble: 5 × 3
#       x y     z
#   <int> <chr> <lgl>
# 1     1 a     TRUE
# 2     2 b     FALSE
# 3     3 c     TRUE
# 4     4 d     FALSE
# 5     5 e     TRUE
# Convert in either direction
as.data.frame(tb) # tibble -> data.frame
as_tibble(df) # data.frame -> tibble
From Vectors
# One vector per column; data.frame() reuses the variable names as column names
name <- c("Alice", "Bob", "Charlie")
age <- c(25, 30, 35)
score <- c(95, 87, 92)
df <- data.frame(name, age, score)
# A matrix converts directly; its column dimnames become the column names
m <- matrix(
  1:12,
  nrow = 4,
  dimnames = list(NULL, c("a", "b", "c"))
)
df <- as.data.frame(m)
Inspecting Data Frames
# Five-row sample data used by the inspection examples below
df <- data.frame(
name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
age = c(25, 30, 35, 28, 32),
score = c(95, 87, 92, 88, 90),
passed = c(TRUE, FALSE, TRUE, TRUE, TRUE)
)
# Structure: one line per column showing its type and first values
str(df)
# 'data.frame': 5 obs. of 4 variables:
# $ name : chr "Alice" "Bob" "Charlie" "Diana" ...
# $ age : num 25 30 35 28 32
# $ score : num 95 87 92 88 90
# $ passed: logi TRUE FALSE TRUE TRUE TRUE
# Summary: quartiles and mean for numeric columns, TRUE/FALSE counts for
# logical columns, length and class for character columns
summary(df)
# name age score passed
# Length:5 Min. :25.0 Min. :87.0 Mode :logical
# Class :character 1st Qu.:28.0 1st Qu.:88.0 FALSE:1
# Mode :character Median :30.0 Median :90.0 TRUE :4
# Mean :30.0 Mean :90.4
# 3rd Qu.:32.0 3rd Qu.:92.0
# Max. :35.0 Max. :95.0
# Dimensions
nrow(df) # [1] 5
ncol(df) # [1] 4
dim(df) # [1] 5 4
names(df) # [1] "name" "age" "score" "passed"
# Head and tail
head(df, 3) # First 3 rows
tail(df, 2) # Last 2 rows
# Glimpse (tidyverse)
# glimpse() is not base R: it is available here because library(tibble) was
# attached earlier in this tutorial (dplyr exports it as well)
glimpse(df)
# Rows: 5
# Columns: 4
# $ name <chr> "Alice", "Bob", "Charlie", "Diana", "Eve"
# $ age <dbl> 25, 30, 35, 28, 32
# $ score <dbl> 95, 87, 92, 88, 90
# $ passed <lgl> TRUE, FALSE, TRUE, TRUE, TRUE
Subsetting Data Frames
By Row
# First row (leaving the slot after the comma empty selects all columns)
df[1, ]
# Rows 1-3
df[1:3, ]
# Specific rows
df[c(1, 3, 5), ]
# Row by condition (the comparison yields one TRUE/FALSE per row)
df[df$age > 30, ]
# Using which(): turns the logical vector into row positions and drops NAs,
# so a missing score skips the row instead of producing an all-NA row
df[which(df$score >= 90), ]
# Using slice (dplyr)
library(dplyr)
slice(df, 1:3) # Rows by position
slice_min(df, score, n = 2) # The 2 lowest scores (ties are kept by default)
slice_max(df, score, n = 2) # The 2 highest scores (ties are kept by default)
By Column
# Single column (returns vector)
df$name
# With a base data.frame, [rows, cols] has drop = TRUE by default, so picking
# one column collapses the result to a vector as well
df[, "name"]
# Single column (returns data frame)
df["name"]
df[, "name", drop = FALSE] # drop = FALSE keeps the data frame structure
# Multiple columns
df[, c("name", "score")]
# Using select (dplyr); select() always returns a data frame
select(df, name, score)
select(df, -passed) # Exclude column
select(df, starts_with("s")) # Pattern matching
By Row and Column
# Single cell: row 1, column 2 (age)
df[1, 2] # [1] 25
# Block of cells by position: rows 1-3, columns 1-2
df[1:3, 1:2]
# Same rows, with the columns picked by name
df[1:3, c("name", "age")]
# dplyr equivalent: filter() chooses rows, select() chooses columns
df |>
  filter(age > 25) |>
  select(name, score)
Logical Subsetting
# Combine conditions element-wise with & (and) or | (or)
df[df$age > 25 & df$score >= 90, ]
# Match against a set of values with %in%
df[df$name %in% c("Alice", "Charlie"), ]
# Range test with dplyr's between() (both bounds inclusive)
df[between(df$age, 25, 30), ]
# Using filter (dplyr); comma-separated conditions are combined with AND
df |>
  filter(age > 25, score >= 90)
Modifying Data Frames
Adding Columns
# Assign a vector (one value per row) to a new column name with $
df$grade <- c("A", "B", "A", "B", "A")
# cbind() appends a named column on the right
df <- cbind(df, gpa = c(4.0, 3.0, 3.8, 3.2, 3.9))
# mutate() (dplyr) computes columns from existing ones; passed and grade
# already exist at this point, so they are overwritten rather than appended
df <- df |>
  mutate(
    passed = score >= 80,
    grade = ifelse(score >= 90, "A", "B")
  )
Removing Columns
# Set to NULL: assigning NULL deletes the column from df in place
df$grade <- NULL
# Using select (dplyr): a minus sign keeps every column except gpa;
# the result has to be assigned back to df to take effect
df <- df |> select(-gpa)
Modifying Columns
# Overwrite a column in place with base R
df$score <- df$score * 1.1 # Add 10% bonus
# Update several columns in a single mutate() call (dplyr)
df <- df |>
  mutate(
    score = score + 5,
    age = age + 1
  )
Merging Data Frames
# Two tables sharing an id key; ids 2-4 appear in both
students <- data.frame(
  id = c(1, 2, 3, 4),
  name = c("Alice", "Bob", "Charlie", "Diana")
)
scores <- data.frame(
  id = c(2, 3, 4, 5),
  score = c(87, 92, 88, 95)
)
# Inner join: keep only ids present in both tables
merge(students, scores, by = "id")
#   id    name score
# 1  2     Bob    87
# 2  3 Charlie    92
# 3  4   Diana    88
# Left join: keep every row of students (score is NA for id 1)
merge(students, scores, by = "id", all.x = TRUE)
# Right join: keep every row of scores (name is NA for id 5)
merge(students, scores, by = "id", all.y = TRUE)
# Full join: keep every row of both tables
merge(students, scores, by = "id", all = TRUE)
# The same idea with dplyr join verbs
library(dplyr)
students |> inner_join(scores, by = "id")
students |> left_join(scores, by = "id")
# anti_join() returns the rows of students that have no match in scores
students |> anti_join(scores, by = "id")
Reshaping Data
Wide to Long
# Wide format: one row per student, one column per subject
wide <- data.frame(
  student = c("Alice", "Bob", "Charlie"),
  math = c(90, 85, 88),
  science = c(92, 87, 91),
  english = c(88, 82, 90)
)
# reshape2: melt() stacks every non-id column into subject/score pairs
library(reshape2)
long <- melt(
  wide,
  id.vars = "student",
  variable.name = "subject",
  value.name = "score"
)
long
# tidyr: pivot_longer() does the same for the columns math through english
library(tidyr)
long <- pivot_longer(
  wide,
  cols = math:english,
  names_to = "subject",
  values_to = "score"
)
long
Long to Wide
# Long format: one row per student/subject pair
long <- data.frame(
  student = c("Alice", "Alice", "Alice", "Bob", "Bob", "Bob"),
  subject = c("math", "science", "english", "math", "science", "english"),
  score = c(90, 92, 88, 85, 87, 82)
)
# reshape2: the formula reads rows ~ columns; value.var fills the cells
wide <- dcast(long, student ~ subject, value.var = "score")
# tidyr: names_from supplies the new column names, values_from the cells
wide <- pivot_wider(long, names_from = subject, values_from = score)
Practice Exercises
Exercise 1: Create and Query
Create a data frame of 10 students with names, ages, and scores. Then:
- Find all students older than 25
- Calculate the average score
- Find the student with the highest score
- Add a new column "grade" (A if score >= 90, else B)
Solution
# Fix the RNG seed so the randomly drawn ages and scores (and therefore every
# answer below) are the same each time the solution is run
set.seed(42)
students <- data.frame(
  name = paste0("Student", 1:10),
  age = sample(20:35, 10, replace = TRUE),
  score = sample(70:100, 10, replace = TRUE)
)
# 1. Students older than 25
students[students$age > 25, ]
# 2. Average score
mean(students$score)
# 3. Highest score (which.max() returns only the first row if several tie)
students[which.max(students$score), ]
# 4. Add grade column
students$grade <- ifelse(students$score >= 90, "A", "B")
Key Takeaways
- Data frames are R's most important structure — tabular data with mixed types
- Use `data.frame()` or `tibble()` to create
- Subset with `df[rows, cols]`, or use `filter()` and `select()` from dplyr
- `$` extracts a column as a vector
- Merge with `merge()` or dplyr joins — inner, left, right, full
- Reshape with `pivot_longer()` and `pivot_wider()` from tidyr
- Always check `str()` and `summary()` when exploring data
Next: Learn about R Factors — categorical data in R.