R Data Frames — Tabular Data Masterclass

Learning Objectives

By the end of this tutorial, you will be able to:

Create data frames with data.frame() and tibble()
Subset data frames by rows, columns, or conditions
Add, remove, and modify columns
Merge and join data frames
Reshape data between wide and long formats
Use str(), summary(), and glimpse() to inspect data

What Is a Data Frame?

A data frame is R's most important data structure — a table where each column is a vector and columns can have different types. Think of it as a spreadsheet or SQL table.

# Create a data frame: each argument becomes one column, and all columns
# must have the same length
df <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana"),
  age = c(25, 30, 35, 28),
  score = c(95, 87, 92, 88),
  passed = c(TRUE, FALSE, TRUE, TRUE)
)

# Typing the name prints the table; the leading 1-4 are automatic row names
df
#      name age score passed
# 1   Alice  25    95   TRUE
# 2     Bob  30    87  FALSE
# 3 Charlie  35    92   TRUE
# 4   Diana  28    88   TRUE

Creating Data Frames

Using `data.frame()`

# Basic data frame
df <- data.frame(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)

# Strings as factors: before R 4.0, data.frame() converted character columns
# to factors by default (stringsAsFactors = TRUE). Since R 4.0 the default is
# FALSE, so the argument below only matters on older R versions.
df <- data.frame(
  x = 1:3,
  y = c("a", "b", "c"),
  stringsAsFactors = FALSE  # Recommended: keeps y as character
)

Using `tibble` (tidyverse)

library(tibble)

# Tibble is a modern data frame
tb <- tibble(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)

# Tibbles print nicer
tb
# # A tibble: 5 × 3
#       x y     z
#   <int> <chr> <lgl>
# 1     1 a     TRUE
# 2     2 b     FALSE
# 3     3 c     TRUE
# 4     4 d     FALSE
# 5     5 e     TRUE

# Convert between
as.data.frame(tb)
as_tibble(df)

From Vectors

# Column by column
name <- c("Alice", "Bob", "Charlie")
age <- c(25, 30, 35)
score <- c(95, 87, 92)

df <- data.frame(name, age, score)

# From matrix
m <- matrix(1:12, nrow = 4, dimnames = list(NULL, c("a", "b", "c")))
df <- as.data.frame(m)

Inspecting Data Frames

df <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  age = c(25, 30, 35, 28, 32),
  score = c(95, 87, 92, 88, 90),
  passed = c(TRUE, FALSE, TRUE, TRUE, TRUE)
)

# Structure
str(df)
# 'data.frame':	5 obs. of  4 variables:
#  $ name  : chr  "Alice" "Bob" "Charlie" "Diana" ...
#  $ age   : num  25 30 35 28 32
#  $ score : num  95 87 92 88 90
#  $ passed: logi  TRUE FALSE TRUE TRUE TRUE

# Summary
summary(df)
#      name               age          score       passed
# Length:5           Min.   :25.0   Min.   :87.0   Mode :logical
# Class :character   1st Qu.:28.0   1st Qu.:88.0   FALSE:1
# Mode  :character   Median :30.0   Median :90.0   TRUE :4
#                    Mean   :30.0   Mean   :90.4
#                    3rd Qu.:32.0   3rd Qu.:92.0
#                    Max.   :35.0   Max.   :95.0

# Dimensions
nrow(df)     # [1] 5
ncol(df)     # [1] 4
dim(df)      # [1] 5 4
names(df)    # [1] "name"   "age"    "score"  "passed"

# Head and tail
head(df, 3)  # First 3 rows
tail(df, 2)  # Last 2 rows

# Glimpse (tidyverse)
glimpse(df)
# Rows: 5
# Columns: 4
# $ name   <chr> "Alice", "Bob", "Charlie", "Diana", "Eve"
# $ age    <dbl> 25, 30, 35, 28, 32
# $ score  <dbl> 95, 87, 92, 88, 90
# $ passed <lgl> TRUE, FALSE, TRUE, TRUE, TRUE

Subsetting Data Frames

By Row

# First row
df[1, ]

# Rows 1-3
df[1:3, ]

# Specific rows
df[c(1, 3, 5), ]

# Row by condition
df[df$age > 30, ]

# Using which()
df[which(df$score >= 90), ]

# Using slice (dplyr)
library(dplyr)
slice(df, 1:3)
slice_min(df, score, n = 2)
slice_max(df, score, n = 2)

By Column

# Single column (returns vector)
df$name
df[, "name"]  # matrix-style indexing drops a single column to a vector

# Single column (returns data frame)
df["name"]
df[, "name", drop = FALSE]  # drop = FALSE keeps the data frame structure

# Multiple columns
df[, c("name", "score")]

# Using select (dplyr)
select(df, name, score)
select(df, -passed)        # Exclude column
select(df, starts_with("s")) # Pattern matching

By Row and Column

# Row 1, Column 2
df[1, 2]          # [1] 25

# Rows 1-3, Columns 1-2
df[1:3, 1:2]

# By name
df[1:3, c("name", "age")]

# Using filter and select (dplyr)
df |>
  filter(age > 25) |>
  select(name, score)

Logical Subsetting

# Multiple conditions (& is the element-wise AND used to build row masks)
df[df$age > 25 & df$score >= 90, ]

# Using %in%
df[df$name %in% c("Alice", "Charlie"), ]

# Using between() from dplyr (both bounds are inclusive)
df[between(df$age, 25, 30), ]

# Using filter (dplyr): comma-separated conditions are combined with AND
df |>
  filter(age > 25, score >= 90)

Modifying Data Frames

Adding Columns

# Direct assignment
df$grade <- c("A", "B", "A", "B", "A")

# Using cbind
df <- cbind(df, gpa = c(4.0, 3.0, 3.8, 3.2, 3.9))

# Using mutate (dplyr)
df <- df |>
  mutate(
    passed = score >= 80,
    grade = ifelse(score >= 90, "A", "B")
  )

Removing Columns

# Set to NULL
df$grade <- NULL

# Using select (dplyr)
df <- df |> select(-gpa)

Modifying Columns

# Direct modification
df$score <- df$score * 1.1  # Add 10% bonus

# Using mutate (dplyr)
df <- df |>
  mutate(
    score = score + 5,
    age = age + 1
  )

Merging Data Frames

# Two data frames
students <- data.frame(
  id = c(1, 2, 3, 4),
  name = c("Alice", "Bob", "Charlie", "Diana")
)

scores <- data.frame(
  id = c(2, 3, 4, 5),
  score = c(87, 92, 88, 95)
)

# Inner join — only matching rows (ids 2, 3 and 4 appear in both tables)
merge(students, scores, by = "id")
#   id    name score
# 1  2     Bob    87
# 2  3 Charlie    92
# 3  4   Diana    88

# Left join — all from left; score is NA for id 1, which has no match
merge(students, scores, by = "id", all.x = TRUE)

# Right join — all from right; name is NA for id 5, which has no match
merge(students, scores, by = "id", all.y = TRUE)

# Full join — all from both; unmatched cells are filled with NA
merge(students, scores, by = "id", all = TRUE)

# Using dplyr — one verb per join type, mirroring the merge() calls
library(dplyr)

# Only ids present in both tables
students |> inner_join(scores, by = "id")

# All students; score is NA where there is no match
students |> left_join(scores, by = "id")

# All scores; name is NA where there is no match
students |> right_join(scores, by = "id")

# Every id from either table
students |> full_join(scores, by = "id")

# Filtering join: students that have no matching row in scores
students |> anti_join(scores, by = "id")

Reshaping Data

Wide to Long

# Wide format
wide <- data.frame(
  student = c("Alice", "Bob", "Charlie"),
  math = c(90, 85, 88),
  science = c(92, 87, 91),
  english = c(88, 82, 90)
)

# Using reshape2 (retired; tidyr is recommended for new code)
library(reshape2)
long <- melt(wide, id.vars = "student", variable.name = "subject", value.name = "score")
long

# Using tidyr
library(tidyr)
long <- pivot_longer(wide, cols = math:english, names_to = "subject", values_to = "score")
long

Long to Wide

# Long format
long <- data.frame(
  student = c("Alice", "Alice", "Alice", "Bob", "Bob", "Bob"),
  subject = c("math", "science", "english", "math", "science", "english"),
  score = c(90, 92, 88, 85, 87, 82)
)

# Using reshape2
wide <- dcast(long, student ~ subject, value.var = "score")

# Using tidyr
wide <- pivot_wider(long, names_from = subject, values_from = score)

Practice Exercises

Exercise 1: Create and Query

Create a data frame of 10 students with names, ages, and scores. Then:

Find all students older than 25
Calculate the average score
Find the student with the highest score
Add a new column "grade" (A if score >= 90, else B)

Solution

# Fix the RNG seed so the sampled ages and scores are the same on every run
set.seed(42)

# Ten students with randomly drawn ages (20-35) and scores (70-100)
students <- data.frame(
  name = paste0("Student", 1:10),
  age = sample(20:35, 10, replace = TRUE),
  score = sample(70:100, 10, replace = TRUE)
)

# 1. Students older than 25
students[students$age > 25, ]

# 2. Average score
mean(students$score)

# 3. Highest score — compare against max() so that tied top scorers are all
#    returned; which.max() would report only the first of them
students[students$score == max(students$score), ]

# 4. Add grade column
students$grade <- ifelse(students$score >= 90, "A", "B")

Key Takeaways

Data frames are R's most important structure — tabular data with mixed types
Use data.frame() or tibble() to create
Subset with df[rows, cols], or use filter() and select() from dplyr
$ extracts a column as a vector
Merge with merge() or dplyr joins — inner, left, right, full
Reshape with pivot_longer() and pivot_wider() from tidyr
Always check str() and summary() when exploring data

Next: Learn about R Factors — categorical data in R.

R Data Frames — Tabular Data Masterclass

R Data Frames — Tabular Data Masterclass

Learning Objectives

What Is a Data Frame?

Creating Data Frames

Using data.frame()

Using tibble (tidyverse)

From Vectors

Inspecting Data Frames

Subsetting Data Frames

By Row

By Column

By Row and Column

Logical Subsetting

Modifying Data Frames

Adding Columns

Removing Columns

Modifying Columns

Merging Data Frames

Reshaping Data

Wide to Long

Long to Wide

Practice Exercises

Exercise 1: Create and Query

Key Takeaways

Need Expert R Programming Help?

Using `data.frame()`

Using `tibble` (tidyverse)