R Data Frames — Tabular Data Masterclass

R Basics › Data Frames › Free Lesson

Advertisement

R Data Frames — Tabular Data Masterclass

Learning Objectives

By the end of this tutorial, you will be able to:

  • Create data frames with data.frame() and tibble()
  • Subset data frames by rows, columns, or conditions
  • Add, remove, and modify columns
  • Merge and join data frames
  • Reshape data between wide and long formats
  • Use str(), summary(), and glimpse() to inspect data

What Is a Data Frame?

A data frame is R's most important data structure — a table in which each column is a vector, all columns have the same length, and different columns can hold different types. Think of it as a spreadsheet or an SQL table.

# Create a data frame: one named vector per column, all of length 4
df <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana"),
  age = c(25, 30, 35, 28),
  score = c(95, 87, 92, 88),
  passed = c(TRUE, FALSE, TRUE, TRUE)
)

# Typing the object's name prints it as a table with row numbers
df
#      name age score passed
# 1   Alice  25    95   TRUE
# 2     Bob  30    87  FALSE
# 3 Charlie  35    92   TRUE
# 4   Diana  28    88   TRUE

Creating Data Frames

Using data.frame()

# Basic data frame: integer, character, and logical columns side by side
df <- data.frame(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)

# stringsAsFactors: before R 4.0 the default was TRUE (strings became
# factors); since R 4.0 the default is FALSE (strings stay character)
df <- data.frame(
  x = 1:3,
  y = c("a", "b", "c"),
  stringsAsFactors = FALSE  # Recommended: explicit, same result on any R version
)

Using tibble (tidyverse)

library(tibble)

# Tibble is a modern data frame; tibble() takes name = vector pairs
# just like data.frame()
tb <- tibble(
  x = 1:5,
  y = c("a", "b", "c", "d", "e"),
  z = c(TRUE, FALSE, TRUE, FALSE, TRUE)
)

# Tibbles print nicer: the header shows the dimensions and each
# column's type (<int>, <chr>, <lgl>)
tb
# # A tibble: 5 × 3
#       x y     z
#   <int> <chr> <lgl>
# 1     1 a     TRUE
# 2     2 b     FALSE
# 3     3 c     TRUE
# 4     4 d     FALSE
# 5     5 e     TRUE

# Convert between the two classes (results are printed here, not
# assigned; use <- to keep them)
as.data.frame(tb)
as_tibble(df)

From Vectors

# Column by column: build one vector per column first
name <- c("Alice", "Bob", "Charlie")
age <- c(25, 30, 35)
score <- c(95, 87, 92)

# Unnamed arguments: the column names are taken from the variable names
df <- data.frame(name, age, score)

# From matrix: 1:12 with nrow = 4 gives a 4 x 3 matrix; dimnames supplies
# no row names (NULL) and the column names "a", "b", "c"
m <- matrix(1:12, nrow = 4, dimnames = list(NULL, c("a", "b", "c")))
df <- as.data.frame(m)

Inspecting Data Frames

# Five-row sample data frame inspected below
df <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  age = c(25, 30, 35, 28, 32),
  score = c(95, 87, 92, 88, 90),
  passed = c(TRUE, FALSE, TRUE, TRUE, TRUE)
)

# Structure: one line per column with its type and first values
str(df)
# 'data.frame':	5 obs. of  4 variables:
#  $ name  : chr  "Alice" "Bob" "Charlie" "Diana" ...
#  $ age   : num  25 30 35 28 32
#  $ score : num  95 87 92 88 90
#  $ passed: logi  TRUE FALSE TRUE TRUE TRUE

# Summary: quartiles and mean for numeric columns, counts for logical
# columns, length/class for character columns
summary(df)
#      name               age          score       passed
# Length:5           Min.   :25.0   Min.   :87.0   Mode :logical
# Class :character   1st Qu.:28.0   1st Qu.:88.0   FALSE:1
# Mode  :character   Median :30.0   Median :90.0   TRUE :4
#                    Mean   :30.0   Mean   :90.4
#                    3rd Qu.:32.0   3rd Qu.:92.0
#                    Max.   :35.0   Max.   :95.0

# Dimensions
nrow(df)     # [1] 5
ncol(df)     # [1] 4
dim(df)      # [1] 5 4   (rows, then columns)
names(df)    # [1] "name"   "age"    "score"  "passed"

# Head and tail
head(df, 3)  # First 3 rows
tail(df, 2)  # Last 2 rows

# Glimpse (tidyverse) — needs library(tibble) or library(dplyr) loaded;
# like str(), but with one column per line and tidyverse type labels
glimpse(df)
# Rows: 5
# Columns: 4
# $ name   <chr> "Alice", "Bob", "Charlie", "Diana", "Eve"
# $ age    <dbl> 25, 30, 35, 28, 32
# $ score  <dbl> 95, 87, 92, 88, 90
# $ passed <lgl> TRUE, FALSE, TRUE, TRUE, TRUE

Subsetting Data Frames

By Row

# First row (the empty slot after the comma means "all columns")
df[1, ]

# Rows 1-3
df[1:3, ]

# Specific rows
df[c(1, 3, 5), ]

# Row by condition: keeps the rows where the logical vector is TRUE
# (an NA in the condition would yield a row filled with NA)
df[df$age > 30, ]

# Using which(): turns the condition into row numbers and drops NAs
df[which(df$score >= 90), ]

# Using slice (dplyr)
library(dplyr)
slice(df, 1:3)               # rows by position
slice_min(df, score, n = 2)  # the 2 rows with the lowest score
slice_max(df, score, n = 2)  # the 2 rows with the highest score

By Column

# Single column (returns vector)
df$name
df[["name"]]
df[, "name"]   # base data frames drop a single selected column to a vector

# Single column (returns data frame)
df["name"]
df[, "name", drop = FALSE]  # drop = FALSE keeps the one-column data frame

# Multiple columns (always a data frame)
df[, c("name", "score")]

# Using select (dplyr) — always returns a data frame
select(df, name, score)
select(df, -passed)        # Exclude column
select(df, starts_with("s")) # Pattern matching

By Row and Column

# Row 1, Column 2 (a single cell comes back as a plain value)
df[1, 2]          # [1] 25

# Rows 1-3, Columns 1-2
df[1:3, 1:2]

# By name
df[1:3, c("name", "age")]

# Using filter and select (dplyr): filter() picks rows, select() picks
# columns; |> is the native pipe (R >= 4.1)
df |>
  filter(age > 25) |>
  select(name, score)

Logical Subsetting

# Multiple conditions (& is the element-wise AND for vectors)
df[df$age > 25 & df$score >= 90, ]

# Using %in%
df[df$name %in% c("Alice", "Charlie"), ]

# Using between (dplyr): TRUE where 25 <= age <= 30, both ends inclusive
df[between(df$age, 25, 30), ]

# Using filter (dplyr): comma-separated conditions are combined with AND
df |>
  filter(age > 25, score >= 90)

Modifying Data Frames

Adding Columns

# Direct assignment (one value per row)
df$grade <- c("A", "B", "A", "B", "A")

# Using cbind
df <- cbind(df, gpa = c(4.0, 3.0, 3.8, 3.2, 3.9))

# Using mutate (dplyr)
# NOTE: passed and grade already exist in df at this point, so mutate()
# overwrites them here; a name that is not yet a column would be added
# as a new column instead.
df <- df |>
  mutate(
    passed = score >= 80,
    grade = ifelse(score >= 90, "A", "B")
  )

Removing Columns

# Set to NULL: removes the column from df directly
df$grade <- NULL

# Using select (dplyr): a leading minus drops the column; the result
# must be assigned back to df to take effect
df <- df |> select(-gpa)

Modifying Columns

# Direct modification: the arithmetic is applied to every row at once
df$score <- df$score * 1.1  # Add 10% bonus

# Using mutate (dplyr): assigning to an existing column name replaces
# that column with the new values
df <- df |>
  mutate(
    score = score + 5,
    age = age + 1
  )

Merging Data Frames

# Two data frames sharing the key column id
# (id 1 appears only in students, id 5 only in scores)
students <- data.frame(
  id = c(1, 2, 3, 4),
  name = c("Alice", "Bob", "Charlie", "Diana")
)

scores <- data.frame(
  id = c(2, 3, 4, 5),
  score = c(87, 92, 88, 95)
)

# Inner join — only matching rows
merge(students, scores, by = "id")
#   id    name score
# 1  2     Bob    87
# 2  3 Charlie    92
# 3  4   Diana    88

# Left join — all from left (id 1 is kept, with NA for score)
merge(students, scores, by = "id", all.x = TRUE)

# Right join — all from right (id 5 is kept, with NA for name)
merge(students, scores, by = "id", all.y = TRUE)

# Full join — all from both
merge(students, scores, by = "id", all = TRUE)

# Using dplyr
library(dplyr)
students |> inner_join(scores, by = "id")
students |> left_join(scores, by = "id")
students |> anti_join(scores, by = "id")  # No match: rows of students whose id is not in scores

Reshaping Data

Wide to Long

# Wide format: one row per student, one column per subject
wide <- data.frame(
  student = c("Alice", "Bob", "Charlie"),
  math = c(90, 85, 88),
  science = c(92, 87, 91),
  english = c(88, 82, 90)
)

# Using reshape2: id.vars stays as-is; every other column is stacked into
# a subject/score pair (3 students x 3 subjects = 9 rows)
library(reshape2)
long <- melt(wide, id.vars = "student", variable.name = "subject", value.name = "score")
long

# Using tidyr: cols = math:english selects the columns to stack;
# this overwrites the long object created above
library(tidyr)
long <- pivot_longer(wide, cols = math:english, names_to = "subject", values_to = "score")
long

Long to Wide

# Long format: one row per student/subject pair
long <- data.frame(
  student = c("Alice", "Alice", "Alice", "Bob", "Bob", "Bob"),
  subject = c("math", "science", "english", "math", "science", "english"),
  score = c(90, 92, 88, 85, 87, 82)
)

# Using reshape2 (dcast() needs library(reshape2) to be loaded):
# the formula reads rows ~ columns, and value.var fills the cells
wide <- dcast(long, student ~ subject, value.var = "score")

# Using tidyr (needs library(tidyr)): one new column per distinct subject,
# filled from score; this overwrites the wide object created above
wide <- pivot_wider(long, names_from = subject, values_from = score)

Practice Exercises

Exercise 1: Create and Query

Create a data frame of 10 students with names, ages, and scores. Then:

  1. Find all students older than 25
  2. Calculate the average score
  3. Find the student with the highest score
  4. Add a new column "grade" (A if score >= 90, else B)

Solution

# Fix the random seed so the sampled ages and scores (and therefore every
# answer below) are the same on each run
set.seed(42)

# Ten students with random ages (20-35) and scores (70-100)
students <- data.frame(
  name = paste0("Student", 1:10),
  age = sample(20:35, 10, replace = TRUE),
  score = sample(70:100, 10, replace = TRUE)
)

# 1. Students older than 25
students[students$age > 25, ]

# 2. Average score
mean(students$score)

# 3. Highest score
# Compare against max() rather than using which.max(): which.max() returns
# only the first position, so it would hide students tied for the top score.
students[students$score == max(students$score), ]

# 4. Add grade column
students$grade <- ifelse(students$score >= 90, "A", "B")

Key Takeaways

  • Data frames are R's most important structure — tabular data with mixed types
  • Use data.frame() or tibble() to create
  • Subset with df[rows, cols], or use filter() and select() from dplyr
  • $ extracts a column as a vector
  • Merge with merge() or dplyr joins — inner, left, right, full
  • Reshape with pivot_longer() and pivot_wider() from tidyr
  • Always check str() and summary() when exploring data

Next: Learn about R Factors — categorical data in R.

Advertisement

Need Expert R Programming Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement