R Data Manipulation with dplyr — Grammar of Data
Learning Objectives
By the end of this tutorial, you will be able to:
- Use
filter()andslice()for row selection - Apply
select()andrename()for column operations - Create and modify columns with
mutate()andtransmute() - Sort data with
arrange() - Summarize data with
summarize()andgroup_by() - Use window functions for running calculations
What Is dplyr?
dplyr is a grammar of data manipulation — a consistent set of functions that solve the most common data manipulation challenges.
library(dplyr)
Core Verbs
filter() — Row Selection
# Filter rows by condition
mtcars |> filter(cyl == 6)
# Multiple conditions (AND)
mtcars |> filter(cyl == 6, mpg > 20)
# Multiple conditions (OR)
mtcars |> filter(cyl == 6 | cyl == 8)
# Not equal
mtcars |> filter(cyl != 4)
# Range
mtcars |> filter(mpg >= 20, mpg <= 30)
# %in% operator
mtcars |> filter(cyl %in% c(4, 6))
# Using which()
mtcars |> filter(which.max(mpg))
select() — Column Selection
# Select specific columns
mtcars |> select(mpg, cyl, hp)
# Select range
mtcars |> select(mpg:hp)
# Exclude columns
mtcars |> select(-mpg, -cyl)
# Select by pattern
mtcars |> select(starts_with("c"))
mtcars |> select(ends_with("p"))
mtcars |> select(contains("ar"))
mtcars |> select(matches("^c"))
# Rename while selecting
mtcars |> select(miles_per_gallon = mpg, cylinders = cyl)
# Everything else
mtcars |> select(mpg, everything())
mutate() — Create/Modify Columns
# Add new column
mtcars |> mutate(
kpg = mpg * 1.60934,
hp_per_cyl = hp / cyl
)
# Modify existing column
mtcars |> mutate(
mpg = mpg * 1.1 # 10% increase
)
# Conditional column
mtcars |> mutate(
efficiency = case_when(
mpg > 25 ~ "High",
mpg > 15 ~ "Medium",
TRUE ~ "Low"
)
)
# transmute() — only keep new columns
mtcars |> transmute(
name = rownames(mtcars),
kpg = mpg * 1.60934
)
arrange() — Sorting
# Sort ascending
mtcars |> arrange(mpg)
# Sort descending
mtcars |> arrange(desc(mpg))
# Multiple columns
mtcars |> arrange(cyl, desc(mpg))
# With NA handling
mtcars |> arrange(desc(mpg), .na_last = TRUE)
summarize() — Aggregation
# Single summary
mtcars |> summarize(
avg_mpg = mean(mpg),
sd_mpg = sd(mpg),
n = n()
)
# With group_by
mtcars |>
group_by(cyl) |>
summarize(
avg_mpg = mean(mpg),
avg_hp = mean(hp),
n = n()
)
# Multiple grouping variables
mtcars |>
group_by(cyl, gear) |>
summarize(
avg_mpg = mean(mpg),
n = n(),
.groups = "drop"
)
Grouped Operations
# group_by + summarize
mtcars |>
group_by(cyl) |>
summarize(
across(where(is.numeric), mean),
n = n()
)
# group_by + mutate
mtcars |>
group_by(cyl) |>
mutate(
mpg_rank = rank(desc(mpg)),
mpg_z = (mpg - mean(mpg)) / sd(mpg)
)
# group_by + filter
mtcars |>
group_by(cyl) |>
filter(mpg == max(mpg))
# ungroup
mtcars |>
group_by(cyl) |>
summarize(avg = mean(mpg)) |>
ungroup()
Window Functions
# Ranking
mtcars |>
group_by(cyl) |>
mutate(
rank = rank(desc(mpg)),
dense_rank = dense_rank(desc(mpg)),
percent_rank = percent_rank(desc(mpg)),
row_number = row_number(desc(mpg))
)
# Running totals
mtcars |>
group_by(cyl) |>
arrange(mpg) |>
mutate(
cumulative_hp = cumsum(hp),
running_mean = cummean(mpg)
)
# Lead and lag
mtcars |>
group_by(cyl) |>
arrange(mpg) |>
mutate(
prev_mpg = lag(mpg),
next_mpg = lead(mpg),
mpg_diff = mpg - lag(mpg)
)
# Cumulative statistics
mtcars |>
group_by(cyl) |>
arrange(mpg) |>
mutate(
cum_max = cummax(mpg),
cum_min = cummin(mpg),
cum_any = cumany(mpg > 25),
cum_all = cumall(mpg > 10)
)
Joins
# Create sample data
authors <- tibble(
id = c(1, 2, 3),
name = c("Alice", "Bob", "Charlie")
)
books <- tibble(
id = c(1, 2, 2, 4),
title = c("Book A", "Book B", "Book C", "Book D")
)
# Inner join
authors |> inner_join(books, by = "id")
# Left join
authors |> left_join(books, by = "id")
# Right join
authors |> right_join(books, by = "id")
# Full join
authors |> full_join(books, by = "id")
# Anti join (no match)
authors |> anti_join(books, by = "id")
# Cross join
authors |> cross_join(books)
Binding
# Bind rows
df1 <- tibble(x = 1:3, y = letters[1:3])
df2 <- tibble(x = 4:6, y = letters[4:6])
bind_rows(df1, df2)
# Bind columns
df1 <- tibble(x = 1:3)
df2 <- tibble(y = 4:6)
bind_cols(df1, df2)
across() — Multiple Columns
# Apply function to multiple columns
mtcars |>
summarize(across(where(is.numeric), mean))
# Specific columns
mtcars |>
summarize(across(c(mpg, cyl, hp), mean))
# With custom function
mtcars |>
summarize(across(where(is.numeric), list(
mean = mean,
sd = sd
)))
# In mutate
mtcars |>
mutate(across(where(is.numeric), scale))
Practical Examples
Example 1: Sales Analysis
sales <- tibble(
date = sample(seq(as.Date("2024-01-01"), as.Date("2024-12-31"), by = "day"), 1000),
product = sample(c("A", "B", "C"), 1000, replace = TRUE),
region = sample(c("North", "South", "East", "West"), 1000, replace = TRUE),
revenue = runif(1000, 100, 1000)
)
# Monthly revenue by product
monthly <- sales |>
mutate(month = floor_date(date, "month")) |>
group_by(month, product) |>
summarize(total = sum(revenue), .groups = "drop")
# Top product per region
top_products <- sales |>
group_by(region, product) |>
summarize(total = sum(revenue), .groups = "drop") |>
group_by(region) |>
filter(total == max(total))
Practice Exercises
Exercise 1: Data Summary
Using the diamonds dataset from ggplot2, summarize average price by cut and color.
Solution
library(ggplot2)
diamonds |>
group_by(cut, color) |>
summarize(
avg_price = mean(price),
n = n(),
.groups = "drop"
)
Key Takeaways
- dplyr verbs are intuitive: filter, select, mutate, arrange, summarize
- Pipe
|>chains operations — read top to bottom group_by()enables grouped operations — like SQL GROUP BY- Window functions compute across rows within groups
across()applies functions to multiple columns- Joins combine datasets — inner, left, right, full, anti
- Always end with
.groups = "drop"in summarize
Next: Learn about R ggplot2 Visualization — the grammar of graphics.