R Data Manipulation with dplyr — Grammar of Data

Learning Objectives

By the end of this tutorial, you will be able to:

Use filter() and slice() for row selection
Apply select() and rename() for column operations
Create and modify columns with mutate() and transmute()
Sort data with arrange()
Summarize data with summarize() and group_by()
Use window functions for running calculations

What Is dplyr?

dplyr is a grammar of data manipulation — a consistent set of functions that solve the most common data manipulation challenges.

library(dplyr)

Core Verbs

filter() — Row Selection

# Filter rows by condition
mtcars |> filter(cyl == 6)

# Multiple conditions (AND)
mtcars |> filter(cyl == 6, mpg > 20)

# Multiple conditions (OR)
mtcars |> filter(cyl == 6 | cyl == 8)

# Not equal
mtcars |> filter(cyl != 4)

# Range
mtcars |> filter(mpg >= 20, mpg <= 30)

# %in% operator
mtcars |> filter(cyl %in% c(4, 6))

# Using which()
mtcars |> filter(which.max(mpg))

select() — Column Selection

# Select specific columns
mtcars |> select(mpg, cyl, hp)

# Select range
mtcars |> select(mpg:hp)

# Exclude columns
mtcars |> select(-mpg, -cyl)

# Select by pattern
mtcars |> select(starts_with("c"))
mtcars |> select(ends_with("p"))
mtcars |> select(contains("ar"))
mtcars |> select(matches("^c"))

# Rename while selecting
mtcars |> select(miles_per_gallon = mpg, cylinders = cyl)

# Everything else
mtcars |> select(mpg, everything())

mutate() — Create/Modify Columns

# Add new column
mtcars |> mutate(
  kpg = mpg * 1.60934,
  hp_per_cyl = hp / cyl
)

# Modify existing column
mtcars |> mutate(
  mpg = mpg * 1.1  # 10% increase
)

# Conditional column
mtcars |> mutate(
  efficiency = case_when(
    mpg > 25 ~ "High",
    mpg > 15 ~ "Medium",
    TRUE ~ "Low"
  )
)

# transmute() — only keep new columns
mtcars |> transmute(
  name = rownames(mtcars),
  kpg = mpg * 1.60934
)

arrange() — Sorting

# Sort ascending
mtcars |> arrange(mpg)

# Sort descending
mtcars |> arrange(desc(mpg))

# Multiple columns
mtcars |> arrange(cyl, desc(mpg))

# With NA handling
mtcars |> arrange(desc(mpg), .na_last = TRUE)

summarize() — Aggregation

# Single summary
mtcars |> summarize(
  avg_mpg = mean(mpg),
  sd_mpg = sd(mpg),
  n = n()
)

# With group_by
mtcars |>
  group_by(cyl) |>
  summarize(
    avg_mpg = mean(mpg),
    avg_hp = mean(hp),
    n = n()
  )

# Multiple grouping variables
mtcars |>
  group_by(cyl, gear) |>
  summarize(
    avg_mpg = mean(mpg),
    n = n(),
    .groups = "drop"
  )

Grouped Operations

# group_by + summarize
mtcars |>
  group_by(cyl) |>
  summarize(
    across(where(is.numeric), mean),
    n = n()
  )

# group_by + mutate
mtcars |>
  group_by(cyl) |>
  mutate(
    mpg_rank = rank(desc(mpg)),
    mpg_z = (mpg - mean(mpg)) / sd(mpg)
  )

# group_by + filter
mtcars |>
  group_by(cyl) |>
  filter(mpg == max(mpg))

# ungroup
mtcars |>
  group_by(cyl) |>
  summarize(avg = mean(mpg)) |>
  ungroup()

Window Functions

# Ranking
mtcars |>
  group_by(cyl) |>
  mutate(
    rank = rank(desc(mpg)),
    dense_rank = dense_rank(desc(mpg)),
    percent_rank = percent_rank(desc(mpg)),
    row_number = row_number(desc(mpg))
  )

# Running totals
mtcars |>
  group_by(cyl) |>
  arrange(mpg) |>
  mutate(
    cumulative_hp = cumsum(hp),
    running_mean = cummean(mpg)
  )

# Lead and lag
mtcars |>
  group_by(cyl) |>
  arrange(mpg) |>
  mutate(
    prev_mpg = lag(mpg),
    next_mpg = lead(mpg),
    mpg_diff = mpg - lag(mpg)
  )

# Cumulative statistics
mtcars |>
  group_by(cyl) |>
  arrange(mpg) |>
  mutate(
    cum_max = cummax(mpg),
    cum_min = cummin(mpg),
    cum_any = cumany(mpg > 25),
    cum_all = cumall(mpg > 10)
  )

Joins

# Create sample data
authors <- tibble(
  id = c(1, 2, 3),
  name = c("Alice", "Bob", "Charlie")
)

books <- tibble(
  id = c(1, 2, 2, 4),
  title = c("Book A", "Book B", "Book C", "Book D")
)

# Inner join
authors |> inner_join(books, by = "id")

# Left join
authors |> left_join(books, by = "id")

# Right join
authors |> right_join(books, by = "id")

# Full join
authors |> full_join(books, by = "id")

# Anti join (no match)
authors |> anti_join(books, by = "id")

# Cross join
authors |> cross_join(books)

Binding

# Bind rows
df1 <- tibble(x = 1:3, y = letters[1:3])
df2 <- tibble(x = 4:6, y = letters[4:6])
bind_rows(df1, df2)

# Bind columns
df1 <- tibble(x = 1:3)
df2 <- tibble(y = 4:6)
bind_cols(df1, df2)

across() — Multiple Columns

# Apply function to multiple columns
mtcars |>
  summarize(across(where(is.numeric), mean))

# Specific columns
mtcars |>
  summarize(across(c(mpg, cyl, hp), mean))

# With custom function
mtcars |>
  summarize(across(where(is.numeric), list(
    mean = mean,
    sd = sd
  )))

# In mutate
mtcars |>
  mutate(across(where(is.numeric), scale))

Practical Examples

Example 1: Sales Analysis

sales <- tibble(
  date = sample(seq(as.Date("2024-01-01"), as.Date("2024-12-31"), by = "day"), 1000),
  product = sample(c("A", "B", "C"), 1000, replace = TRUE),
  region = sample(c("North", "South", "East", "West"), 1000, replace = TRUE),
  revenue = runif(1000, 100, 1000)
)

# Monthly revenue by product
monthly <- sales |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month, product) |>
  summarize(total = sum(revenue), .groups = "drop")

# Top product per region
top_products <- sales |>
  group_by(region, product) |>
  summarize(total = sum(revenue), .groups = "drop") |>
  group_by(region) |>
  filter(total == max(total))

Practice Exercises

Exercise 1: Data Summary

Using the diamonds dataset from ggplot2, summarize average price by cut and color.

Solution

library(ggplot2)

diamonds |>
  group_by(cut, color) |>
  summarize(
    avg_price = mean(price),
    n = n(),
    .groups = "drop"
  )

Key Takeaways

dplyr verbs are intuitive: filter, select, mutate, arrange, summarize
Pipe |> chains operations — read top to bottom
group_by() enables grouped operations — like SQL GROUP BY
Window functions compute across rows within groups
across() applies functions to multiple columns
Joins combine datasets — inner, left, right, full, anti
Always end with .groups = "drop" in summarize

Next: Learn about R ggplot2 Visualization — the grammar of graphics.

R Data Manipulation with dplyr — Grammar of Data

R Data Manipulation with dplyr — Grammar of Data

Learning Objectives

What Is dplyr?

Core Verbs

filter() — Row Selection

select() — Column Selection

mutate() — Create/Modify Columns

arrange() — Sorting

summarize() — Aggregation

Grouped Operations

Window Functions

Joins

Binding

across() — Multiple Columns

Practical Examples

Example 1: Sales Analysis

Practice Exercises

Exercise 1: Data Summary

Key Takeaways

Need Expert R Programming Help?