R Data Manipulation with dplyr — Grammar of Data

R Data ScienceData ManipulationFree Lesson

Advertisement

R Data Manipulation with dplyr — Grammar of Data

Learning Objectives

By the end of this tutorial, you will be able to:

  • Use filter() and slice() for row selection
  • Apply select() and rename() for column operations
  • Create and modify columns with mutate() and transmute()
  • Sort data with arrange()
  • Summarize data with summarize() and group_by()
  • Use window functions for running calculations

What Is dplyr?

dplyr is a grammar of data manipulation — a consistent set of functions that solve the most common data manipulation challenges.

library(dplyr)

Core Verbs

filter() — Row Selection

# Filter rows by condition
mtcars |> filter(cyl == 6)

# Multiple conditions (AND)
mtcars |> filter(cyl == 6, mpg > 20)

# Multiple conditions (OR)
mtcars |> filter(cyl == 6 | cyl == 8)

# Not equal
mtcars |> filter(cyl != 4)

# Range
mtcars |> filter(mpg >= 20, mpg <= 30)

# %in% operator
mtcars |> filter(cyl %in% c(4, 6))

# Using which()
mtcars |> filter(which.max(mpg))

select() — Column Selection

# Select specific columns
mtcars |> select(mpg, cyl, hp)

# Select range
mtcars |> select(mpg:hp)

# Exclude columns
mtcars |> select(-mpg, -cyl)

# Select by pattern
mtcars |> select(starts_with("c"))
mtcars |> select(ends_with("p"))
mtcars |> select(contains("ar"))
mtcars |> select(matches("^c"))

# Rename while selecting
mtcars |> select(miles_per_gallon = mpg, cylinders = cyl)

# Everything else
mtcars |> select(mpg, everything())

mutate() — Create/Modify Columns

# Add new column
mtcars |> mutate(
  kpg = mpg * 1.60934,
  hp_per_cyl = hp / cyl
)

# Modify existing column
mtcars |> mutate(
  mpg = mpg * 1.1  # 10% increase
)

# Conditional column
mtcars |> mutate(
  efficiency = case_when(
    mpg > 25 ~ "High",
    mpg > 15 ~ "Medium",
    TRUE ~ "Low"
  )
)

# transmute() — only keep new columns
mtcars |> transmute(
  name = rownames(mtcars),
  kpg = mpg * 1.60934
)

arrange() — Sorting

# Sort ascending
mtcars |> arrange(mpg)

# Sort descending
mtcars |> arrange(desc(mpg))

# Multiple columns
mtcars |> arrange(cyl, desc(mpg))

# With NA handling
mtcars |> arrange(desc(mpg), .na_last = TRUE)

summarize() — Aggregation

# Single summary
mtcars |> summarize(
  avg_mpg = mean(mpg),
  sd_mpg = sd(mpg),
  n = n()
)

# With group_by
mtcars |>
  group_by(cyl) |>
  summarize(
    avg_mpg = mean(mpg),
    avg_hp = mean(hp),
    n = n()
  )

# Multiple grouping variables
mtcars |>
  group_by(cyl, gear) |>
  summarize(
    avg_mpg = mean(mpg),
    n = n(),
    .groups = "drop"
  )

Grouped Operations

# group_by + summarize
mtcars |>
  group_by(cyl) |>
  summarize(
    across(where(is.numeric), mean),
    n = n()
  )

# group_by + mutate
mtcars |>
  group_by(cyl) |>
  mutate(
    mpg_rank = rank(desc(mpg)),
    mpg_z = (mpg - mean(mpg)) / sd(mpg)
  )

# group_by + filter
mtcars |>
  group_by(cyl) |>
  filter(mpg == max(mpg))

# ungroup
mtcars |>
  group_by(cyl) |>
  summarize(avg = mean(mpg)) |>
  ungroup()

Window Functions

# Ranking
mtcars |>
  group_by(cyl) |>
  mutate(
    rank = rank(desc(mpg)),
    dense_rank = dense_rank(desc(mpg)),
    percent_rank = percent_rank(desc(mpg)),
    row_number = row_number(desc(mpg))
  )

# Running totals
mtcars |>
  group_by(cyl) |>
  arrange(mpg) |>
  mutate(
    cumulative_hp = cumsum(hp),
    running_mean = cummean(mpg)
  )

# Lead and lag
mtcars |>
  group_by(cyl) |>
  arrange(mpg) |>
  mutate(
    prev_mpg = lag(mpg),
    next_mpg = lead(mpg),
    mpg_diff = mpg - lag(mpg)
  )

# Cumulative statistics
mtcars |>
  group_by(cyl) |>
  arrange(mpg) |>
  mutate(
    cum_max = cummax(mpg),
    cum_min = cummin(mpg),
    cum_any = cumany(mpg > 25),
    cum_all = cumall(mpg > 10)
  )

Joins

# Create sample data
authors <- tibble(
  id = c(1, 2, 3),
  name = c("Alice", "Bob", "Charlie")
)

books <- tibble(
  id = c(1, 2, 2, 4),
  title = c("Book A", "Book B", "Book C", "Book D")
)

# Inner join
authors |> inner_join(books, by = "id")

# Left join
authors |> left_join(books, by = "id")

# Right join
authors |> right_join(books, by = "id")

# Full join
authors |> full_join(books, by = "id")

# Anti join (no match)
authors |> anti_join(books, by = "id")

# Cross join
authors |> cross_join(books)

Binding

# Bind rows
df1 <- tibble(x = 1:3, y = letters[1:3])
df2 <- tibble(x = 4:6, y = letters[4:6])
bind_rows(df1, df2)

# Bind columns
df1 <- tibble(x = 1:3)
df2 <- tibble(y = 4:6)
bind_cols(df1, df2)

across() — Multiple Columns

# Apply function to multiple columns
mtcars |>
  summarize(across(where(is.numeric), mean))

# Specific columns
mtcars |>
  summarize(across(c(mpg, cyl, hp), mean))

# With custom function
mtcars |>
  summarize(across(where(is.numeric), list(
    mean = mean,
    sd = sd
  )))

# In mutate
mtcars |>
  mutate(across(where(is.numeric), scale))

Practical Examples

Example 1: Sales Analysis

sales <- tibble(
  date = sample(seq(as.Date("2024-01-01"), as.Date("2024-12-31"), by = "day"), 1000),
  product = sample(c("A", "B", "C"), 1000, replace = TRUE),
  region = sample(c("North", "South", "East", "West"), 1000, replace = TRUE),
  revenue = runif(1000, 100, 1000)
)

# Monthly revenue by product
monthly <- sales |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month, product) |>
  summarize(total = sum(revenue), .groups = "drop")

# Top product per region
top_products <- sales |>
  group_by(region, product) |>
  summarize(total = sum(revenue), .groups = "drop") |>
  group_by(region) |>
  filter(total == max(total))

Practice Exercises

Exercise 1: Data Summary

Using the diamonds dataset from ggplot2, summarize average price by cut and color.

Solution

library(ggplot2)

diamonds |>
  group_by(cut, color) |>
  summarize(
    avg_price = mean(price),
    n = n(),
    .groups = "drop"
  )

Key Takeaways

  • dplyr verbs are intuitive: filter, select, mutate, arrange, summarize
  • Pipe |> chains operations — read top to bottom
  • group_by() enables grouped operations — like SQL GROUP BY
  • Window functions compute across rows within groups
  • across() applies functions to multiple columns
  • Joins combine datasets — inner, left, right, full, anti
  • Always end with .groups = "drop" in summarize

Next: Learn about R ggplot2 Visualization — the grammar of graphics.

Advertisement

Need Expert R Programming Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement