R String Functions — Advanced Text Manipulation
Learning Objectives
By the end of this tutorial, you will be able to:
- Use base R string functions for manipulation
- Apply stringr package for consistent string operations
- Write regular expressions for pattern matching
- Perform advanced text processing tasks
Base R String Functions
Character Vector Operations
# Length — nchar() counts characters and is vectorised over its input
nchar("hello") # [1] 5
nchar(c("a", "bb", "ccc")) # [1] 1 2 3
# Case conversion
toupper("hello") # [1] "HELLO"
tolower("HELLO") # [1] "hello"
# Substring — substr() takes start and stop positions; substring() runs to
# the end of the string when no stop is given
substr("hello world", 1, 5) # [1] "hello"
substring("hello world", 7) # [1] "world"
# Split — the result is a list with one character vector per input string
strsplit("a,b,c", ",") # [[1]] "a" "b" "c"
# Concatenate — paste() separates with a space by default, paste0() uses no
# separator, and collapse joins a vector into a single string
paste("hello", "world") # [1] "hello world"
paste0("hello", "world") # [1] "helloworld"
paste(c("a", "b", "c"), collapse = ", ") # [1] "a, b, c"
Pattern Matching
# grep — indices of the elements that match
grep("e", c("hello", "world", "test")) # [1] 1 3
# grepl — one logical value per element
grepl("e", c("hello", "world", "test")) # [1] TRUE FALSE TRUE
# sub — replace only the first match in each string
sub("o", "0", "hello world") # [1] "hell0 world"
# gsub — replace every match in each string
gsub("o", "0", "hello world") # [1] "hell0 w0rld"
# regexpr — position of the first match; the printed result also shows a
# "match.length" attribute
regexpr("o", "hello") # [1] 5 (plus match.length attributes)
stringr Package
library(stringr)
Core Functions
# String length
str_length("hello") # [1] 5
# Concatenation
str_c("hello", "world", sep = " ") # [1] "hello world"
# Substring — negative positions count back from the end of the string,
# so -5 to -1 selects the last five characters
str_sub("hello world", 1, 5) # [1] "hello"
str_sub("hello world", -5, -1) # [1] "world"
# Split — returns a list with one element per input string
str_split("a,b,c", ",")
# [[1]]
# [1] "a" "b" "c"
# Trim whitespace from both ends
str_trim(" hello ") # [1] "hello"
# Pad to a minimum width of 10; padding is added on the left by default
str_pad("hi", 10, pad = " ") # [1] "        hi"
# Duplicate
str_dup("ab", 3) # [1] "ababab"
# Truncate — the width of 10 includes the "..." ellipsis
str_trunc("long string here", 10) # [1] "long st..."
Pattern Matching
# Detect — one logical value per input string
str_detect(c("apple", "banana", "cherry"), "an")
# [1] FALSE TRUE FALSE
# Count — number of matches in each string
str_count("mississippi", "s") # [1] 4
# Locate — start and end position of the first match
str_locate("hello world", "world")
# start end
# [1,] 7 11
# Extract — first match only
str_extract("Order #12345", "\\d+") # [1] "12345"
# Extract all — returns a list with one character vector per input string
str_extract_all("Order #12345 shipped #67890", "\\d+")
# [[1]]
# [1] "12345" "67890"
# Replace — str_replace() changes the first match, str_replace_all() every match
str_replace("hello world", "world", "R") # [1] "hello R"
str_replace_all("hello", "l", "L") # [1] "heLLo"
# Match — column 1 is the full match, the remaining columns are the
# capture groups
str_match("2024-01-15", "(\\d{4})-(\\d{2})-(\\d{2})")
# [,1] [,2] [,3] [,4]
# [1,] "2024-01-15" "2024" "01" "15"
# Starts/ends with — the stringr functions are str_starts() and str_ends()
str_starts(c("hello", "world"), "h") # [1] TRUE FALSE
str_ends(c("hello", "world"), "o") # [1] TRUE FALSE
Regular Expressions
Basic Patterns
# "." matches any single character
grepl("h.llo", "hello") # [1] TRUE
# Anchors: "^" pins the match to the start of the string, "$" to the end
grepl("^hello", "hello world") # [1] TRUE
grepl("world$", "hello world") # [1] TRUE
# Character classes: [...] matches one listed character, [^...] one that
# is not listed
grepl("[aeiou]", "hello") # [1] TRUE (has vowel)
grepl("[^aeiou]", "hello") # [1] TRUE (has non-vowel)
# Quantifiers apply to the preceding item (here the "o")
grepl("lo+", "hello") # [1] TRUE (one or more o)
grepl("lo*", "hell") # [1] TRUE (zero or more o)
grepl("lo?", "hell") # [1] TRUE (zero or one o)
# Escape special characters: the regex \. is written "\\." in an R string
grepl("\\.", "hello.world") # [1] TRUE (literal dot)
Common Regex Patterns
| Pattern | Description | Example |
|---|---|---|
| \\d | Digit | "5" in "a5b" |
| \\w | Word character | "h" in "hello" |
| \\s | Whitespace | " " in "a b" |
| [abc] | Character class | "a" in "apple" |
| [^abc] | Negated class | "d" in "abcd" |
| . | Any character | "e" in "hello" |
| ^ | Start of string | "h" in "hello" |
| $ | End of string | "o" in "hello" |
| * | Zero or more | "ll" in "hello" |
| + | One or more | "ll" in "hello" |
| ? | Zero or one | "h" in "hello" |
| {n} | Exactly n times | "ll" in "hello" |
| {n,m} | Between n and m | "ll" in "hello" |
| \\b | Word boundary | "h" in "hello" |
| \| | Alternation | "cat" or "dog" with cat\|dog |
| () | Grouping | Captures |
Practical Examples
# Email validation — a simplified pattern: local part, "@", domain, then a
# dot followed by at least two letters; anchored at both ends
email <- "user@example.com"
grepl("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", email)
# [1] TRUE
# Phone number — accepts only the exact "(ddd) ddd-dddd" layout; the
# parentheses are escaped because they are regex metacharacters
phone <- "(555) 123-4567"
grepl("^\\(\\d{3}\\) \\d{3}-\\d{4}$", phone)
# [1] TRUE
# URL — checks the scheme and host only; there is no "$" anchor, so any
# path after the host is accepted
url <- "https://www.example.com/path"
grepl("^https?://[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", url)
# [1] TRUE
# Extract numbers — gregexpr() finds every match, regmatches() pulls out
# the matched text, and [[1]] takes the result for the single input string
text <- "I have 3 cats and 5 dogs"
as.integer(regmatches(text, gregexpr("\\d+", text))[[1]])
# [1] 3 5
# Extract words — runs of word characters delimited by word boundaries
words <- "hello world foo bar"
str_extract_all(words, "\\b\\w+\\b")
# [[1]]
# [1] "hello" "world" "foo" "bar"
Practical Examples
Example 1: Clean Data
library(stringr)
# Raw names with stray whitespace and inconsistent capitalisation
raw <- c(" Alice Smith ", "BOB JONES", "charlie brown")
# Trim both ends first, then capitalise the first letter of every word
clean <- str_to_title(str_trim(raw))
clean
# [1] "Alice Smith" "Bob Jones" "Charlie Brown"
Example 2: Parse CSV Line
library(stringr)
# A single comma-separated record
line <- "John,Doe,30,New York"
# simplify = TRUE returns a character matrix (one row per input string)
# instead of a list
fields <- str_split(string = line, pattern = ",", simplify = TRUE)
# Build a one-row data frame, converting the age field to integer
data.frame(
  first = fields[[1]],
  last = fields[[2]],
  age = as.integer(fields[[3]]),
  city = fields[[4]]
)
Example 3: Extract Domain
library(stringr)
emails <- c("alice@gmail.com", "bob@yahoo.com", "charlie@company.org")
# The look-behind (?<=@) starts the match right after the "@"; [^.]+ then
# takes everything up to the first dot
domains <- str_extract(string = emails, pattern = "(?<=@)[^.]+")
domains
# [1] "gmail" "yahoo" "company"
Example 4: Word Count
library(stringr)
#' Count whitespace-separated words
#'
#' @param text A character vector.
#' @return An integer vector giving, for each element of `text`, the number
#'   of runs of non-whitespace characters.
count_words <- function(text) {
  str_count(string = text, pattern = "\\S+")
}
count_words("hello world") # [1] 2
# Leading and trailing whitespace does not add words
count_words(" one two three ") # [1] 3
Common Mistakes
1. Forgetting escape characters
# Wrong: an unescaped "." is a wildcard, so this is TRUE even though the
# string contains no dot
grepl(".", "hello") # [1] TRUE (matches anything)
# Right: "\\." matches only a literal dot
grepl("\\.", "hello.world") # [1] TRUE (literal dot)
2. Case sensitivity
# Wrong: matching is case-sensitive by default, so "hello" does not match
# "Hello"
grepl("hello", "Hello World") # [1] FALSE
# Right: ignore.case = TRUE makes the match case-insensitive
grepl("hello", "Hello World", ignore.case = TRUE) # [1] TRUE
3. Not handling NA
x <- c("hello", NA, "world")
# The missing element comes back as NA, not as the string "NA"
str_to_upper(x) # [1] "HELLO" NA "WORLD"
# Better: transform only the non-missing elements, which makes the NA
# handling explicit
x[!is.na(x)] <- str_to_upper(x[!is.na(x)])
Practice Exercises
Exercise 1: Email Extractor
Write a function that extracts all email addresses from a text.
Solution
library(stringr)
#' Extract e-mail addresses from a string
#'
#' @param text A character string to search. Only the matches for the first
#'   element are returned.
#' @return A character vector with every e-mail-like match found in the
#'   first element of `text`.
extract_emails <- function(text) {
  email_pattern <- "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"
  matches <- str_extract_all(text, email_pattern)
  # str_extract_all() returns a list; take the matches for the first string
  matches[[1]]
}
text <- "Contact alice@example.com or bob@yahoo.com for info"
extract_emails(text)
# [1] "alice@example.com" "bob@yahoo.com"
Exercise 2: Camel Case Converter
Write a function that converts snake_case to camelCase.
Solution
library(stringr)
#' Convert a snake_case string to camelCase
#'
#' @param snake A single snake_case string, e.g. "my_variable_name".
#' @return A single string with the underscores removed and every word
#'   after the first capitalised; the first word is left as written.
to_camel <- function(snake) {
  parts <- str_split(snake, "_")[[1]]
  # Title-case every word except the first: camelCase starts lower-case.
  # For a single-word input parts[-1] is empty and this is a no-op.
  parts[-1] <- str_to_title(parts[-1])
  paste0(parts, collapse = "")
}
to_camel("hello_world") # [1] "helloWorld"
to_camel("my_variable_name") # [1] "myVariableName"
Key Takeaways
- Base R has basic string functions — nchar(), substr(), paste(), strsplit()
- stringr provides consistency — all functions start with str_
- Regular expressions are powerful — learn \\d, \\w, \\s, [], ()
- str_extract() and str_match() pull out matches
- str_replace() and str_replace_all() substitute patterns
- Always handle NA in string operations
- Use ignore.case = TRUE for case-insensitive matching
- Practice regex with online tools like regex101.com
Next: Learn about R Date and Time — temporal data manipulation.