R Statistical Functions — Descriptive Statistics
Learning Objectives
By the end of this tutorial, you will be able to:
- Calculate measures of central tendency (mean, median, mode)
- Compute measures of dispersion (variance, SD, IQR, range)
- Create frequency tables and cross-tabulations
- Calculate correlation and covariance
- Use `summary()` and the `psych` package
Measures of Central Tendency
# Sample data: seven observations in which 30 occurs three times
x <- c(10, 20, 30, 40, 50, 30, 30)

# Mean
mean(x) # [1] 30
mean(x, trim = 0.1) # Trimmed mean: trims 10% of the observations from each end
mean(x, na.rm = TRUE) # Drop NA values before averaging

# Median
median(x) # [1] 30

# Mode (base R has no statistical mode function; mode() reports storage mode)
#' Most frequent value of a vector
#'
#' @param x An atomic vector.
#' @param na.rm If `TRUE`, missing values are dropped before counting.
#'   Defaults to `FALSE`, in which case `NA` is counted like any other value
#'   and can itself be returned as the mode.
#' @return The most frequent value of `x`. Ties go to the value that appears
#'   first in `x`; an empty input yields `NA`.
get_mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # match() maps every element to its position in ux; tabulate() counts them
  ux[which.max(tabulate(match(x, ux)))]
}
get_mode(x) # [1] 30

# Quantiles (default algorithm, type 7)
quantile(x) # 0% 25% 50% 75% 100% -> 10 25 30 35 50
quantile(x, probs = c(0.1, 0.5, 0.9)) # 10% 50% 90% -> 16 30 44

# Weighted mean
weights <- c(1, 2, 3, 4, 5, 3, 3)
weighted.mean(x, weights) # [1] 34.7619
Measures of Dispersion
# Sample data
x <- c(10, 20, 30, 40, 50)

# Range
range(x) # [1] 10 50
diff(range(x)) # [1] 40

# Variance (sample variance, n - 1 denominator)
var(x) # [1] 250

# Standard Deviation
sd(x) # [1] 15.81139

# Median Absolute Deviation (mad() scales by the constant 1.4826 by default)
mad(x) # [1] 14.826

# Interquartile Range
IQR(x) # [1] 20

# Coefficient of Variation
#' Coefficient of variation, in percent
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are dropped first. Defaults to
#'   `FALSE`, so any `NA` in `x` makes the result `NA`.
#' @return The standard deviation as a percentage of the mean.
cv <- function(x, na.rm = FALSE) {
  sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm) * 100
}
cv(x) # [1] 52.70463

# Standard Error
#' Standard error of the mean
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are dropped first. Defaults to
#'   `FALSE`, so any `NA` in `x` makes the result `NA`.
#' @return The standard deviation divided by the square root of the number of
#'   observations used.
se <- function(x, na.rm = FALSE) {
  if (na.rm) {
    # Drop NAs up front so length() counts only the observations used
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}
se(x) # [1] 7.071068
Five-Number Summary
# 100 random draws from a normal distribution; no seed is set here, so the
# numbers differ on every run
x <- rnorm(100, mean = 50, sd = 10)
# Base R
fivenum(x) # Tukey's five numbers: min, lower hinge, median, upper hinge, max
summary(x) # min, Q1, median, mean, Q3, max
# With psych package
library(psych)
describe(x)
# Illustrative output only: the values change with each random sample.
# NOTE(review): the columns below look abridged; describe() may print more
# columns, in a different order -- confirm against the installed psych version.
# vars n mean median sd min max range se
# X1 1 100 50.12 50.34 9.87 25.4 74.3 48.9 0.99
Frequency Tables
# Simple table: counts of each distinct value
x <- c("A", "B", "A", "C", "B", "A", "B", "B")
table(x)
# x
# A B C
# 3 4 1
# Proportions: counts divided by the total number of observations
prop.table(table(x))
# x
# A B C
# 0.375 0.500 0.125
# Percentage, rounded to one decimal place
round(prop.table(table(x)) * 100, 1) # A 37.5, B 50.0, C 12.5
# Cross-tabulation: rows are gender, columns are prefer
gender <- c("M", "F", "M", "F", "M", "F", "M", "F")
prefer <- c("A", "B", "A", "B", "A", "A", "B", "B")
table(gender, prefer)
# prefer
# gender A B
# F 1 3
# M 3 1
# Marginal totals: adds a Sum row and a Sum column
addmargins(table(gender, prefer))
# Joint proportions: each cell divided by the grand total, so all cells sum to 1.
# Without a margin argument no row or column proportions are computed.
prop.table(table(gender, prefer))
Correlation and Covariance
# Correlation
# x and y are paired observations of equal length
x <- c(1, 2, 3, 4, 5)
y <- c(2, 4, 5, 4, 5)
cor(x, y) # [1] 0.7745967
# Different methods
cor(x, y, method = "pearson") # Default
cor(x, y, method = "spearman") # Rank-based
cor(x, y, method = "kendall") # Kendall's tau
# Correlation matrix of the first four mtcars columns
cor(mtcars[, 1:4])
# Covariance
cov(x, y) # [1] 1.5
# Covariance matrix
cov(mtcars[, 1:4])
# Correlation with NA: "complete.obs" uses only pairs with no missing value
# (x and y above contain no NA, so the result equals cor(x, y))
cor(x, y, use = "complete.obs")
# Significance test
cor.test(x, y)
Grouped Statistics
library(dplyr)

# dplyr: one row per cylinder count with the mean, SD and group size of mpg.
# summarise() and summarize() are the same dplyr verb.
mtcars |>
  group_by(cyl) |>
  summarise(
    mean_mpg = mean(mpg),
    sd_mpg = sd(mpg),
    n = n()
  )

# tapply: group means of mpg, one per value of cyl
with(mtcars, tapply(mpg, cyl, mean))

# aggregate: formula interface, mean of mpg for each value of cyl
aggregate(mpg ~ cyl, data = mtcars, FUN = mean)

# data.table: list() in j is the long form of .()
library(data.table)
dt <- as.data.table(mtcars)
dt[, list(mean_mpg = mean(mpg), sd_mpg = sd(mpg)), by = cyl]
Summary Functions
# Base R summary of every column
summary(mtcars)

# Summary of a single variable
summary(mtcars$mpg)

# psych package
library(psych)
psych::describe(mtcars)
psych::describeBy(mtcars, group = mtcars$cyl)

# Hmisc package
# Hmisc is called through its namespace rather than attached with
# library(Hmisc): psych and Hmisc both export describe(), so attaching Hmisc
# after psych would make every later bare describe() call resolve to Hmisc's
# version (and would likewise mask dplyr's summarize()).
Hmisc::describe(mtcars)
Practical Examples
Example 1: Data Quality Report
#' Per-column data quality report
#'
#' @param df A data frame (or any list of vectors).
#' @return A list with one entry per column of `df`, named like the columns.
#'   Each entry is a list of five elements: `type` (first class of the
#'   column), `n_unique`, `n_na`, `pct_na` (percentage of missing values) and
#'   `numeric_summary`. For numeric columns `numeric_summary` is a list of
#'   `mean`, `sd`, `min` and `max` computed without missing values; for all
#'   other columns it is `NULL`.
data_quality <- function(df) {
  lapply(df, function(x) {
    list(
      type = class(x)[1],
      n_unique = length(unique(x)),
      n_na = sum(is.na(x)),
      pct_na = mean(is.na(x)) * 100,
      # Named so the statistics can be reached as $numeric_summary; it stays
      # the fifth element, so positional access keeps working.
      numeric_summary = if (is.numeric(x)) {
        # min()/max() warn and return Inf/-Inf when nothing is left after
        # dropping NAs, so report NA for columns with no observed values.
        has_values <- any(!is.na(x))
        list(
          mean = mean(x, na.rm = TRUE),
          sd = sd(x, na.rm = TRUE),
          min = if (has_values) min(x, na.rm = TRUE) else NA_real_,
          max = if (has_values) max(x, na.rm = TRUE) else NA_real_
        )
      } else {
        NULL
      }
    )
  })
}
data_quality(mtcars)
Practice Exercises
Exercise 1: Descriptive Statistics
Write a function that calculates mean, median, sd, min, max, and IQR for a numeric vector.
Solution
#' Descriptive statistics for a numeric vector
#'
#' @param x A numeric vector; missing values are ignored.
#' @return A named list with elements `mean`, `median`, `sd`, `min`, `max`
#'   and `iqr`.
desc_stats <- function(x) {
  stat_funs <- list(
    mean = mean,
    median = median,
    sd = sd,
    min = min,
    max = max,
    iqr = IQR
  )
  # Apply every statistic with missing values removed; lapply() keeps the names
  lapply(stat_funs, function(stat) stat(x, na.rm = TRUE))
}
desc_stats(mtcars$mpg)
Key Takeaways
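- `mean()`, `median()` — central tendency (base R has no statistical mode function; `mode()` returns an object's storage mode, so use a helper such as `get_mode()`)
- `sd()`, `var()`, `IQR()` — dispersion
- `summary()` — five-number summary plus mean
- `cor()` — correlation matrix
- `table()` — frequency tables
- `group_by() |> summarize()` — grouped statistics
- Use `na.rm = TRUE` to handle missing values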
Next: Learn about R Probability Distributions — random number generation.