R Statistical Functions — Descriptive Statistics

Learning Objectives

By the end of this tutorial, you will be able to:

Calculate measures of central tendency (mean, median, mode)
Compute measures of dispersion (variance, SD, IQR, range)
Create frequency tables and cross-tabulations
Calculate correlation and covariance
Use the summary() function and the psych package

Measures of Central Tendency

# Sample data: seven values, with 30 occurring three times
x <- c(10, 20, 30, 40, 50, 30, 30)

# Mean
mean(x)                  # [1] 30
# trim = 0.1 drops floor(n * 0.1) values from each end before averaging;
# with n = 7 that is zero values, so this call also returns 30
mean(x, trim = 0.1)      # Trimmed mean
# na.rm = TRUE removes NA values first; x has none, so the result is unchanged
mean(x, na.rm = TRUE)    # Handle NA

# Median
median(x)                # [1] 30

# Mode (no built-in function)
# Return the most frequent value of `x`.
# When several values tie for the highest count, the one that appears
# first in `x` is returned.
get_mode <- function(x) {
  distinct_values <- unique(x)
  # Map every element to the position of its distinct value, then count
  # how many elements landed on each position.
  positions <- match(x, distinct_values)
  counts <- tabulate(positions)
  distinct_values[which.max(counts)]
}
get_mode(x)              # [1] 30

# Quantiles
# With no probs argument, quantile() returns the minimum, quartiles and maximum.
# For the seven-value x defined above the values are 10, 25, 30, 35, 50.
quantile(x)              # 0%  25%  50%  75% 100%
# Custom probabilities: 10%, 50% and 90% -> 16, 30, 44 for the same x
quantile(x, probs = c(0.1, 0.5, 0.9))

# Weighted mean
# One weight per element of x; result is sum(weights * x) / sum(weights)
weights <- c(1, 2, 3, 4, 5, 3, 3)
weighted.mean(x, weights)   # [1] 34.7619

Measures of Dispersion

x <- c(10, 20, 30, 40, 50)

# Range
range(x)                 # [1] 10 50
diff(range(x))           # [1] 40

# Variance (sample variance, denominator n - 1)
var(x)                   # [1] 250

# Standard Deviation (square root of the sample variance)
sd(x)                    # [1] 15.81139

# Median Absolute Deviation
# mad() is the median of |x - median(x)| multiplied by 1.4826 by default,
# not the mean absolute deviation
mad(x)                   # [1] 14.826

# Interquartile Range (Q3 - Q1)
IQR(x)                   # [1] 20

# Coefficient of Variation
# Returns the standard deviation of `x` as a percentage of its mean.
# Missing values are not removed, so any NA in `x` yields NA.
cv <- function(x) {
  spread <- sd(x)
  centre <- mean(x)
  spread / centre * 100
}
cv(x)                    # [1] 52.70463

# Standard Error
# Returns the standard error of the mean: sd(x) divided by sqrt(n).
# n is length(x), so NA elements are counted and are not removed.
se <- function(x) {
  n_obs <- length(x)
  sd(x) / sqrt(n_obs)
}
se(x)                    # [1] 7.071068

Five-Number Summary

# 100 random draws from a normal distribution (mean 50, sd 10).
# Results differ on every run unless set.seed() is called first.
x <- rnorm(100, mean = 50, sd = 10)

# Base R
# fivenum() reports Tukey's hinges for the middle-quartile positions; these
# can differ slightly from the quantile()-based quartiles shown by summary()
fivenum(x)               # min, Q1, median, Q3, max
summary(x)               # min, Q1, median, mean, Q3, max

# With psych package
library(psych)
describe(x)
# Illustrative, abridged output: the numbers vary because x is random, and
# psych's describe() prints further columns (e.g. skew, kurtosis).
#    vars   n  mean median   sd  min  max range  se
# X1    1 100 50.12  50.34 9.87 25.4 74.3  48.9 0.99

Frequency Tables

# Simple table
x <- c("A", "B", "A", "C", "B", "A", "B", "B")
table(x)
# x
# A B C
# 3 4 1

# Proportions (each count divided by the total number of observations)
prop.table(table(x))
# x
#     A     B     C
# 0.375 0.500 0.125

# Percentage (proportions scaled to 0-100, rounded to one decimal place)
round(prop.table(table(x)) * 100, 1)

# Cross-tabulation (rows = gender, columns = prefer)
gender <- c("M", "F", "M", "F", "M", "F", "M", "F")
prefer <- c("A", "B", "A", "B", "A", "A", "B", "B")
table(gender, prefer)
#        prefer
# gender A B
#       F 1 3
#       M 3 1

# Marginal totals (adds a "Sum" row and column)
addmargins(table(gender, prefer))

# Joint proportions (each cell divided by the grand total).
# For row or column proportions, pass margin = 1 or margin = 2 to prop.table().
prop.table(table(gender, prefer))

Correlation and Covariance

# Correlation
x <- c(1, 2, 3, 4, 5)
y <- c(2, 4, 5, 4, 5)

# Pearson correlation: 6 / sqrt(10 * 6) for these vectors
cor(x, y)                # [1] 0.7745967

# Different methods
cor(x, y, method = "pearson")   # Default
cor(x, y, method = "spearman")  # Rank-based
cor(x, y, method = "kendall")   # Kendall's tau

# Correlation matrix (pairwise correlations of the first four mtcars columns)
cor(mtcars[, 1:4])

# Covariance (sample covariance, denominator n - 1): 6 / 4 for these vectors
cov(x, y)                # [1] 1.5

# Covariance matrix
cov(mtcars[, 1:4])

# Correlation with NA
# use = "complete.obs" drops any pair in which either value is NA;
# x and y contain no NA here, so the result equals cor(x, y)
cor(x, y, use = "complete.obs")

# Significance test (tests whether the true correlation is zero)
cor.test(x, y)

Grouped Statistics

library(dplyr)

# dplyr
# One output row per distinct value of cyl, holding the mean and standard
# deviation of mpg and the number of rows in the group.
# The native pipe |> requires R 4.1 or later.
mtcars |>
  group_by(cyl) |>
  summarize(
    mean_mpg = mean(mpg),
    sd_mpg = sd(mpg),
    n = n()
  )

# tapply
# Base R: applies mean() to mpg split by cyl
tapply(mtcars$mpg, mtcars$cyl, mean)

# aggregate
# Base R formula interface: mean of mpg for each cyl
aggregate(mpg ~ cyl, data = mtcars, FUN = mean)

# data.table
# dt[i, j, by]: .() builds the list of summary columns, computed per cyl
library(data.table)
dt <- as.data.table(mtcars)
dt[, .(mean_mpg = mean(mpg), sd_mpg = sd(mpg)), by = cyl]

Summary Functions

# Base R summary
# On a data frame, summary() reports each column separately
summary(mtcars)

# By variable
summary(mtcars$mpg)

# psych package
library(psych)
describe(mtcars)
# describeBy() produces the same statistics separately for each value of cyl
describeBy(mtcars, group = mtcars$cyl)

# Hmisc package
# NOTE: Hmisc also exports describe(). Attaching it after psych masks
# psych's version, so the call below runs Hmisc's describe().
# Write psych::describe() or Hmisc::describe() to be explicit.
library(Hmisc)
describe(mtcars)

Practical Examples

Example 1: Data Quality Report

# Build a per-column data quality report.
#
# Args:
#   df: A data frame (or list of vectors); each column is summarised on its own.
#
# Returns:
#   A named list with one entry per column. Each entry is a flat named list
#   holding `type` (first class of the column), `n_unique` (distinct values;
#   NA counts as one value), `n_na`, and `pct_na`. Numeric columns also get
#   `mean`, `sd`, `min`, and `max`, computed on the non-missing values; these
#   are NA when the column has no non-missing values.
data_quality <- function(df) {
  lapply(df, function(x) {
    missing <- is.na(x)
    report <- list(
      type = class(x)[1],
      n_unique = length(unique(x)),
      n_na = sum(missing),
      pct_na = mean(missing) * 100
    )

    if (is.numeric(x)) {
      observed <- x[!missing]
      # min()/max() on an empty vector warn and return Inf/-Inf, so report NA
      # for every statistic when there is nothing to summarise.
      has_data <- length(observed) > 0
      # c() splices the statistics in as named top-level entries, so callers
      # can reach them directly (e.g. report$mean).
      report <- c(report, list(
        mean = if (has_data) mean(observed) else NA_real_,
        sd = if (has_data) sd(observed) else NA_real_,
        min = if (has_data) min(observed) else NA_real_,
        max = if (has_data) max(observed) else NA_real_
      ))
    }

    report
  })
}

data_quality(mtcars)

Practice Exercises

Exercise 1: Descriptive Statistics

Write a function that calculates mean, median, sd, min, max, and IQR for a numeric vector.

Solution

# Summarise a numeric vector.
#
# Args:
#   x: A numeric vector; missing values are dropped before each statistic.
#
# Returns:
#   A named list with elements mean, median, sd, min, max, and iqr.
desc_stats <- function(x) {
  # Each summary function is applied to x with na.rm = TRUE; the list names
  # become the names of the result.
  stat_funs <- list(
    mean = mean,
    median = median,
    sd = sd,
    min = min,
    max = max,
    iqr = IQR
  )
  lapply(stat_funs, function(stat_fn) stat_fn(x, na.rm = TRUE))
}

desc_stats(mtcars$mpg)

Key Takeaways

mean(), median(), and a custom get_mode() — central tendency (base R's mode() returns an object's storage mode, not the statistical mode)
sd(), var(), IQR() — dispersion
summary() — five-number summary plus mean
cor() — correlation matrix
table() — frequency tables
group_by() |> summarize() — grouped statistics
Use na.rm = TRUE to handle missing values

Next: Learn about R Probability Distributions — random number generation.

R Statistical Functions — Descriptive Statistics

R Statistical Functions — Descriptive Statistics

Learning Objectives

Measures of Central Tendency

Measures of Dispersion

Five-Number Summary

Frequency Tables

Correlation and Covariance

Grouped Statistics

Summary Functions

Practical Examples

Example 1: Data Quality Report

Practice Exercises

Exercise 1: Descriptive Statistics

Key Takeaways

Need Expert R Programming Help?