R Logistic Regression — Modeling Binary Outcomes

R Data Science › Logistic Regression › Free Lesson

Advertisement

R Logistic Regression — Modeling Binary Outcomes

Learning Objectives

By the end of this tutorial, you will be able to:

  • Fit binary logistic regression models
  • Interpret odds ratios and coefficients
  • Evaluate model performance (AUC, confusion matrix, accuracy)
  • Perform multinomial and ordinal logistic regression
  • Visualize logistic regression results

Binary Logistic Regression

# Binary logit: log(p / (1 - p)) = b0 + b1 * Sepal.Length,
# where p is the probability of the second factor level (versicolor).
# Keep only two iris species and rebuild the factor so the unused third
# level is dropped.
is_two_class <- iris$Species %in% c("setosa", "versicolor")
data <- iris[is_two_class, ]
data$Species <- factor(data$Species)

model <- glm(Species ~ Sepal.Length, family = binomial, data = data)

# Coefficient table, standard errors and deviances
summary(model)

# Exponentiated coefficients: odds ratio for the slope, baseline odds for the
# intercept
exp(coef(model))

# Fitted probabilities first, then hard class labels at a 0.5 cut-off
data$pred_prob <- predict(model, type = "response")
data$pred_class <- ifelse(data$pred_prob <= 0.5, "setosa", "versicolor")

Model Evaluation

# Evaluate the classifier stored in `data` (columns Species, pred_prob,
# pred_class). The predicted labels are converted to a factor that shares the
# levels of the observed outcome: caret::confusionMatrix() requires factors
# with matching levels, and table() then keeps a row even for a class that is
# never predicted.
pred_factor <- factor(data$pred_class, levels = levels(data$Species))

# Confusion matrix
table(Predicted = pred_factor, Actual = data$Species)

# Accuracy
mean(pred_factor == data$Species)

# AUC
library(pROC)
# Control/case levels and the comparison direction are stated explicitly
# rather than auto-detected, so the AUC always refers to P(versicolor).
roc_obj <- roc(data$Species, data$pred_prob,
               levels = c("setosa", "versicolor"), direction = "<")
auc(roc_obj)
plot(roc_obj)

# Multiple metrics; sensitivity/specificity are reported for versicolor, the
# class whose probability the model predicts
caret::confusionMatrix(pred_factor, data$Species, positive = "versicolor")

Multiple Predictors

# Logistic regression with several predictors.
# NOTE: in iris, setosa and versicolor are perfectly separated by the petal
# measurements, so glm() is expected to warn that fitted probabilities are
# numerically 0 or 1 and to report very large standard errors. Treat the
# coefficients below as a syntax illustration, not as meaningful estimates.
model <- glm(Species ~ Sepal.Length + Sepal.Width + Petal.Length,
             data = data, family = binomial)
summary(model)

# Stepwise selection
# The upper scope names the four measurements explicitly. `Species ~ .` would
# also include helper columns stored in `data` (such as pred_prob and
# pred_class, which are derived from the outcome) and leak the outcome into
# the candidate predictors.
model_null <- glm(Species ~ 1, data = data, family = binomial)
model_full <- glm(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = data, family = binomial
)
model_best <- step(model_null,
                   scope = list(lower = model_null, upper = model_full),
                   direction = "both", trace = 0)
summary(model_best)

Multinomial Logistic Regression

library(nnet)

# 3+ unordered categories.
# Predictors are named explicitly and the predictions are kept in their own
# vector. Writing a `pred` column into a global copy of `iris` would make
# `Species ~ .` use it as a predictor the next time the snippet is run, and
# would carry the column into every later copy of iris.
model <- multinom(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)
summary(model)

# Predictions (most probable class for each row)
pred_species <- predict(model)
table(Predicted = pred_species, Actual = iris$Species)

# Coefficients as odds ratios, each relative to the reference level (the first
# level of Species)
exp(coef(model))

Ordinal Logistic Regression

library(MASS)

# Ordered outcome: bin sepal length into three equal-width intervals.
# ordered_result = TRUE makes Size an ordered factor (Small < Medium < Large);
# without it cut() returns an unordered factor, which does not match the
# proportional-odds model fitted below.
data <- iris
data$Size <- cut(data$Sepal.Length, breaks = 3,
                 labels = c("Small", "Medium", "Large"),
                 ordered_result = TRUE)

# Hess = TRUE keeps the Hessian so summary() can report standard errors
model <- polr(Size ~ Petal.Length + Petal.Width, data = data, Hess = TRUE)
summary(model)

# Predictions (most probable size class for each row)
data$pred <- predict(model)
table(Predicted = data$pred, Actual = data$Size)

Model Diagnostics

# Diagnostics for the binary (setosa vs versicolor) model.
# The two-class data are rebuilt here because `data` may have been reassigned
# to the full three-species iris by an earlier snippet; a binomial glm on a
# three-level factor would silently model "first level vs. all others".
data <- iris[iris$Species %in% c("setosa", "versicolor"), ]
data$Species <- factor(data$Species)

model <- glm(Species ~ Sepal.Length, data = data, family = binomial)
data$pred_prob <- fitted(model)  # fitted P(versicolor) for each row

# Residuals
residuals(model, type = "response")   # Raw residuals (observed - fitted)
residuals(model, type = "pearson")    # Pearson residuals
residuals(model, type = "deviance")   # Deviance residuals

# Influential points
cooks.distance(model)

# Overall model significance: likelihood-ratio test against the
# intercept-only model (a small p-value means the predictors help)
with(model, pchisq(null.deviance - deviance, df.null - df.residual,
                   lower.tail = FALSE))

# Hosmer-Lemeshow goodness-of-fit test
# hoslem.test() needs the observed outcome as numeric 0/1 (model$y holds it),
# not a factor, together with the fitted probabilities.
library(ResourceSelection)
hoslem.test(model$y, fitted(model), g = 10)

Practical Examples

Example 1: Customer Churn

set.seed(42)
n <- 500

# Simulated customers. Columns are drawn in the order tenure, charges,
# contract so the seeded random stream is consumed in a fixed sequence.
contract_types <- c("Month-to-month", "One year", "Two year")
data <- data.frame(
  tenure = sample(1:72, size = n, replace = TRUE),
  monthly_charges = runif(n, min = 20, max = 100),
  contract = sample(contract_types, size = n, replace = TRUE)
)

# Month-to-month customers paying more than 60 churn with probability 0.7;
# everyone else churns with probability 0.2.
is_high_risk <- with(
  data,
  contract == "Month-to-month" & monthly_charges > 60
)
data$churn <- ifelse(
  is_high_risk,
  yes = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7)),
  no = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.8, 0.2))
)

# Fit the churn model, then show the summary and the odds ratios
model <- glm(churn ~ tenure + monthly_charges + contract,
             family = binomial, data = data)
summary(model)
exp(coef(model))

Practice Exercises

Exercise 1: Loan Default

Build a logistic regression model to predict loan default using income, credit score, and employment years.

Solution

set.seed(42)
n <- 300

# Simulated applicants. Columns are drawn in the order income, credit score,
# employment years so the seeded random stream is consumed in a fixed sequence.
data <- data.frame(
  income = rnorm(n, mean = 50000, sd = 15000),
  credit_score = rnorm(n, mean = 700, sd = 50),
  employment_years = sample(1:30, size = n, replace = TRUE)
)

# Applicants with both a low credit score and a low income default with
# probability 0.7; everyone else defaults with probability 0.1.
is_risky <- with(data, credit_score < 650 & income < 40000)
data$default <- ifelse(
  is_risky,
  yes = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.3, 0.7)),
  no = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.9, 0.1))
)

# Fit the default model, then show the summary and the odds ratios
model <- glm(default ~ income + credit_score + employment_years,
             family = binomial, data = data)
summary(model)
exp(coef(model))

Key Takeaways

  • glm() with family = binomial fits logistic regression
  • Coefficients are log-odds — exponentiate for odds ratios
  • An odds ratio greater than 1 means the predictor increases the odds of the outcome; an odds ratio less than 1 means it decreases them
  • Use AUC to evaluate discrimination ability
  • Confusion matrix shows classification performance
  • Multinomial logistic regression for 3+ unordered categories
  • Ordinal logistic regression for ordered categories

Next: Learn about R Time Series Analysis — analyzing temporal data.

Advertisement

Need Expert R Programming Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement