Introduction
Text mining extracts useful information from unstructured text data. The tm package provides R functions for importing text into a corpus, cleaning it, and analyzing it.
Creating Corpus
library(tm)
# Option A: wrap the `text_data` vector in a VectorSource and build the corpus
corpus <- VCorpus(x = VectorSource(x = text_data))
# Option B: read the corpus from the files under "directory" instead.
# Both options assign to `corpus`, so if both statements run, this one wins.
corpus <- VCorpus(x = DirSource(directory = "directory"))
Text Processing
# Transformations
# Each tm_map() call applies one transformation to the corpus and the result
# is stored back in `corpus`, so the order of the steps matters.
# tolower is a plain base function, hence the content_transformer() wrapper.
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words while apostrophes are still present: entries such as
# "don't" in the English stop-word list can no longer match once punctuation
# has been stripped from the text.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Collapse the runs of whitespace left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)
# Stemming
# NOTE(review): stemDocument relies on the SnowballC package - confirm it is
# installed wherever this runs.
corpus <- tm_map(corpus, stemDocument)
Document Term Matrix
# Build the document-term matrix from the corpus and immediately prune
# sparse terms from it, using a sparsity threshold of 0.95.
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), sparse = 0.95)
Analysis
# Frequent terms: look up terms in the matrix with a lower frequency bound of 5
findFreqTerms(dtm, lowfreq = 5)
# Associations: look up terms associated with "word", with a correlation
# limit of 0.5
findAssocs(dtm, terms = "word", corlimit = 0.5)
Summary
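The tm package provides tools for text mining in R. Clean the text first (lower-casing, removing punctuation, numbers, and stop words, and stemming), then build a document-term matrix and analyze it.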