c_topic_modeling.R
In textmineR: Functions for Text Mining and Topic Modeling

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(textmineR)

# load nih_sample data set from textmineR
data(nih_sample)

str(nih_sample)

# create a document term matrix 
dtm <- CreateDtm(doc_vec = nih_sample$ABSTRACT_TEXT, # character vector of documents
                 doc_names = nih_sample$APPLICATION_ID, # document names
                 ngram_window = c(1, 2), # minimum and maximum n-gram length
                 stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
                                  stopwords::stopwords(source = "smart")), # this is the default value
                 lower = TRUE, # lowercase - this is the default value
                 remove_punctuation = TRUE, # punctuation - this is the default
                 remove_numbers = TRUE, # numbers - this is the default
                 verbose = FALSE, # Turn off status bar for this demo
                 cpus = 2) # default is all available cpus on the system

dtm <- dtm[,colSums(dtm) > 2]


## -----------------------------------------------------------------------------

# Fit a Latent Dirichlet Allocation model
# note the number of topics is arbitrary here
# see extensions for more info

set.seed(12345)

model <- FitLdaModel(dtm = dtm, 
                     k = 20,
                     iterations = 200, # I usually recommend at least 500 iterations or more
                     burnin = 180,
                     alpha = 0.1,
                     beta = 0.05,
                     optimize_alpha = TRUE,
                     calc_likelihood = TRUE,
                     calc_coherence = TRUE,
                     calc_r2 = TRUE,
                     cpus = 2) 


## -----------------------------------------------------------------------------

str(model)


## -----------------------------------------------------------------------------

# R-squared 
# - only works for probabilistic models like LDA and CTM
model$r2

# log Likelihood (does not consider the prior) 
plot(model$log_likelihood, type = "l")

## ----fig.width = 7.5, fig.height = 4------------------------------------------
# probabilistic coherence, a measure of topic quality
# this measure can be used with any topic model, not just probabilistic ones
summary(model$coherence)

hist(model$coherence, 
     col= "blue", 
     main = "Histogram of probabilistic coherence")

## -----------------------------------------------------------------------------
# Get the top terms of each topic
model$top_terms <- GetTopTerms(phi = model$phi, M = 5)

## ----eval = FALSE-------------------------------------------------------------
#  head(t(model$top_terms)

## ---- echo = FALSE------------------------------------------------------------
knitr::kable(head(t(model$top_terms)), 
             col.names = rep("", nrow(model$top_terms)))

## -----------------------------------------------------------------------------
# Get the prevalence of each topic
# You can make this discrete by applying a threshold, say 0.05, for
# topics in/out of docuemnts. 
model$prevalence <- colSums(model$theta) / sum(model$theta) * 100

# prevalence should be proportional to alpha
plot(model$prevalence, model$alpha, xlab = "prevalence", ylab = "alpha")

# textmineR has a naive topic labeling tool based on probable bigrams
model$labels <- LabelTopics(assignments = model$theta > 0.05, 
                            dtm = dtm,
                            M = 1)

head(model$labels)

# put them together, with coherence into a summary table
model$summary <- data.frame(topic = rownames(model$phi),
                            label = model$labels,
                            coherence = round(model$coherence, 3),
                            prevalence = round(model$prevalence,3),
                            top_terms = apply(model$top_terms, 2, function(x){
                              paste(x, collapse = ", ")
                            }),
                            stringsAsFactors = FALSE)


## ----eval = FALSE-------------------------------------------------------------
#  model$summary[ order(model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ]

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(model$summary[ order(model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ], caption = "Summary of 10 most prevalent topics")


## ----fig.width = 7.5, fig.height = 4------------------------------------------

# predictions with gibbs
assignments <- predict(model, dtm,
                       method = "gibbs", 
                       iterations = 200,
                       burnin = 180,
                       cpus = 2)

# predictions with dot
assignments_dot <- predict(model, dtm,
                           method = "dot")


# compare
barplot(rbind(assignments[10,], assignments_dot[10,]),
        col = c("red", "blue"), las = 2, beside = TRUE)
legend("topright", legend = c("gibbs", "dot"), col = c("red", "blue"), 
       fill = c("red", "blue"))

## -----------------------------------------------------------------------------

# get a tf-idf matrix
tf_sample <- TermDocFreq(dtm)

tf_sample$idf[ is.infinite(tf_sample$idf) ] <- 0 # fix idf for missing words

tf_idf <- t(dtm / rowSums(dtm)) * tf_sample$idf

tf_idf <- t(tf_idf)

# Fit a Latent Semantic Analysis model
# note the number of topics is arbitrary here
# see extensions for more info
lsa_model <- FitLsaModel(dtm = tf_idf, 
                     k = 100)

# objects: 
# sv = a vector of singular values created with SVD
# theta = distribution of topics over documents
# phi = distribution of words over topics
# gamma = predition matrix, distribution of topics over words
# coherence = coherence of each topic
# data = data used to train model
str(lsa_model)

## ----fig.width = 7.5, fig.height = 4------------------------------------------
# probabilistic coherence, a measure of topic quality
# - can be used with any topic lsa_model, e.g. LSA

summary(lsa_model$coherence)

hist(lsa_model$coherence, col= "blue")

# Get the top terms of each topic
lsa_model$top_terms <- GetTopTerms(phi = lsa_model$phi, M = 5)

## ----eval = FALSE-------------------------------------------------------------
#  head(t(lsa_model$top_terms))

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(t(lsa_model$top_terms)), 
             col.names = rep("", nrow(lsa_model$top_terms)))

## -----------------------------------------------------------------------------

# Get the prevalence of each topic
# You can make this discrete by applying a threshold, say 0.05, for
# topics in/out of docuemnts. 
lsa_model$prevalence <- colSums(lsa_model$theta) / sum(lsa_model$theta) * 100

# textmineR has a naive topic labeling tool based on probable bigrams
lsa_model$labels <- LabelTopics(assignments = lsa_model$theta > 0.05, 
                            dtm = dtm,
                            M = 1)

## ----eval = FALSE-------------------------------------------------------------
#  head(lsa_model$labels)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(lsa_model$labels))


## -----------------------------------------------------------------------------

# put them together, with coherence into a summary table
lsa_model$summary <- data.frame(topic = rownames(lsa_model$phi),
                            label = lsa_model$labels,
                            coherence = round(lsa_model$coherence, 3),
                            prevalence = round(lsa_model$prevalence,3),
                            top_terms = apply(lsa_model$top_terms, 2, function(x){
                              paste(x, collapse = ", ")
                            }),
                            stringsAsFactors = FALSE)

## ----eval = FALSE-------------------------------------------------------------
#  lsa_model$summary[ order(lsa_model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ]

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(lsa_model$summary[ order(lsa_model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ], caption = "Summary of 10 most prevalent LSA topics")


## -----------------------------------------------------------------------------
# Get topic predictions for all 5,000 documents

# set up the assignments matrix and a simple dot product gives us predictions
lsa_assignments <- t(dtm / rowSums(dtm)) * tf_sample$idf

lsa_assignments <- t(lsa_assignments)

lsa_assignments <- predict(lsa_model, lsa_assignments)



## ----fig.width = 7.5, fig.height = 4------------------------------------------
# compare the "fit" assignments to the predicted ones
barplot(rbind(lsa_model$theta[ rownames(dtm)[ 1 ] , ],
              lsa_assignments[ rownames(dtm)[ 1 ] , ]), 
        las = 2,
        main = "Comparing topic assignments in LSA",
        beside = TRUE,
        col = c("red", "blue"))

legend("topleft", 
       legend = c("During fitting", "Predicted"),
       fill = c("red", "blue"))


## ----fig.width = 7.5, fig.height = 4------------------------------------------
# load a sample DTM
data(nih_sample_dtm)

# choose a range of k 
# - here, the range runs into the corpus size. Not recommended for large corpora!
k_list <- seq(10,85, by=15)

# you may want toset up a temporary directory to store fit models so you get 
# partial results if the process fails or times out. This is a trivial example, 
# but with a decent sized corpus, the procedure can take hours or days, 
# depending on the size of the data and complexity of the model.
# I suggest using the digest package to create a hash so that it's obvious this 
# is a temporary directory
model_dir <- paste0("models_", digest::digest(colnames(nih_sample_dtm), algo = "sha1"))

# Fit a bunch of LDA models
# even on this trivial corpus, it will take a bit of time to fit all of these models
model_list <- TmParallelApply(X = k_list, FUN = function(k){

  m <- FitLdaModel(dtm = nih_sample_dtm, 
                   k = k, 
                   iterations = 200, 
                   burnin = 180,
                   alpha = 0.1,
                   beta = colSums(nih_sample_dtm) / sum(nih_sample_dtm) * 100,
                   optimize_alpha = TRUE,
                   calc_likelihood = FALSE,
                   calc_coherence = TRUE,
                   calc_r2 = FALSE,
                   cpus = 1)
  m$k <- k
  
  m
}, export= ls(), # c("nih_sample_dtm"), # export only needed for Windows machines
cpus = 2) 

# Get average coherence for each model
coherence_mat <- data.frame(k = sapply(model_list, function(x) nrow(x$phi)), 
                            coherence = sapply(model_list, function(x) mean(x$coherence)), 
                            stringsAsFactors = FALSE)


# Plot the result
# On larger (~1,000 or greater documents) corpora, you will usually get a clear peak
plot(coherence_mat, type = "o")
Any scripts or data that you put into this service are public.
textmineR documentation built on June 28, 2021, 9:08 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
textmineR
Functions for Text Mining and Topic Modeling

inst/doc/c_topic_modeling.R
In textmineR: Functions for Text Mining and Topic Modeling

Try the textmineR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

textmineR Functions for Text Mining and Topic Modeling

inst/doc/c_topic_modeling.R In textmineR: Functions for Text Mining and Topic Modeling

Try the textmineR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

textmineR
Functions for Text Mining and Topic Modeling

inst/doc/c_topic_modeling.R
In textmineR: Functions for Text Mining and Topic Modeling