# inst/doc/d_text_embeddings.R

## ----setup, include = FALSE---------------------------------------------------
# Global knitr chunk options: collapse source/output and prefix output with "#>"
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## -----------------------------------------------------------------------------
# Load textmineR and its bundled sample of NIH grant abstracts
library(textmineR)

data(nih_sample)

# Build a term co-occurrence matrix (TCM) from skip-grams using a 10-word
# window. Most options available on CreateDtm are also available on CreateTcm.
tcm <- CreateTcm(
  doc_vec = nih_sample$ABSTRACT_TEXT,
  skipgram_window = 10,
  verbose = FALSE,
  cpus = 2
)

# A TCM is term-by-term, so it is generally larger than a DTM
dim(tcm)

## -----------------------------------------------------------------------------
# Fit an LDA model over the TCM to embed terms in a probability space.
# This takes considerably longer than fitting on a DTM, because the TCM
# has many more rows than a typical DTM.
embeddings <- FitLdaModel(
  dtm = tcm,
  k = 50,                   # number of embedding dimensions (topics)
  iterations = 200,
  burnin = 180,
  alpha = 0.1,
  beta = 0.05,
  optimize_alpha = TRUE,
  calc_likelihood = FALSE,
  calc_coherence = TRUE,
  calc_r2 = TRUE,
  cpus = 2
)

## -----------------------------------------------------------------------------
# Overall goodness of fit: R-squared of the model against the TCM
embeddings$r2

# Distribution of probabilistic coherence (relative to the TCM) across
# embedding dimensions
summary(embeddings$coherence)

## -----------------------------------------------------------------------------
# Pull the 5 highest-probability terms per dimension; no labels since the
# model has no bigrams
embeddings$top_terms <- GetTopTerms(phi = embeddings$phi, M = 5)

## -----------------------------------------------------------------------------
# Summarize each embedding dimension: coherence, prevalence (column sums of
# theta), and a comma-separated list of its top terms
embeddings$summary <- data.frame(
  topic = rownames(embeddings$phi),
  coherence = round(embeddings$coherence, 3),
  prevalence = round(colSums(embeddings$theta), 2),
  top_terms = apply(embeddings$top_terms, 2, paste, collapse = ", "),
  stringsAsFactors = FALSE
)


## ----eval = FALSE-------------------------------------------------------------
#  embeddings$summary[order(embeddings$summary$prevalence, decreasing = TRUE), ][1:10, ]

## ----echo = FALSE-------------------------------------------------------------
# Render the 10 most prevalent embedding dimensions as a table
by_prevalence <- embeddings$summary[order(embeddings$summary$prevalence, decreasing = TRUE), ]
knitr::kable(by_prevalence[1:10, ], caption = "Summary of top 10 embedding dimensions")

## ----eval = FALSE-------------------------------------------------------------
#  embeddings$summary[order(embeddings$summary$coherence, decreasing = TRUE), ][1:10, ]

## ----echo = FALSE-------------------------------------------------------------
# Render the 10 most coherent embedding dimensions as a table
by_coherence <- embeddings$summary[order(embeddings$summary$coherence, decreasing = TRUE), ]
knitr::kable(by_coherence[1:10, ], caption = "Summary of 10 most coherent embedding dimensions")

## -----------------------------------------------------------------------------
# Build a unigram DTM over the same abstracts so that documents can be
# projected into the embedding space
dtm_embed <- CreateDtm(
  doc_vec = nih_sample$ABSTRACT_TEXT,
  doc_names = nih_sample$APPLICATION_ID,
  ngram_window = c(1, 1),
  verbose = FALSE,
  cpus = 2
)

# Drop rare terms: keep only terms that occur more than twice overall
dtm_embed <- dtm_embed[, colSums(dtm_embed) > 2]

# Project the documents into the embedding space via Gibbs sampling
embedding_assignments <- predict(
  embeddings,
  dtm_embed,
  method = "gibbs",
  iterations = 200,
  burnin = 180
)

## -----------------------------------------------------------------------------
# Goodness of fit of the document projections, this time relative to the DTM.
# phi is subset to the DTM's columns so the vocabularies line up.
embeddings$r2_dtm <- CalcTopicModelR2(
  dtm = dtm_embed,
  phi = embeddings$phi[, colnames(dtm_embed)],
  theta = embedding_assignments,
  cpus = 2
)

embeddings$r2_dtm

# Probabilistic coherence of the embedding dimensions relative to the DTM
embeddings$coherence_dtm <- CalcProbCoherence(
  phi = embeddings$phi[, colnames(dtm_embed)],
  dtm = dtm_embed
)

summary(embeddings$coherence_dtm)

# Try the textmineR package in your browser
#
# Any scripts or data that you put into this service are public.
#
# textmineR documentation built on June 28, 2021, 9:08 a.m.