#' @title topic.coherence
#' @description Calculates topic coherence for each topic in a topic model. Uses the implementation put forward by Mimno et al. (2011) - Note, this directly uses the code in the package 'SpeedyReader'
#' @param ldaOut fitted topic model
#' @param dtm document term matrix for the fitted topic model
#' @param n.terms The number of terms in each topic to calculate topic coherence on
#' @return vector of topic coherence scores (same length as the number of topics in the model)
#' @export
# Calculate average topic coherence
topic.coherence = function(ldaOut, dtm, n.terms){
# Implements UMass topic coherence based on PMI described by Minmo et al. "Optimizing Semantic Coherence in Topic Models"
# coherence function - taken from SpeedReader package for calculating coherence
# replicated here rather than imported as SpeedyReader is still being developed
# https://www.rdocumentation.org/packages/SpeedReader/versions/0.9.1/topics/topic_coherence
my.topic.coherence <- function(top_words,
document_term_matrix,
vocabulary = NULL,
numeric_top_words = FALSE,
K = length(top_words)){
# make sure the data is the right format
vocabulary <- as.character(vocabulary)
# perform some basic checks and throw errors if we see something weird.
if(is.null(vocabulary) & !numeric_top_words){
stop("You must provide a vocabulary vector!")
}
if(K > length(top_words)){
K <- length(top_words)
warning(paste("You must select a value for K that is less than length(top_words). K has automatically been set to :",K,sep = " "))
}
if(length(vocabulary) != ncol(document_term_matrix)){
stop("The vocaublary vector must have the same number of entries as the number of columns in the document_term_matrix, and the word indicated by entries in the i'th column of document_term_matrix must correspond to the i'th entry in vocabulary.")
}
#if we are only using the K top words then reduce our top words vector
top_words <- top_words[1:K]
# binarize the document term matrix
document_term_matrix <- matrix(as.numeric(document_term_matrix > 0),
nrow = nrow(document_term_matrix),
ncol = ncol(document_term_matrix))
coherence_score <- 0
for(i in 2:length(top_words)){
for(j in 1:(i-1)){
# we can either look up against vocab or just use indexes
if(numeric_top_words){
jindex <- top_words[j]
iindex <- top_words[i]
}else{
jindex <- which(vocabulary == top_words[j])
iindex <- which(vocabulary == top_words[i])
}
document_frequency <- sum(document_term_matrix[,jindex])
j_positive <- which(document_term_matrix[,jindex] > 0)
i_positive <- which(document_term_matrix[,iindex] > 0)
co_document_frequency <- sum(i_positive %in% j_positive)
coherence_score <- coherence_score + log((co_document_frequency + 1)/document_frequency)
}
}
if(is.infinite(coherence_score)){
coherence_score <- NA
warning("The coherence score was not finite. Make sure that all words in your vocabulary appear atleast once.")
}
return(coherence_score)
}
# top terms in each topic
ldaOut.topics = as.data.frame(terms(ldaOut, n.terms))
# get the coherence for each topic
topic.coherence = c()
# make the document term matrix
document_term_matrix = as.matrix(dtm)
for (i in seq(1, ncol(ldaOut.topics))){
# get the terms for just the ith topic
top_words = as.character(ldaOut.topics[, i])
# create a document term matrix with just these terms
document_term_matrix_k = document_term_matrix[,which(colnames(document_term_matrix) %in% top_words)]
# check that the ordering of the colnames matches up with the ordering of the terms
top_words = top_words[match(colnames(document_term_matrix_k), top_words)]
# calculate topic coherence using SpeedReader function
top.coherence.k = my.topic.coherence(top_words, document_term_matrix_k, vocabulary = top_words,
numeric_top_words = FALSE, K = length(top_words))
# save values
topic.coherence = c(topic.coherence, top.coherence.k)
}
return(topic.coherence)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.