############# Similarity between Topic Matrices #################
#' Calculate Maximimum Likehood Topic
#'
#' Accepts two topic term matrices generated by different LDA runs.
#' Topics may have different terms in their respective vocabulary.
#' Non-existent terms will be added with probability 0 to each other.
#' See \code{\link{CalculateTopicFlow}} for an example of it's usage.
#'
#' @param ttm1 A topic term matrix
#' @param ttm2 A topic term matrix (e.g. of the following month)
#'
#' @return Returns a cartesian product data.table of the ttm pair containing the topics with maximum likehood.
CalculateHighestTopicCosineSimilarity <- function(ttm1,ttm2){
ttm1.vocab <- colnames(ttm1)
ttm2.vocab <- colnames(ttm2)
## Which terms of the first vector are NOT in the second vector? Returns the indices.
ttm1.vocab.missing.index <- which(!ttm2.vocab %in% ttm1.vocab)
ttm1.vocab <- c(ttm1.vocab,ttm2.vocab[ttm1.vocab.missing.index])
# Next we need to add column vectors of probability 0 for every new term on ttm1. We start by creating the matrix.
ttm1.missing <- matrix(data=0,nrow=nrow(ttm1),ncol=length(ttm1.vocab.missing.index))
colnames(ttm1.missing) <- ttm2.vocab[ttm1.vocab.missing.index]
#Then we add the columns to the original matrix.
ttm1 <- cbind(ttm1,ttm1.missing) #Notice columns are appended in the end. This results in a unmatching order of columns to ttm2.
## Same idea for ttm2
ttm2.vocab <- colnames(ttm2)
ttm2.vocab.missing.index <- which(!ttm1.vocab %in% ttm2.vocab)
ttm2.vocab <- c(ttm2.vocab,ttm1.vocab[ttm2.vocab.missing.index])
ttm2.missing <- matrix(data=0,nrow=nrow(ttm2),ncol=length(ttm2.vocab.missing.index))
colnames(ttm2.missing) <- ttm1.vocab[ttm2.vocab.missing.index]
ttm2 <- cbind(ttm2,ttm2.missing)
## ttm1 columns and ttm2 columns now needs to be ordered so that the vectors for Dkl are consistent
# Since the vocabularies are now the same, ordering both should result in the same order of words
ttm1 <- ttm1[,order(ttm1.vocab,decreasing=TRUE)]
ttm2 <- ttm2[,order(ttm2.vocab,decreasing=TRUE)]
# Next, we must calculate the Cartesian Product of the rows of ttm1 versus the rows of ttm2, i.e.
# We compare the similarity of topics of one month versus topics of the other month (but not of the same month, LDAVis package already does that)
cartesian.ttm1.ttm2.index <- expand.grid(ttm1=1:nrow(ttm1),ttm2=1:nrow(ttm2))
# The number of similarity scores which will be obtained is equal to the number of possible combinations of topic 1 against topic 2.
similarity <- array(data=NA,dim=nrow(cartesian.ttm1.ttm2.index))
for (i in 1:nrow(cartesian.ttm1.ttm2.index)){
topic1.index <- cartesian.ttm1.ttm2.index$ttm1[i]
topic2.index <- cartesian.ttm1.ttm2.index$ttm2[i]
topic1 <- ttm1[topic1.index,]
topic2 <- ttm2[topic2.index,]
similarity[i] <- cosine(topic1,topic2)
}
topic.similarity <- as.data.table(cbind(cartesian.ttm1.ttm2.index,similarity))
#Necessary aux function to add ttm2 column for the group by
aux.highest.similarity <- function(ttm2,similarity){
index <- which.max(similarity)
return(list(ttm2=ttm2[index],similarity=similarity[index]))
}
highest.topic.similarity <- topic.similarity[,aux.highest.similarity(ttm2,similarity),by=c("ttm1")]
return(highest.topic.similarity)
}
#' Calculate Topic Flow
#'
#' Creates a topic flow table connecting various monthly topics.
#'
#' @param models A list of models. See \code{\link{rawToLDA}}.
#'
#' @return A topic flow data table.
CalculateTopicFlow <- function(models){
similarity.list <- list()
for (i in 2:length(models)){
#print(i)
ttm1 <- posterior(models[[i-1]][["LDA"]])$terms
ttm2 <- posterior(models[[i]][["LDA"]])$terms
similarity.index <- paste(names(models)[i-1],names(models)[i],sep="_")
similarity.list[[similarity.index]] <- CalculateHighestTopicCosineSimilarity(ttm1,ttm2)
colnames(similarity.list[[similarity.index]]) <- c(names(models)[i-1],names(models)[i],paste0(similarity.index,"_similarity"))
}
similarity.df <- similarity.list[[1]]
for (i in 2:length(similarity.list)){
#print(i)
# Overlapping column is always second to last on left table
topic.overlap.x <- colnames(similarity.df)[[ncol(similarity.df)-1]]
# And first column on right table
topic.overlap.y <- colnames(similarity.list[[i]])[[1]]
similarity.df <- merge(similarity.df,similarity.list[[i]],by.x=topic.overlap.x,by.y=topic.overlap.y,all.x=TRUE,all.y=TRUE)
}
#move Dec column to the left to stay consistent
dec.index <- ncol(similarity.df)-1
immediately.before.dec.index <- dec.index-1
immediately.after.dec.index <- dec.index+1
similarity.df <- similarity.df[,c(dec.index,1:immediately.before.dec.index,immediately.after.dec.index:ncol(similarity.df)),with=FALSE]
return(similarity.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.