R/topic_flow.R

Defines functions CalculateHighestTopicCosineSimilarity CalculateTopicFlow

Documented in CalculateHighestTopicCosineSimilarity CalculateTopicFlow

############# Similarity between Topic Matrices #################

#' Calculate Maximimum Likehood Topic
#'
#' Accepts two topic term matrices generated by different LDA runs.
#' Topics may have different terms in their respective vocabulary.
#' Non-existent terms will be added with probability 0 to each other.
#' See \code{\link{CalculateTopicFlow}} for an example of it's usage.
#'
#' @param ttm1 A topic term matrix
#' @param ttm2 A topic term matrix (e.g. of the following month)
#'
#' @return Returns a cartesian product data.table of the ttm pair containing the topics with maximum likehood.
CalculateHighestTopicCosineSimilarity <- function(ttm1,ttm2){
  ttm1.vocab <- colnames(ttm1)
  ttm2.vocab <- colnames(ttm2)
  ## Which terms of the first vector are NOT in the second vector? Returns the indices.
  ttm1.vocab.missing.index <- which(!ttm2.vocab %in% ttm1.vocab)
  ttm1.vocab <- c(ttm1.vocab,ttm2.vocab[ttm1.vocab.missing.index])
  # Next we need to add column vectors of probability 0 for every new term on ttm1. We start by creating the matrix.
  ttm1.missing <- matrix(data=0,nrow=nrow(ttm1),ncol=length(ttm1.vocab.missing.index))
  colnames(ttm1.missing) <- ttm2.vocab[ttm1.vocab.missing.index]
  #Then we add the columns to the original matrix.
  ttm1 <- cbind(ttm1,ttm1.missing) #Notice columns are appended in the end. This results in a unmatching order of columns to ttm2.

  ## Same idea for ttm2
  ttm2.vocab <- colnames(ttm2)

  ttm2.vocab.missing.index <- which(!ttm1.vocab %in% ttm2.vocab)
  ttm2.vocab <- c(ttm2.vocab,ttm1.vocab[ttm2.vocab.missing.index])

  ttm2.missing <- matrix(data=0,nrow=nrow(ttm2),ncol=length(ttm2.vocab.missing.index))
  colnames(ttm2.missing) <- ttm1.vocab[ttm2.vocab.missing.index]
  ttm2 <- cbind(ttm2,ttm2.missing)

  ## ttm1 columns and ttm2 columns now needs to be ordered so that the vectors for Dkl are consistent
  # Since the vocabularies are now the same, ordering both should result in the same order of words
  ttm1 <- ttm1[,order(ttm1.vocab,decreasing=TRUE)]
  ttm2 <- ttm2[,order(ttm2.vocab,decreasing=TRUE)]

  # Next, we must calculate the Cartesian Product of the rows of ttm1 versus the rows of ttm2, i.e.
  # We compare the similarity of topics of one month versus topics of the other month (but not of the same month, LDAVis package already does that)
  cartesian.ttm1.ttm2.index <- expand.grid(ttm1=1:nrow(ttm1),ttm2=1:nrow(ttm2))

  # The number of similarity scores which will be obtained is equal to the number of possible combinations of topic 1 against topic 2.
  similarity <- array(data=NA,dim=nrow(cartesian.ttm1.ttm2.index))

  for (i in 1:nrow(cartesian.ttm1.ttm2.index)){
    topic1.index <- cartesian.ttm1.ttm2.index$ttm1[i]
    topic2.index <- cartesian.ttm1.ttm2.index$ttm2[i]
    topic1 <- ttm1[topic1.index,]
    topic2 <- ttm2[topic2.index,]
    similarity[i] <- cosine(topic1,topic2)
  }
  topic.similarity <- as.data.table(cbind(cartesian.ttm1.ttm2.index,similarity))

  #Necessary aux function to add ttm2 column for the group by
  aux.highest.similarity <- function(ttm2,similarity){
    index <- which.max(similarity)
    return(list(ttm2=ttm2[index],similarity=similarity[index]))
  }
  highest.topic.similarity <- topic.similarity[,aux.highest.similarity(ttm2,similarity),by=c("ttm1")]

  return(highest.topic.similarity)
}

#' Calculate Topic Flow
#'
#' Creates a topic flow table connecting various monthly topics.
#'
#' @param models A list of models. See \code{\link{rawToLDA}}.
#'
#' @return A topic flow data table.
CalculateTopicFlow <- function(models){
  similarity.list <- list()
  for (i in 2:length(models)){
    #print(i)
    ttm1 <- posterior(models[[i-1]][["LDA"]])$terms
    ttm2 <- posterior(models[[i]][["LDA"]])$terms
    similarity.index <- paste(names(models)[i-1],names(models)[i],sep="_")
    similarity.list[[similarity.index]] <- CalculateHighestTopicCosineSimilarity(ttm1,ttm2)
    colnames(similarity.list[[similarity.index]]) <- c(names(models)[i-1],names(models)[i],paste0(similarity.index,"_similarity"))

  }

  similarity.df <- similarity.list[[1]]
  for (i in 2:length(similarity.list)){
    #print(i)

    # Overlapping column is always second to last on left table
    topic.overlap.x <- colnames(similarity.df)[[ncol(similarity.df)-1]]
    # And first column on right table
    topic.overlap.y <- colnames(similarity.list[[i]])[[1]]
    similarity.df <- merge(similarity.df,similarity.list[[i]],by.x=topic.overlap.x,by.y=topic.overlap.y,all.x=TRUE,all.y=TRUE)
  }

  #move Dec column to the left to stay consistent
  dec.index <- ncol(similarity.df)-1
  immediately.before.dec.index <- dec.index-1
  immediately.after.dec.index <- dec.index+1
  similarity.df <- similarity.df[,c(dec.index,1:immediately.before.dec.index,immediately.after.dec.index:ncol(similarity.df)),with=FALSE]
  return(similarity.df)
}
sailuh/topicflowr documentation built on May 27, 2019, 8:46 a.m.