# Tests for topic-coherence metrics (text2vec::coherence)
context("coherence")
# generation of test data -------------------------
# NOTE(review): library(text2vec) / data("movie_review") are commented out;
# presumably package and dataset are attached by the testthat harness — confirm.
#library(text2vec)
#data("movie_review")
# use only the first N reviews to keep the fixture fast
N = 500
tokens = word_tokenizer(tolower(movie_review$review[1:N]))
it = itoken(tokens, progressbar = FALSE)
v = create_vocabulary(it)
# drop rare terms and near-stopwords so the tcm stays small and informative
v = prune_vocabulary(v, term_count_min = 5, doc_proportion_max = 0.2)
dtm = create_dtm(it, vocab_vectorizer(v))
# fit an LDA model; its top terms and word distribution are shared by all tests below
n_topics = 100
n_top_terms = 10
lda_model = text2vec::LDA$new(n_topics = n_topics)
fitted = lda_model$fit_transform(dtm)
top_terms = lda_model$get_top_words(n = n_top_terms, topic_number = 1L:n_topics)
topic_word_distribution = lda_model$topic_word_distribution
# -------------------------------------------------
# -------------------------------------------------
test_that("coherence, general functionality", {
  # intrinsic reference tcm: boolean co-occurrence counts over documents
  tcm_ref <- Matrix::crossprod(sign(dtm))
  res <- coherence(x = top_terms, tcm = tcm_ref, n_doc_tcm = nrow(dtm))
  # result is a double matrix with one row per topic and one column per metric
  expect_true(inherits(res, "matrix"))
  expect_equal(typeof(res), "double")
  expect_true(setequal(
    colnames(res),
    c("mean_logratio", "mean_pmi", "mean_npmi", "mean_difference",
      "mean_npmi_cosim", "mean_npmi_cosim2")
  ))
  expect_equal(nrow(res), n_topics)
  # a different smoothing constant must change the scores
  res_smoothed <- coherence(x = top_terms, tcm = tcm_ref,
                            n_doc_tcm = nrow(dtm), smooth = 0.01)
  expect_false(sum(as.vector(res)) == sum(as.vector(res_smoothed)))
  # incomplete input: a tcm missing most top terms should warn and produce NAs
  tcm_bad <- tcm_ref[1:2, 1:2]
  tcm_bad[1, 2] <- 0
  tcm_bad[2, 1] <- 0
  expect_warning({
    res_bad <- coherence(x = top_terms, tcm = tcm_bad, n_doc_tcm = nrow(dtm))
  })
  expect_true(all(is.na(res_bad)))
})
test_that("coherence, vectorized vs. mapply loop calculation of PMI", {
  eps <- 1e-12
  # reference: pairwise PMI via an explicit mapply loop over index pairs
  # (each pair sorted decreasing, i.e. lower-triangle indices of the tcm)
  pmi_loop <- function(tcm, idxs) {
    pairs <- t(combn(idxs, 2, FUN = function(x) sort(x, decreasing = TRUE)))
    mapply(
      function(i, j) log2(tcm[i, j] + eps) - log2(tcm[i, i]) - log2(tcm[j, j]),
      pairs[, 1], pairs[, 2]
    )
  }
  # same quantity computed with matrix operations, as in the package internals
  pmi_vectorized <- function(tcm, idxs) {
    m <- as.matrix(tcm[idxs, idxs])
    m[upper.tri(m)] <- m[upper.tri(m)] + eps
    d <- diag(m)
    m <- m / d
    m <- m %*% diag(1 / d)
    log2(m[upper.tri(m)])
  }
  # 1. simple toy data ---------------------------------
  toy <- matrix(rbind(c(40, 1, 2, 3),
                      c(1, 30, 4, 5),
                      c(2, 4, 20, 6),
                      c(3, 5, 6, 10)), ncol = 4)
  all_idxs <- 1:ncol(toy)
  expect_equal(sort(pmi_loop(toy, all_idxs)),
               sort(pmi_vectorized(toy, all_idxs)))
  # 2. real data and several arbitrary subsets ---------
  tcm_real <- Matrix::crossprod(dtm)
  for (seed in 1:10) {
    set.seed(seed)
    subset_idxs <- sample(1:ncol(tcm_real), 4)
    expect_equal(sort(pmi_loop(tcm_real, subset_idxs)),
                 sort(pmi_vectorized(tcm_real, subset_idxs)))
  }
})
test_that("coherence, results of text2vec vs other packages", {
#textmineR: difference metric via function CalcProbCoherence
#  NOTE(review): the URL below pointed to the stm repo for both entries; the
#  textmineR source presumably lives in the textmineR repo — confirm link.
#stm: adapted UMass via the function semCoh1beta: https://github.com/bstewart/stm/blob/master/R/semanticCoherence.R
# definition/copy of functions from other packages ------------------
# apart from some slight changes of the stm function to allow direct comparison
# marked via "<<<<<<<<<<< NOTE FROM TEXT2VEC"
# below functions are simply a copy from the other packages
# (kept verbatim on purpose so the comparison is against the reference code)
CalcProbCoherence <- function(phi, dtm, M = 5){
  # phi is a numeric matrix or numeric vector?
  if( ! is.numeric(phi) ){
    stop("phi must be a numeric matrix whose rows index topics and columns\n",
         " index terms or phi must be a numeric vector whose entries index terms.")
  }
  # is dtm a matrix we can work with?
  if( ! is.matrix(dtm) &&
      ! inherits(dtm, 'Matrix')){
    stop("dtm must be a matrix. This can be a standard R dense matrix or a\n",
         " matrix of class CsparseMatrix, TsparseMatrix, RsparseMatrix, or dgeMatrix")
  }
  # is M numeric? If it is not an integer, give a warning.
  if( ! is.numeric(M) | M < 1){
    stop("M must be an integer in 1:ncol(phi) or 1:length(phi)")
  }
  if(length(M) != 1){
    warning("M is a vector when scalar is expected. Taking only the first value")
    M <- M[[1 ]]
  }
  if(floor(M) != M){
    warning("M is expected to be an integer. floor(M) is being used.")
    M <- floor(M)
  }
  # dtm has colnames?
  if( is.null(colnames(dtm))){
    stop("dtm must have colnames")
  }
  # Names of phi in colnames(dtm)
  if( ! is.matrix(phi) ){
    if(sum(names(phi)[ 1:M ] %in% colnames(dtm)) != length(1:M)){
      stop("names(phi)[ 1:M ] are not in colnames(dtm)")
    }
  }else if(sum(colnames(phi)[ 1:M ] %in% colnames(dtm)) != length(1:M)){
    stop("colnames(phi)[ 1:M ] are not in colnames(dtm)")
  }
  # Declare a function to get probabilistic coherence on one topic
  pcoh <- function(topic, dtm, M){
    # top M terms of the topic, then binarized doc-term co-occurrence counts
    terms <- names(topic)[order(topic, decreasing = TRUE)][1:M]
    dtm.t <- dtm[, terms]
    dtm.t[dtm.t > 0] <- 1
    count.mat <- Matrix::t(dtm.t) %*% dtm.t
    num.docs <- nrow(dtm)
    p.mat <- count.mat/num.docs
    # result <- sapply(1:(ncol(count.mat) - 1), function(x) {
    #   mean(p.mat[x, (x + 1):ncol(p.mat)]/p.mat[x, x] - Matrix::diag(p.mat)[(x +
    #                                                                           1):ncol(p.mat)], na.rm = TRUE)
    # })
    # mean(result, na.rm = TRUE)
    # P(b|a) - P(b) for every ordered pair (a before b), averaged once overall
    result <- sapply(1:(ncol(count.mat) - 1), function(x) {
      p.mat[x, (x + 1):ncol(p.mat)]/p.mat[x, x] -
        Matrix::diag(p.mat)[(x + 1):ncol(p.mat)]
    })
    mean(unlist(result), na.rm = TRUE)
  }
  # if phi is a single topic vector get that one coherence
  if( ! is.matrix(phi) ){
    return(pcoh(topic = phi, dtm = dtm, M = M))
  }
  # Otherwise, do it for all the topics
  apply(phi, 1, function(x){
    pcoh(topic = x, dtm = dtm, M = M)
  })
}
# copy of stm's semCoh1beta with two adaptations (marked below) so that its
# output is directly comparable to text2vec's "mean_logratio" metric
semCoh1beta_adapted <- function(mat, M, beta) {
  #Get the Top N Words
  top.words <- apply(beta, 1, order, decreasing=TRUE)[1:M,]
  wordlist <- unique(as.vector(top.words))
  mat <- mat[,wordlist]
  mat = sign(mat)
  #do the cross product to get co-occurences
  cross <- tcrossprod(t(mat))
  #create a list object with the renumbered words (so now it corresponds to the rows in the table)
  temp <- match(as.vector(top.words),wordlist)
  labels <- split(temp, rep(1:nrow(beta), each=M))
  #Note this could be done with recursion in an elegant way, but let's just be simpler about it.
  sem <- function(ml,cross) {
    m <- ml[1]; l <- ml[2]
    #The following commented line is the original line from stm <<<<<<<<<<<<<<<<<<<<<<<<< NOTE FROM TEXT2VEC
    #log(.01 + cross[m,l]) - log(cross[l,l] + .01)
    #it was adapted to allow direct comparison with text2vec implementation by
    #(i) removing smoothing from diagonal values
    #(this is not done in text2vec since any(diag(res) == 0) is FALSE) and
    #(ii) using a smaller smoothing value
    log(1e-12 + cross[m,l]) - log(cross[l,l])
  }
  result <- vector(length=nrow(beta))
  for(k in 1:nrow(beta)) {
    # all ordered pairs (m, l) of top words with m > l, as in the UMass paper
    grid <- expand.grid(labels[[k]],labels[[k]])
    colnames(grid) <- c("m", "l") #corresponds to original paper
    grid <- grid[grid$m > grid$l,]
    calc <- apply(grid,1,sem,cross)
    #The following commented line is the original line from stm <<<<<<<<<<<<<<<<<<<<<<<<< NOTE FROM TEXT2VEC
    #result[k] <- sum(calc)
    #it was adapted to allow direct comparison with text2vec implementation by
    #(i) using the mean instead of the sum
    result[k] <- mean(calc, na.rm = TRUE)
  }
  return(result)
}
# intrinsic reference tcm from corpus as used in the other packages
tcm_intrinsic = Matrix::crossprod(sign(dtm))
coherence_text2vec = coherence(x = top_terms ,tcm = tcm_intrinsic, n_doc_tcm = nrow(dtm)
                               #select the corresponding metrics for testing against other packages
                               ,metrics = c("mean_difference", "mean_logratio"))
#calculate coherence scores with other packages
logratio_stm_adapted = semCoh1beta_adapted(mat = dtm, M = n_top_terms, beta = topic_word_distribution)
mean_difference_textmineR = CalcProbCoherence(phi = topic_word_distribution, dtm = as.matrix(dtm), M = n_top_terms)
#compare results
# sort() is used because the per-topic row order may differ between packages
compare = cbind(coherence_text2vec, mean_difference_textmineR, logratio_stm_adapted)
expect_equal(sort(compare[,"mean_logratio"]), sort(compare[,"logratio_stm_adapted"]))
expect_equal(sort(compare[,"mean_difference"]), sort(compare[,"mean_difference_textmineR"]))
})
# (removed trailing rdrr.io page-scrape boilerplate that was not part of the test source)