# Tests for topic-coherence metrics (text2vec::coherence)
context("coherence")
# generation of test data -------------------------
# NOTE(review): library(text2vec) / data("movie_review") are commented out;
# presumably package and dataset are attached by the testthat harness — confirm.
#library(text2vec)
#data("movie_review")
# use only the first N reviews to keep the fixture fast
N = 500
tokens = word_tokenizer(tolower(movie_review$review[1:N]))
it = itoken(tokens, progressbar = FALSE)
v = create_vocabulary(it)
# drop rare terms and near-stopwords so the tcm stays small and informative
v = prune_vocabulary(v, term_count_min = 5, doc_proportion_max = 0.2)
dtm = create_dtm(it, vocab_vectorizer(v))
# fit an LDA model; its top terms and word distribution are shared by all tests below
n_topics = 100
n_top_terms = 10
lda_model = text2vec::LDA$new(n_topics = n_topics)
fitted = lda_model$fit_transform(dtm)
top_terms = lda_model$get_top_words(n = n_top_terms, topic_number = 1L:n_topics)
topic_word_distribution = lda_model$topic_word_distribution
# -------------------------------------------------
# -------------------------------------------------
test_that("coherence, general functionality", {
  # intrinsic reference tcm: boolean co-occurrence counts over documents
  tcm_ref <- Matrix::crossprod(sign(dtm))
  res <- coherence(x = top_terms, tcm = tcm_ref, n_doc_tcm = nrow(dtm))
  # result is a double matrix with one row per topic and one column per metric
  expect_true(inherits(res, "matrix"))
  expect_equal(typeof(res), "double")
  expect_true(setequal(
    colnames(res),
    c("mean_logratio", "mean_pmi", "mean_npmi", "mean_difference",
      "mean_npmi_cosim", "mean_npmi_cosim2")
  ))
  expect_equal(nrow(res), n_topics)
  # a different smoothing constant must change the scores
  res_smoothed <- coherence(x = top_terms, tcm = tcm_ref,
                            n_doc_tcm = nrow(dtm), smooth = 0.01)
  expect_false(sum(as.vector(res)) == sum(as.vector(res_smoothed)))
  # incomplete input: a tcm missing most top terms should warn and produce NAs
  tcm_bad <- tcm_ref[1:2, 1:2]
  tcm_bad[1, 2] <- 0
  tcm_bad[2, 1] <- 0
  expect_warning({
    res_bad <- coherence(x = top_terms, tcm = tcm_bad, n_doc_tcm = nrow(dtm))
  })
  expect_true(all(is.na(res_bad)))
})
test_that("coherence, vectorized vs. mapply loop calculation of PMI", {
  eps <- 1e-12
  # reference: pairwise PMI via an explicit mapply loop over index pairs
  # (each pair sorted decreasing, i.e. lower-triangle indices of the tcm)
  pmi_loop <- function(tcm, idxs) {
    pairs <- t(combn(idxs, 2, FUN = function(x) sort(x, decreasing = TRUE)))
    mapply(
      function(i, j) log2(tcm[i, j] + eps) - log2(tcm[i, i]) - log2(tcm[j, j]),
      pairs[, 1], pairs[, 2]
    )
  }
  # same quantity computed with matrix operations, as in the package internals
  pmi_vectorized <- function(tcm, idxs) {
    m <- as.matrix(tcm[idxs, idxs])
    m[upper.tri(m)] <- m[upper.tri(m)] + eps
    d <- diag(m)
    m <- m / d
    m <- m %*% diag(1 / d)
    log2(m[upper.tri(m)])
  }
  # 1. simple toy data ---------------------------------
  toy <- matrix(rbind(c(40, 1, 2, 3),
                      c(1, 30, 4, 5),
                      c(2, 4, 20, 6),
                      c(3, 5, 6, 10)), ncol = 4)
  all_idxs <- 1:ncol(toy)
  expect_equal(sort(pmi_loop(toy, all_idxs)),
               sort(pmi_vectorized(toy, all_idxs)))
  # 2. real data and several arbitrary subsets ---------
  tcm_real <- Matrix::crossprod(dtm)
  for (seed in 1:10) {
    set.seed(seed)
    subset_idxs <- sample(1:ncol(tcm_real), 4)
    expect_equal(sort(pmi_loop(tcm_real, subset_idxs)),
                 sort(pmi_vectorized(tcm_real, subset_idxs)))
  }
})
test_that("coherence, results of text2vec vs other packages", {
#textmineR: difference metric via function CalcProbCoherence
#  NOTE(review): the URL below pointed to the stm repo for both entries; the
#  textmineR source presumably lives in the textmineR repo — confirm link.
#stm: adapted UMass via the function semCoh1beta: https://github.com/bstewart/stm/blob/master/R/semanticCoherence.R
# definition/copy of functions from other packages ------------------
# apart from some slight changes of the stm function to allow direct comparison
# marked via "<<<<<<<<<<< NOTE FROM TEXT2VEC"
# below functions are simply a copy from the other packages
# (kept verbatim on purpose so the comparison is against the reference code)
CalcProbCoherence <- function(phi, dtm, M = 5){
  # phi is a numeric matrix or numeric vector?
  if( ! is.numeric(phi) ){
    stop("phi must be a numeric matrix whose rows index topics and columns\n",
         " index terms or phi must be a numeric vector whose entries index terms.")
  }
  # is dtm a matrix we can work with?
  if( ! is.matrix(dtm) &&
      ! inherits(dtm, 'Matrix')){
    stop("dtm must be a matrix. This can be a standard R dense matrix or a\n",
         " matrix of class CsparseMatrix, TsparseMatrix, RsparseMatrix, or dgeMatrix")
  }
  # is M numeric? If it is not an integer, give a warning.
  if( ! is.numeric(M) | M < 1){
    stop("M must be an integer in 1:ncol(phi) or 1:length(phi)")
  }
  if(length(M) != 1){
    warning("M is a vector when scalar is expected. Taking only the first value")
    M <- M[[1 ]]
  }
  if(floor(M) != M){
    warning("M is expected to be an integer. floor(M) is being used.")
    M <- floor(M)
  }
  # dtm has colnames?
  if( is.null(colnames(dtm))){
    stop("dtm must have colnames")
  }
  # Names of phi in colnames(dtm)
  if( ! is.matrix(phi) ){
    if(sum(names(phi)[ 1:M ] %in% colnames(dtm)) != length(1:M)){
      stop("names(phi)[ 1:M ] are not in colnames(dtm)")
    }
  }else if(sum(colnames(phi)[ 1:M ] %in% colnames(dtm)) != length(1:M)){
    stop("colnames(phi)[ 1:M ] are not in colnames(dtm)")
  }
  # Declare a function to get probabilistic coherence on one topic
  pcoh <- function(topic, dtm, M){
    # top M terms of the topic, then binarized doc-term co-occurrence counts
    terms <- names(topic)[order(topic, decreasing = TRUE)][1:M]
    dtm.t <- dtm[, terms]
    dtm.t[dtm.t > 0] <- 1
    count.mat <- Matrix::t(dtm.t) %*% dtm.t
    num.docs <- nrow(dtm)
    p.mat <- count.mat/num.docs
    # result <- sapply(1:(ncol(count.mat) - 1), function(x) {
    #   mean(p.mat[x, (x + 1):ncol(p.mat)]/p.mat[x, x] - Matrix::diag(p.mat)[(x +
    #                                                                           1):ncol(p.mat)], na.rm = TRUE)
    # })
    # mean(result, na.rm = TRUE)
    # P(b|a) - P(b) for every ordered pair (a before b), averaged once overall
    result <- sapply(1:(ncol(count.mat) - 1), function(x) {
      p.mat[x, (x + 1):ncol(p.mat)]/p.mat[x, x] -
        Matrix::diag(p.mat)[(x + 1):ncol(p.mat)]
    })
    mean(unlist(result), na.rm = TRUE)
  }
  # if phi is a single topic vector get that one coherence
  if( ! is.matrix(phi) ){
    return(pcoh(topic = phi, dtm = dtm, M = M))
  }
  # Otherwise, do it for all the topics
  apply(phi, 1, function(x){
    pcoh(topic = x, dtm = dtm, M = M)
  })
}
# copy of stm's semCoh1beta with two adaptations (marked below) so that its
# output is directly comparable to text2vec's "mean_logratio" metric
semCoh1beta_adapted <- function(mat, M, beta) {
  #Get the Top N Words
  top.words <- apply(beta, 1, order, decreasing=TRUE)[1:M,]
  wordlist <- unique(as.vector(top.words))
  mat <- mat[,wordlist]
  mat = sign(mat)
  #do the cross product to get co-occurences
  cross <- tcrossprod(t(mat))
  #create a list object with the renumbered words (so now it corresponds to the rows in the table)
  temp <- match(as.vector(top.words),wordlist)
  labels <- split(temp, rep(1:nrow(beta), each=M))
  #Note this could be done with recursion in an elegant way, but let's just be simpler about it.
  sem <- function(ml,cross) {
    m <- ml[1]; l <- ml[2]
    #The following commented line is the original line from stm <<<<<<<<<<<<<<<<<<<<<<<<< NOTE FROM TEXT2VEC
    #log(.01 + cross[m,l]) - log(cross[l,l] + .01)
    #it was adapted to allow direct comparison with text2vec implementation by
    #(i) removing smoothing from diagonal values
    #(this is not done in text2vec since any(diag(res) == 0) is FALSE) and
    #(ii) using a smaller smoothing value
    log(1e-12 + cross[m,l]) - log(cross[l,l])
  }
  result <- vector(length=nrow(beta))
  for(k in 1:nrow(beta)) {
    # all ordered pairs (m, l) of top words with m > l, as in the UMass paper
    grid <- expand.grid(labels[[k]],labels[[k]])
    colnames(grid) <- c("m", "l") #corresponds to original paper
    grid <- grid[grid$m > grid$l,]
    calc <- apply(grid,1,sem,cross)
    #The following commented line is the original line from stm <<<<<<<<<<<<<<<<<<<<<<<<< NOTE FROM TEXT2VEC
    #result[k] <- sum(calc)
    #it was adapted to allow direct comparison with text2vec implementation by
    #(i) using the mean instead of the sum
    result[k] <- mean(calc, na.rm = TRUE)
  }
  return(result)
}
# intrinsic reference tcm from corpus as used in the other packages
tcm_intrinsic = Matrix::crossprod(sign(dtm))
coherence_text2vec = coherence(x = top_terms ,tcm = tcm_intrinsic, n_doc_tcm = nrow(dtm)
                               #select the corresponding metrics for testing against other packages
                               ,metrics = c("mean_difference", "mean_logratio"))
#calculate coherence scores with other packages
logratio_stm_adapted = semCoh1beta_adapted(mat = dtm, M = n_top_terms, beta = topic_word_distribution)
mean_difference_textmineR = CalcProbCoherence(phi = topic_word_distribution, dtm = as.matrix(dtm), M = n_top_terms)
#compare results
# sort() is used because the per-topic row order may differ between packages
compare = cbind(coherence_text2vec, mean_difference_textmineR, logratio_stm_adapted)
expect_equal(sort(compare[,"mean_logratio"]), sort(compare[,"logratio_stm_adapted"]))
expect_equal(sort(compare[,"mean_difference"]), sort(compare[,"mean_difference_textmineR"]))
})
# (removed trailing rdrr.io page-scrape boilerplate that was not part of the test source)