context("model alignment")
library(dplyr)
options(java.parameters="-Xmx2g",
dfrtopics.mallet_logging="none",
dplyr.show_progress=FALSE,
dfrtopics.verbose=F)
# run a scrap model: we're just doing manipulation here, not worrying
# about quality
data_dir <- file.path(path.package("dfrtopics"),
"test-data", "pmla-modphil1905-1915")
fs <- list.files(file.path(data_dir, "wordcounts"), full.names=T)[1:60]
stoplist_file <- file.path(path.package("dfrtopics"), "stoplist",
"stoplist.txt")
insts <- read_wordcounts(fs) %>%
wordcounts_remove_rare(200) %>%
wordcounts_texts() %>%
make_instances(stoplist_file)
V <- length(instances_vocabulary(insts))
meta <- read_dfr_metadata(file.path(data_dir, "citations.tsv"))
M <- 4
K <- 8
s <- sample(10000, 1)
set.seed(s)
ms <- replicate(M, train_model(
insts,
n_topics=K,
n_iters=200,
threads=1,
alpha_sum=5,
beta=0.01,
n_hyper_iters=20,
n_burn_in=20,
n_max_iters=10,
metadata=meta,
seed=sample(10000, 1)
),
simplify=F
)
# use all words
dst <- model_distances(ms, V)
test_that("naive_cluster does the right thing with trivial data", {
expect_equal(
dfrtopics:::naive_cluster(c(1, 100, 100, 1), c(2, 2), 1)$clusters,
list(0:1, 0:1)
)
# setting a big threshold should make no difference
expect_equal(
dfrtopics:::naive_cluster(c(1, 100, 100, 1), c(2, 2), 1000)$clusters,
list(0:1, 0:1)
)
})
test_that("model_distances returns something of the right form", {
expect_is(dst, "model_distances")
expect_equal(length(dst$d), M - 1)
expect_equal(sapply(dst$d, length), seq(M - 1, 1))
expect_equal(lapply(do.call(c, dst$d), dim),
rep(list(c(K, K)), M * (M - 1) / 2))
# calculate 1.1:M.K
jsd <- JS_divergence(
tw_smooth_normalize(ms[[1]])(topic_words(ms[[1]]))[1, ],
tw_smooth_normalize(ms[[M]])(topic_words(ms[[M]]))[K, ]
)
# which should be retrieved as follows
expect_equal(dst$d[[1]][[M - 1]][1, K], jsd)
# or as follows
expect_equal(dst[1, M, 1, K], jsd)
})
test_that("unnesting distances is correct", {
dummy <- list(d=list(
list(matrix(1:4, nrow=2, byrow=T), matrix(5:8, nrow=2, byrow=T)),
list(matrix(9:12, nrow=2, byrow=T))
))
expect_equal(dfrtopics:::unnest_model_distances(dummy), 1:12)
})
test_that("we can take distances between non-overlapping models", {
m_voc <- list.files(file.path(data_dir, "wordcounts"),
full.names=T)[61:120] %>%
read_wordcounts() %>%
wordcounts_remove_rare(200) %>%
wordcounts_texts() %>%
make_instances(stoplist_file) %>%
train_model(n_topics=K, n_iters=200,
threads=1,
alpha_sum=5,
beta=0.01,
n_hyper_iters=20,
n_burn_in=20,
n_max_iters=10,
metadata=meta
)
dd <- model_distances(list(ms[[1]], ms[[2]], m_voc), 20)
expect_true(all(is.finite(
dfrtopics:::unnest_model_distances(dd)
)))
})
test_that("clustering two models with high threshold fully aligns them", {
dd <- model_distances(ms[1:2], 20)
cl <- align_topics(dd, 10000000)
expect_equal(sort(cl$clusters[[1]]), 1:K)
expect_equal(sort(cl$clusters[[2]]), 1:K)
})
test_that("in clustering two models, the closest pair is indeed grouped", {
dd <- model_distances(ms[1:2], 20)
cl <- align_topics(dd, 10000000)
closest <- which(dd$d[[1]][[1]] == min(dd$d[[1]][[1]]), arr.ind=TRUE)
expect_equal(cl$clusters[[1]][closest[1]], cl$clusters[[2]][closest[2]])
})
test_that("clustering meets up-to-one constraint", {
cl <- matrix(unlist(align_topics(dst)$clusters), byrow=T, nrow=M)
expect_equal(dim(cl), c(M, K))
expect_true(all(1 <= cl & cl <= M * K))
ck <- gather_matrix(cl, col_names=c("model", "topic", "cluster")) %>%
group_by(cluster) %>%
summarize(no_dupes=anyDuplicated(model) == 0)
expect_true(all(ck$no_dupes))
})
test_that("clustering with default infinite threshold leaves no isolates", {
cl <- align_topics(dst)$clusters
# it can leave some non-full clusters, because of the greedy algorithm
# and the mapping constraint
expect_true(!(1 %in% tabulate(unlist(cl))))
})
test_that("clustering with low threshold leaves some isolates", {
cldst <- unlist(align_topics(dst)$distances)
thresh <- quantile(cldst[cldst > 0], 0.25)
cl <- align_topics(dst, thresh)
expect_true(length(unique(unlist(cl$clusters))) > K)
expect_true(all(unlist(cl$distances) <= thresh))
})
test_that("alignment_frame gives an expected result", {
cl <- align_topics(dst)
frm <- alignment_frame(cl)
expect_equal(colnames(frm),
c("cluster", "model", "topic", "label", "d"))
expect_equal(nrow(frm), K * M)
})
test_that("an obvious clustering is found", {
faked <- structure(
list(d=list( # triangle inequality, shmiangle inequality
list(
matrix(c(100, 1, # remember these are transposed
1, 100), nrow=2),
matrix(c(1, 100,
1, 100), nrow=2)),
list(
matrix(c(100, 1,
1, 100), nrow=2)))),
class="model_distances")
cl <- align_topics(faked)
expect_equal(cl$clusters, list(1:2, 2:1, 1:2))
})
test_that("a trivial clustering is found", {
dtriv <- model_distances(ms[c(1, 1, 1)], V)
cl <- align_topics(dtriv)$clusters
expect_equal(cl, rep(cl[1], 3))
})
test_that("cluster widths are right", {
cl <- align_topics(dst)
wd1 <- widths(cl)
clmat <- matrix(unlist(cl$clusters), byrow=T, nrow=M)
# find widths by (even more) brute force
n_clust <- max(unlist(cl$clusters))
wd2 <- sapply(1:n_clust, function (clst) {
members <- which(clmat == clst, arr.ind=T)
if (nrow(members) <= 1) {
NA
} else {
max(apply(combn(nrow(members), 2), 2, function (p)
dst[members[p[1], 1], members[p[2], 1],
members[p[1], 2], members[p[2], 2]
]
))
}
})
expect_equal(wd1, wd2)
})
test_that("we can also cluster models with varying K", {
ks <- 7:10
msk <- lapply(ks, function (K) train_model(
insts,
n_topics=K,
n_iters=200,
threads=1,
alpha_sum=5,
beta=0.01,
n_hyper_iters=20,
n_burn_in=20,
n_max_iters=10,
metadata=meta
))
dstk <- model_distances(msk, V)
expect_equal(dim(dstk$d[[1]][[length(ks) - 1]]), ks[c(1, length(ks))])
cl <- align_topics(dstk)
expect_equal(sapply(cl$clusters, length), ks)
})
test_that("different topic-word matrix types is okay", {
msmat <- ms
# The gremlin comes and converts one Matrix into a matrix
gremlin <- sample(M, 1)
msmat[[gremlin]]$topic_words <- as.matrix(topic_words(ms[[gremlin]]))
expect_true(!is(topic_words(msmat[[gremlin]]), "Matrix"))
expect_equal(model_distances(msmat, V)$d, dst$d)
})
test_that("naive_cluster is equivalent to naivest_cluster", {
cl <- align_topics(dst)
ncl <- dfrtopics:::naivest_cluster(dst)
expect_equal(cl$clusters, ncl)
})
message("random seed started at: ", s)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.