library("quanteda")
mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>%
tokens() %>%
dfm()
mt <- dfm_trim(mt, min_termfreq = 10)
test_that("dist object has all the attributes", {
d1 <- textstat_dist(mt) %>% as.matrix %>% as.dist()
expect_equal(class(attr(d1, "Labels")), "character")
expect_equal(class(attr(d1, "Size")), "integer")
expect_equal(class(attr(d1, "call")), "call")
expect_equal(class(attr(d1, "Diag")), "logical")
expect_equal(class(attr(d1, "Upper")), "logical")
})
test_that("y errors if not a dfm", {
expect_error(
textstat_dist(mt, y = c("mr", "president"), margin = "features"),
"y must be a dfm matching x in the margin specified"
)
})
test_that("selection takes integer or logical vector", {
suppressWarnings({
expect_equivalent(textstat_dist(mt, selection = c(2, 5), margin = "features"),
textstat_dist(mt, selection = c("mr", "president"), margin = "features"))
l3 <- featnames(mt) %in% c("mr", "president")
expect_equivalent(textstat_dist(mt, selection = l3, margin = "features"),
textstat_dist(mt, selection = c("mr", "president"), margin = "features"))
})
expect_error(textstat_dist(mt, y = "xxxx", margin = "features"))
expect_error(textstat_dist(mt, y = 1000, margin = "features"))
expect_equivalent(textstat_dist(mt, y = mt[c(2, 4), ], margin = "documents"),
textstat_dist(mt, y = mt[c("1985-Reagan", "1993-Clinton"), ], margin = "documents"))
l4 <- docnames(mt) %in% c("1985-Reagan", "1993-Clinton")
expect_equivalent(textstat_dist(mt, y = mt[l4, ], margin = "documents"),
textstat_dist(mt, y = mt[c("1985-Reagan", "1993-Clinton"), ], margin = "documents"))
expect_error(textstat_dist(mt, y = "nothing", margin = "documents"))
expect_error(textstat_dist(mt, y = 100, margin = "documents"))
})
# test_that("textstat_dist() returns NA for empty dfm", {
# mt <- dfm_trim(data_dfm_lbgexample, 1000)
# stats::dist is wrong
# expect_equivalent(
# textstat_dist(mt, method = "euclidean") %>% as.matrix() %>% as.dist(),
# stats::dist(as.matrix(mt), method = "euclidean")
# )
# stats::dist is wrong
# expect_equivalent(
# textstat_dist(mt, method = "manhattan") %>% as.matrix() %>% as.dist(),
# stats::dist(as.matrix(mt), method = "manhattan")
# )
# stats::dist is wrong
# expect_equivalent(
# textstat_dist(mt, method = "maximum") %>% as.matrix() %>% as.dist(),
# stats::dist(as.matrix(mt), method = "maximum")
# )
# stats::dist is wrong
# expect_equivalent(
# textstat_dist(mt, method = "canberra") %>% as.matrix() %>% as.dist(),
# stats::dist(as.matrix(mt), method = "canberra")
# )
# stats::dist is wrong
# expect_equivalent(
# textstat_dist(mt, method = "minkowski") %>% as.matrix() %>% as.dist(),
# stats::dist(as.matrix(mt), method = "minkowski", p = 2)
# )
# })
test_that("textstat_dist() returns NA for zero-variance documents", {
mt <- data_dfm_lbgexample[1:5, 1:20]
mt[1:2, ] <- 0
mt[3:4, ] <- 1
mt <- as.dfm(mt)
expect_equivalent(
textstat_dist(mt, method = "euclidean") %>% as.matrix() %>% as.dist(),
stats::dist(as.matrix(mt), method = "euclidean")
)
expect_equivalent(
textstat_dist(mt, method = "manhattan") %>% as.matrix() %>% as.dist(),
stats::dist(as.matrix(mt), method = "manhattan")
)
expect_equivalent(
textstat_dist(mt, method = "maximum") %>% as.matrix() %>% as.dist(),
stats::dist(as.matrix(mt), method = "maximum")
)
# stats::dist is wrong
# expect_equivalent(
# textstat_dist(mt, method = "canberra") %>% as.matrix() %>% as.dist(),
# stats::dist(as.matrix(mt), method = "canberra")
# )
expect_equivalent(
textstat_dist(mt, method = "minkowski") %>% as.matrix() %>% as.dist(),
stats::dist(as.matrix(mt), method = "minkowski", p = 2)
)
})
test_that("selection is always on columns (#1549)", {
mt <- dfm(tokens(corpus_subset(data_corpus_inaugural, Year > 1980)))
suppressWarnings({
expect_equal(
colnames(textstat_dist(mt, margin = "documents",
selection = c("1985-Reagan", "1989-Bush")) %>% as.matrix()),
c("1985-Reagan", "1989-Bush")
)
expect_equal(
colnames(textstat_dist(mt, margin = "documents", selection = c(2, 3)) %>% as.matrix()),
c("1985-Reagan", "1989-Bush")
)
expect_equal(
colnames(textstat_dist(mt, margin = "features", selection = c("justice", "and")) %>% as.matrix()),
c("justice", "and")
)
expect_equal(
colnames(textstat_dist(mt, margin = "features", selection = c(4, 6)) %>% as.matrix()),
c("mr", "chief")
)
})
})
test_that("all distances are bounded at 0", {
methods <- c("euclidean", "manhattan", "maximum", "canberra", "minkowski")
for (m in methods) {
expect_gte(min(textstat_dist(mt, method = m, margin = "features")), 0)
}
})
test_that("textstat_dist coercion methods work with options", {
mt2 <- mt[6:10, ]
# upper = TRUE, diag = TRUE
tstat <- textstat_dist(mt2, margin = "documents")
# expect_equal(nrow(tstat), nrow(mt2)^2)
mat <- as.matrix(tstat)
expect_equal(dim(mat), c(ndoc(mt2), ndoc(mt2)))
# in matrix, diagonal is 0
iden <- rep(0, ndoc(mt2)); names(iden) <- docnames(mt2)
expect_equal(diag(mat), iden)
# upper = TRUE, diag = FALSE
tstat <- textstat_dist(mt2, margin = "documents")
# expect_equal(nrow(tstat), nrow(mt2)^2 - ndoc(mt2))
mat <- as.matrix(tstat)
expect_equal(dim(mat), c(ndoc(mt2), ndoc(mt2)))
iden <- rep(0, ndoc(mt2))
names(iden) <- docnames(mt2)
expect_equal(diag(mat), iden)
# upper = FALSE, diag = TRUE
tstat <- textstat_dist(mt2, margin = "documents")
# expect_equal(nrow(tstat), (nrow(mt2)^2 - ndoc(mt2)) / 2 + ndoc(mt2))
mat <- as.matrix(tstat)
# expect_true(all(is.na(mat[upper.tri(mat)])))
# in matrix, diagonal is 0
iden <- rep(0, ndoc(mt2))
names(iden) <- docnames(mt2)
expect_equal(diag(as.matrix(tstat)), iden)
# upper = FALSE, diag = FALSE
tstat <- textstat_dist(mt2, margin = "documents")
# expect_equal(nrow(tstat), (nrow(mt2)^2 - ndoc(mt2)) / 2)
mat <- as.matrix(tstat)
loweranddiag <- upper.tri(mat)
diag(loweranddiag) <- TRUE
# expect_true(all(is.na(mat[upper.tri(mat)])))
# in matrix, diagonal is 0
iden <- rep(0, ndoc(mt2)); names(iden) <- docnames(mt2)
expect_equal(diag(mat), iden)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.