library("quanteda")
mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>%
tokens() %>%
dfm()
mt <- dfm_trim(mt, min_termfreq = 10)
test_that("y errors if not a dfm", {
expect_error(
textstat_simil(mt, y = c("mr", "president"), margin = "features"),
"y must be a dfm matching x in the margin specified"
)
})
test_that("selection takes integer or logical vector", {
expect_equivalent(textstat_simil(mt, y = mt[, c(2, 5)], margin = "features"),
textstat_simil(mt, y = mt[, c("mr", "president")], margin = "features"))
suppressWarnings(expect_equivalent(textstat_simil(mt, y = mt[, c(2, 5)], margin = "features"),
textstat_simil(mt, y = mt[, c("mr", "president")], margin = "features")))
l1 <- featnames(mt) %in% c("mr", "president")
expect_equivalent(textstat_simil(mt, y = mt[, l1], margin = "features"),
textstat_simil(mt, y = mt[, c("mr", "president")], margin = "features"))
expect_error(textstat_simil(mt, "xxxx", margin = "features"))
expect_error(textstat_simil(mt, 1000, margin = "features"))
expect_equivalent(textstat_simil(mt, y = mt[c(2, 4), ], margin = "documents"),
textstat_simil(mt, y = mt[c("1985-Reagan", "1993-Clinton"), ], margin = "documents"))
l2 <- docnames(mt) %in% c("1985-Reagan", "1993-Clinton")
expect_equivalent(textstat_simil(mt, y = mt[l2, ], margin = "documents"),
textstat_simil(mt, y = mt[c("1985-Reagan", "1993-Clinton"), ], margin = "documents"))
expect_error(textstat_simil(mt, y = "nothing", margin = "documents"))
expect_error(textstat_simil(mt, y = 100, margin = "documents"))
})
test_that("textstat_simil() returns NA for empty dfm", {
skip_if_not_installed("proxy")
mt <- dfm_trim(data_dfm_lbgexample, 1000)
# # cor is wrong
# expect_equivalent(
# unclass(as.matrix(textstat_simil(mt, method = "correlation"))),
# unclass(cor(t(as.matrix(mt)), method = "pearson"))
# )
expect_equivalent(
unclass(as.dist(textstat_simil(mt, method = "correlation"))),
unclass(proxy::simil(as.matrix(mt), method = "correlation"))
)
expect_equivalent(
unclass(as.dist(textstat_simil(mt, method = "cosine"))),
unclass(proxy::simil(as.matrix(mt), method = "cosine"))
)
# proxy::simil is wrong
# expect_equivalent(
# unclass(as.dist(textstat_simil(mt, method = "jaccard"))),
# unclass(as.dist(proxy::simil(as.matrix(mt), method = "jaccard")))
# )
# proxy::simil is wrong
# expect_equivalent(
# unclass(as.dist(textstat_simil(mt, method = "ejaccard"))),
# unclass(as.dist(proxy::simil(as.matrix(mt), method = "ejaccard")))
# )
# proxy::simil is wrong
# expect_equivalent(
# unclass(as.dist(textstat_simil(mt, method = "dice"))),
# unclass(as.dist(proxy::simil(as.matrix(mt), method = "dice")))
# )
# expect_equivalent(
# unclass(as.dist(textstat_simil(mt, method = "edice"))),
# unclass(as.dist(proxy::simil(as.matrix(mt), method = "edice")))
# )
expect_equivalent(
unclass(as.dist(textstat_simil(mt, method = "hamman"))),
unclass(proxy::dist(as.matrix(mt), method = "hamman"))
)
expect_equivalent(
unclass(as.dist(textstat_simil(mt, method = "simple matching"))),
unclass(as.dist(proxy::simil(as.matrix(mt), method = "simple matching")))
)
})
test_that("textstat_simil() returns NA for zero-variance documents", {
mt <- data_dfm_lbgexample[1:5, 1:20]
mt[1:2, ] <- 0
mt[3:4, ] <- 1
mt <- as.dfm(mt)
mt_na_all <- matrix(NA, nrow = 5, ncol = 5,
dimnames = list(paste0("R", 1:5), paste0("R", 1:5)))
mt_na_some <- mt_na_all
mt_na_some[3:4, 3:4] <- 1
expect_equivalent(
as.matrix(textstat_simil(mt, method = "correlation")),
mt_na_all
)
expect_equal(
as.matrix(textstat_simil(mt, method = "cosine")),
mt_na_some
)
# proxy::simil is wrong
# expect_equivalent(
# as.matrix(textstat_simil(mt, method = "jaccard")),
# mt_na_some
# )
# proxy::simil is wrong
# expect_equivalent(
# as.matrix(textstat_simil(mt, method = "ejaccard")),
# mt_na_some
# )
# proxy::simil is wrong
# expect_equivalent(
# as.matrix(textstat_simil(mt, method = "dice")),
# mt_na_some
# )
# proxy::simil is wrong
# expect_equal(
# as.matrix(textstat_simil(mt, method = "edice")),
# mt_na_some
# )
# proxyC::simil is wrong (#44)
# expect_equal(
# as.matrix(textstat_simil(mt, method = "hamman")),
# mt_na_some
# )
# proxy::simil is wrong
# expect_equal(
# as.matrix(textstat_simil(mt, method = "simple matching")),
# mt_na_some
# )
})
test_that("selection is always on columns (#1549)", {
mt <- dfm(tokens(
corpus_subset(data_corpus_inaugural, Year > 1980)
))
suppressWarnings(expect_equal(
textstat_simil(mt, margin = "documents", selection = c("1985-Reagan", "1989-Bush")) %>%
as.matrix() %>%
colnames(),
c("1985-Reagan", "1989-Bush")
))
suppressWarnings(expect_equal(
textstat_simil(mt, margin = "documents", selection = c(2, 3)) %>%
as.matrix() %>%
colnames(),
c("1985-Reagan", "1989-Bush")
))
suppressWarnings(expect_equal(
textstat_simil(mt, margin = "features", selection = c("justice", "and")) %>%
as.matrix() %>%
colnames(),
c("justice", "and")
))
suppressWarnings(expect_equal(
textstat_simil(mt, margin = "features", selection = c(4, 6)) %>%
as.matrix() %>%
colnames(),
c("mr", "chief")
))
})
test_that("all similarities are between 0 and 1", {
methods <- c("correlation", "cosine", "jaccard", "ejaccard",
"dice", "edice", "hamann", "simple matching")
for (m in methods) {
minmax <- range(textstat_simil(mt, method = m, margin = "documents"))
tol <- .000001
expect_gte(minmax[1], 0)
expect_lte(minmax[2], 1.0 + tol)
}
})
test_that("textstat_simil is stable across repetitions", {
res <- textstat_simil(mt, y = mt[c(2, 4), ],
margin = "documents")
set.seed(10)
resv <- list()
for (i in 1:100) {
resv[[i]] <- as.matrix(textstat_simil(mt, y = mt[2, ],
margin = "documents"))
}
rescols <- do.call(cbind, resv)
expect_true(all(apply(rescols, 1, sd) == 0))
})
test_that("textstat_simil coercion methods work with options", {
mt2 <- mt[6:10, ]
# upper = TRUE, diag = TRUE
tstat <- textstat_simil(mt2, margin = "documents")
expect_equal(
nrow(as.data.frame(tstat, diag = TRUE, upper = TRUE)),
nrow(mt2) ^ 2
)
mat <- as.matrix(tstat)
expect_equal(dim(mat), c(ndoc(mt2), ndoc(mt2)))
# in matrix, diagonal is 1.0
iden <- rep(1, ndoc(mt2)); names(iden) <- docnames(mt2)
expect_equal(diag(mat), iden)
lis <- as.list(tstat, sort = TRUE, diag = TRUE)
lislen <- rep(ndoc(mt2), 5); names(lislen) <- docnames(mt2)
expect_equivalent(lengths(lis), rep(ndoc(mt2), ndoc(mt2)))
# in list, sorted first item is comparison to itself
expect_identical(names(lis), names(sapply(lis, "[[", 1)))
expect_equal(iden, sapply(lis, "[[", 1))
# upper = TRUE, diag = FALSE
tstat <- textstat_simil(mt2, margin = "documents")
expect_equal(
nrow(as.data.frame(tstat, upper = TRUE, diag = FALSE)),
nrow(mt2) ^ 2 - ndoc(mt2)
)
mat <- as.matrix(tstat)
expect_equal(dim(mat), c(ndoc(mt2), ndoc(mt2)))
# # in matrix, diagonal is NA
# iden <- rep(as.numeric(NA), ndoc(mt2)); names(iden) <- docnames(mt2)
# expect_equal(diag(as.matrix(tstat)), iden)
# in matrix, diagonal is 1.0
iden <- rep(1, ndoc(mt2))
names(iden) <- docnames(mt2)
expect_equal(diag(mat), iden)
lis <- as.list(tstat, sort = TRUE, diag = FALSE)
expect_equivalent(lengths(lis), rep(ndoc(mt2) - 1, ndoc(mt2)))
expect_identical(names(lis), names(sapply(lis, "[[", 1)))
# in list, item not compared to itself
expect_true(all(sapply(seq_along(lis), function(y) ! names(lis[y]) %in% names(y))))
# upper = FALSE, diag = TRUE
tstat <- textstat_simil(mt2, margin = "documents")
expect_equal(
nrow(as.data.frame(tstat, upper = FALSE, diag = TRUE)),
(nrow(mt2) ^ 2 - ndoc(mt2)) / 2 + ndoc(mt2)
)
mat <- as.matrix(tstat)
# expect_true(all(is.na(mat[upper.tri(mat)])))
# in matrix, diagonal is 1.0
iden <- rep(1, ndoc(mt2)); names(iden) <- docnames(mt2)
expect_equal(diag(as.matrix(tstat)), iden)
# in matrix, lower is NA
lis <- as.list(tstat, sort = TRUE, diag = TRUE)
lislen <- rep(ndoc(mt2), ndoc(mt2))
names(lislen) <- docnames(mt2)
expect_equivalent(lengths(lis), rep(ndoc(mt2), ndoc(mt2)))
# in list, sorted first item is comparison to itself
expect_identical(names(lis), names(sapply(lis, "[[", 1)))
expect_equal(iden, sapply(lis, "[[", 1))
# upper = FALSE, diag = FALSE
tstat <- textstat_simil(mt2, margin = "documents")
expect_equal(
nrow(as.data.frame(tstat, upper = FALSE, diag = FALSE)),
(nrow(mt2) ^ 2 - ndoc(mt2)) / 2
)
mat <- as.matrix(tstat)
loweranddiag <- upper.tri(mat)
diag(loweranddiag) <- TRUE
# expect_true(all(is.na(mat[upper.tri(mat)])))
# in matrix, diagonal is 1.0
iden <- rep(1, ndoc(mt2))
names(iden) <- docnames(mt2)
expect_equal(diag(mat), iden)
lis <- as.list(tstat, sort = TRUE, diag = FALSE)
expect_equivalent(lengths(lis), rep(ndoc(mt2) - 1, ndoc(mt2)))
# in list, item not compared to itself
expect_true(all(sapply(seq_along(lis), function(y) ! names(lis[y]) %in% names(y))))
})
test_that("as.list.texstat_simil() is robust", {
expect_error(
as.list(textstat_simil(mt), n = 0),
"n must be 1 or greater"
)
expect_equivalent(
lengths(as.list(textstat_simil(mt), n = 2)),
rep(2, ndoc(mt))
)
expect_equivalent(
lengths(as.list(textstat_simil(mt), n = ndoc(mt) + 20, diag = TRUE)),
rep(ndoc(mt), ndoc(mt))
)
expect_warning(
as.list(textstat_simil(mt), n = 2, sort = FALSE),
"ignoring n when sorted = FALSE"
)
})
test_that("as.list.textstat_simil works with features margin", {
tstat <- textstat_simil(mt, y = mt[, c("world", "freedom")],
method = "cosine", margin = "features")
lis <- as.list(tstat, n = 5, diag = FALSE)
expect_equal(
sapply(lis, head, 1),
c("world.today" = 0.952, "freedom.independence" = 0.937),
tol = .01
)
expect_identical(names(lis), c("world", "freedom"))
tstat <- textstat_simil(mt, y = mt[, "freedom"],
method = "cosine", margin = "features")
lis <- as.list(tstat, n = 5, diag = TRUE)
expect_equal(
sapply(lis, head, 1),
c("freedom.freedom" = 1)
)
})
test_that("as.data.frame.textstat_simildist works with selection", {
mt2 <- mt[6:10, ]
tstat <- textstat_simil(mt2, y = mt[c("2017-Trump", "2001-Bush"), ], method = "cosine")
expect_equal(
as.character(as.data.frame(tstat, diag = FALSE, upper = FALSE)$document2),
c(rep("2017-Trump", 4), rep("2001-Bush", 4))
)
expect_equal(
as.character(as.data.frame(tstat, diag = TRUE, upper = FALSE)$document2),
c(rep("2017-Trump", 5), rep("2001-Bush", 5))
)
suppressWarnings(expect_equal(
as.character(as.data.frame(tstat, diag = FALSE, upper = TRUE)$document2),
c(rep("2017-Trump", 4), rep("2001-Bush", 4))
))
suppressWarnings(expect_equal(
as.character(as.data.frame(tstat, diag = TRUE, upper = TRUE)$document2),
c(rep("2017-Trump", 5), rep("2001-Bush", 5))
))
expect_warning(
as.data.frame(tstat, upper = TRUE),
"upper = TRUE has no effect when columns have been selected"
)
expect_identical(
names(as.data.frame(textstat_simil(mt2, method = "cosine")))[3],
"cosine"
)
expect_identical(
names(as.data.frame(textstat_simil(mt2, method = "correlation")))[3],
"correlation"
)
expect_identical(
names(as.data.frame(textstat_dist(mt2, method = "euclidean")))[3],
"euclidean"
)
})
test_that("textstat_simil validator works", {
expect_error(
textstat_simil(data_dfm_lbgexample, min_simil = -1.1),
"min_simil must range from -1.0 to 1.0"
)
})
test_that("textstat_simil show/head/tail methods work", {
# skip("until this is separate from quanteda")
expect_output(
show(textstat_simil(data_dfm_lbgexample, method = "cosine")),
"textstat_simil object;"
)
expect_equal(
as.matrix(head(textstat_simil(data_dfm_lbgexample, method = "cosine"), n = 3)),
as.matrix(textstat_simil(data_dfm_lbgexample, method = "cosine"))[1:3, ]
)
expect_equal(
as.matrix(tail(textstat_simil(data_dfm_lbgexample, method = "cosine"), n = 3)),
as.matrix(textstat_simil(data_dfm_lbgexample, method = "cosine"))[4:6, ]
)
})
test_that("min_simil argument works", {
tstat <- textstat_simil(mt, method = "cosine", min_simil = 0.98)
expect_output(
show(tstat),
"1.000 0.982 . . ",
fixed = TRUE
)
expect_equal(
as.data.frame(tstat, diag = FALSE, upper = FALSE),
data.frame(document1 = factor(c("1981-Reagan"),
levels = rownames(tstat)),
document2 = factor(c("1985-Reagan"),
levels = rownames(tstat)),
cosine = c(0.9817)),
tol = .0001
)
expect_equal(
as.data.frame(tstat, diag = FALSE, upper = TRUE),
data.frame(document1 = factor(c("1985-Reagan", "1981-Reagan"),
levels = rownames(tstat)),
document2 = factor(c("1981-Reagan", "1985-Reagan"),
levels = rownames(tstat)),
cosine = c(0.9817, 0.9817)),
tol = .0001
)
expect_equal(
as.list(tstat, diag = FALSE),
list("1981-Reagan" = c("1985-Reagan" = 0.981771),
"1985-Reagan" = c("1981-Reagan" = 0.981771)),
tol = .001
)
expect_equal(
sapply(as.list(tstat, diag = TRUE), "[", 1),
structure(rep(1, ndoc(mt)),
names = paste(docnames(mt), docnames(mt), sep = "."))
)
})
test_that("test that min_simil coercion to matrix works as expected", {
library("quanteda")
dfmat <- corpus_subset(data_corpus_inaugural, Year > 2000) %>%
tokens(remove_punct = TRUE) %>%
tokens_remove(stopwords("english")) %>%
dfm()
tstat1 <- textstat_simil(dfmat, method = "cosine", margin = "documents", min_simil = 0.6)
expect_equal(
as.matrix(tstat1)[3, 4:5],
c("2013-Obama" = 0.6373, "2017-Trump" = NA),
tol = .0001
)
expect_equal(
as.matrix(tstat1, omitted = 0)[3, 4:5],
c("2013-Obama" = 0.6373, "2017-Trump" = 0),
tol = .0001
)
tstat2 <- textstat_simil(dfmat, y = dfmat[c("2009-Obama", "2013-Obama"), ],
method = "cosine", margin = "documents", min_simil = 0.6)
expect_equal(
as.matrix(tstat2)[3:5, 1],
c("2009-Obama" = 1, "2013-Obama" = 0.6373, "2017-Trump" = NA),
tol = .0001
)
expect_equal(
as.matrix(tstat2, omitted = 0)[3:5, 1],
c("2009-Obama" = 1, "2013-Obama" = 0.6373, "2017-Trump" = 0),
tol = .0001
)
})
test_that("y is working in the same way as selection (#1714)", {
suppressWarnings({
expect_identical(textstat_simil(mt, selection = c("2009-Obama", "2013-Obama"),
margin = "documents"),
textstat_simil(mt, mt[c("2009-Obama", "2013-Obama"), ],
margin = "documents"))
expect_identical(textstat_simil(mt, selection = c("world", "freedom"),
margin = "features"),
textstat_simil(mt, mt[, c("world", "freedom")],
margin = "features"))
expect_identical(textstat_dist(mt, selection = c("2009-Obama", "2013-Obama"),
margin = "documents"),
textstat_dist(mt, mt[c("2009-Obama", "2013-Obama"), ],
margin = "documents"))
expect_identical(textstat_dist(mt, selection = c("world", "freedom"),
margin = "features"),
textstat_dist(mt, mt[, c("world", "freedom")],
margin = "features"))
})
})
test_that("diag2na is working", {
mat1 <- Matrix::Matrix(1:9, nrow = 3,
dimnames = list(c("a", "b", "c"), c("b", "c", "d")))
expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat1, "TsparseMatrix"), "dgTMatrix"))),
matrix(c(1, NA, 3, 4, 5, NA, 7, 8, 9), nrow = 3,
dimnames = list(c("a", "b", "c"), c("b", "c", "d"))))
mat2 <- Matrix::Matrix(1:9, nrow = 3,
dimnames = list(c("a", "b", "c"), c("d", "c", "b")))
expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat2, "TsparseMatrix"), "dgTMatrix"))),
matrix(c(1, 2, 3, 4, 5, NA, 7, NA, 9), nrow = 3,
dimnames = list(c("a", "b", "c"), c("d", "c", "b"))))
mat3 <- Matrix::Matrix(1:6, nrow = 3,
dimnames = list(c("a", "b", "c"), c("c", "b")))
expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat3, "TsparseMatrix"), "dgTMatrix"))),
matrix(c(1, 2, NA, 4, NA, 6), nrow = 3,
dimnames = list(c("a", "b", "c"), c("c", "b"))))
mat4 <- Matrix::forceSymmetric(mat1)
expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat4, "TsparseMatrix"), "dsTMatrix"))),
matrix(c(NA, 4, 7, 4, NA, 8, 7, 8, NA), nrow = 3,
dimnames = list(c("b", "c", "d"), c("b", "c", "d"))))
mat5 <- Matrix::Matrix(rep(0, 9), nrow = 3,
dimnames = list(c("a", "b", "c"), c("b", "c", "d")))
expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat5, "TsparseMatrix"), "generalMatrix"))),
matrix(c(0, NA, 0, 0, 0, NA, 0, 0, 0), nrow = 3,
dimnames = list(c("a", "b", "c"), c("b", "c", "d"))))
# expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat5, "TsparseMatrix"), "dgTMatrix"))),
# matrix(c(0, NA, 0, 0, 0, NA, 0, 0, 0), nrow = 3,
# dimnames = list(c("a", "b", "c"), c("b", "c", "d"))))
})
test_that("symmetric class is correctly given", {
# skip("Until these classes are separate from quanteda")
dist1 <- textstat_dist(mt)
expect_identical(
Matrix::tril(dist1),
t(Matrix::triu(dist1))
)
dist2 <- textstat_dist(mt, mt)
expect_identical(
Matrix::tril(dist2),
t(Matrix::triu(dist2))
)
siml1 <- textstat_simil(mt)
expect_identical(
Matrix::tril(siml1),
t(Matrix::triu(siml1))
)
siml2 <- textstat_simil(mt, mt)
expect_identical(
Matrix::tril(siml2),
t(Matrix::triu(siml2))
)
})
test_that("as.data.frame works with subsetted object", {
levs <- c(paste0("R", 1:5), "V1")
simildf <- textstat_simil(data_dfm_lbgexample[-1, ], data_dfm_lbgexample[1, ]) %>%
as.data.frame()
expect_equal(
simildf,
data.frame(document1 = factor(levs[-1], levels = levs),
document2 = factor(rep("R1", 5), levels = levs),
correlation = c(0.18, -0.29, -0.32, -0.32, -0.12)),
tol = .01
)
expect_identical(levels(simildf$document1), levels(simildf$document1))
simildf <- textstat_simil(data_dfm_lbgexample, data_dfm_lbgexample[c(1, 3), ]) %>%
as.data.frame()
levs2 <- levs[c(1, 3, 2, 4:6)]
expect_identical(
simildf[, -3],
data.frame(document1 = factor(c(levs[-1], levs[-3]), levels = levs2),
document2 = factor(c(rep("R1", 5), c(rep("R3", 5))), levels = levs2))
)
expect_identical(levels(simildf$document1), levels(simildf$document1))
})
test_that("hamman still works", {
expect_identical(
textstat_simil(data_dfm_lbgexample, method = "hamman"),
textstat_simil(data_dfm_lbgexample, method = "hamann")
)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.