library("quanteda")
test_mt <- tokens(corpus_subset(data_corpus_inaugural, Year > 1980)) %>%
tokens_remove(stopwords("en")) %>%
tokens_wordstem("en") %>%
dfm() %>%
dfm_trim(min_termfreq = 5)
test_simil <- function(x, method, margin, ignore_upper = FALSE, ...) {
if (margin == "documents") {
by_rows <- TRUE
selection <- "1985-Reagan"
y <- x[selection, ]
} else {
by_rows <- FALSE
selection <- "soviet"
y <- x[, selection]
}
s1 <- as.matrix(textstat_proxy(x, method = method, margin = margin, ...))
s2 <- as.matrix(proxy::simil(as.matrix(x),
method = method, by_rows = by_rows, diag = TRUE, ...))
diag(s1) <- NA
diag(s2) <- NA
if (ignore_upper)
s1[upper.tri(s1, TRUE)] <- s2[upper.tri(s2, TRUE)] <- 0
expect_equal(s1, s2, tolerance = 0.001)
s3 <- as.matrix(textstat_proxy(x, y, method = method, margin = margin, ...))
s4 <- as.matrix(proxy::simil(as.matrix(x), as.matrix(y),
method = method, by_rows = by_rows, diag = TRUE, ...))
if (ignore_upper)
s3[upper.tri(s3, TRUE)] <- s4[upper.tri(s4, TRUE)] <- 0
expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
}
test_dist <- function(x, method, margin, ignore_upper = FALSE, ...) {
if (margin == "documents") {
by_rows <- TRUE
selection <- "1985-Reagan"
y <- x[selection, ]
} else {
by_rows <- FALSE
selection <- "soviet"
y <- x[, selection]
}
s1 <- as.matrix(textstat_proxy(x, method = method, margin = margin, ...))
s2 <- as.matrix(proxy::dist(as.matrix(x),
method = method, by_rows = by_rows, diag = TRUE, ...))
if (ignore_upper)
s1[upper.tri(s1, TRUE)] <- s2[upper.tri(s2, TRUE)] <- 0
expect_equal(s1, s2, tolerance = 0.001)
s3 <- as.matrix(textstat_proxy(x, y, method = method, margin = margin, ...))
s4 <- as.matrix(proxy::dist(as.matrix(x), as.matrix(y),
method = method, by_rows = by_rows, diag = TRUE, ...))
if (ignore_upper)
s3[upper.tri(s3, TRUE)] <- s4[upper.tri(s4, TRUE)] <- 0
expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
}
# Similarity measures -------------------------------------------
test_that("test textstat_proxy cosine similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "cosine", "documents")
test_simil(test_mt, "cosine", "features")
})
test_that("test textstat_proxy correlation similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "correlation", "documents")
test_simil(test_mt, "correlation", "features")
})
test_that("test textstat_proxy jaccard similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "jaccard", "documents")
test_simil(test_mt, "jaccard", "features")
})
test_that("test textstat_proxy ejaccard similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "ejaccard", "documents")
test_simil(test_mt, "ejaccard", "features")
})
test_that("test textstat_proxy dice similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "dice", "documents")
test_simil(test_mt, "dice", "features")
})
test_that("test textstat_proxy edice similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "edice", "documents")
test_simil(test_mt, "edice", "features")
})
test_that("test textstat_proxy simple matching similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "simple matching", "documents")
test_simil(test_mt, "simple matching", "features")
})
test_that("test textstat_proxy hamann similarity", {
skip_if_not_installed("proxy")
test_simil(test_mt, "hamman", "documents")
test_simil(test_mt, "hamman", "features")
expect_identical(
textstat_simil(test_mt, method = "hamman"),
textstat_simil(test_mt, method = "hamann")
)
})
# Distance measures -------------------------------------------
test_that("test textstat_proxy euclidean distance", {
skip_if_not_installed("proxy")
test_dist(test_mt, "euclidean", "documents")
test_dist(test_mt, "euclidean", "features")
})
# test_that("test textstat_proxy chisquared distance on documents", {
# skip_if_not_installed("ExPosition")
# s1 <- as.matrix(textstat_proxy(test_mt, method = "chisquared", margin = "documents"))
# s2 <- as.matrix(ExPosition::chi2Dist(as.matrix(test_mt))$D)
# names(dimnames(s2)) <- NULL
# expect_equal(s1, s2, tolerance = 0.001)
#
# s3 <- as.matrix(textstat_proxy(test_mt, "1985-Reagan", method = "chisquared", margin = "documents"))
# s4 <- as.matrix(ExPosition::chi2Dist(as.matrix(test_mt))$D[,"1985-Reagan"])
# names(dimnames(s4)) <- NULL
# expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
# })
#
# test_that("test textstat_proxy chisquared distance on features", {
# skip_if_not_installed("ExPosition")
# s1 <- as.matrix(textstat_proxy(test_mt, method = "chisquared", margin = "features"))
# s2 <- as.matrix(ExPosition::chi2Dist(t(as.matrix(test_mt)))$D)
# names(dimnames(s2)) <- NULL
# expect_equal(s1, s2, tolerance = 0.001)
#
# s3 <- as.matrix(textstat_proxy(test_mt, "soviet", method = "chisquared", margin = "features"))
# s4 <- as.matrix(ExPosition::chi2Dist(t(as.matrix(test_mt)))$D[,"soviet"])
# names(dimnames(s4)) <- NULL
# expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
# })
test_that("test kullback kullback similarity", {
skip_if_not_installed("proxy")
# make dense matrix to avoide Inf in proxy::dist
test_mt_dense <- test_mt + 1
# proxy::dist() also incorrectly produces symmetric matrix
test_dist(test_mt_dense, "kullback", "documents", ignore_upper = TRUE)
test_dist(test_mt_dense, "kullback", "features", ignore_upper = TRUE)
})
test_that("test textstat_proxy manhattan distance", {
skip_if_not_installed("proxy")
test_dist(test_mt, "manhattan", "documents")
test_dist(test_mt, "manhattan", "features")
})
test_that("test textstat_proxy maximum distance", {
skip_if_not_installed("proxy")
test_dist(test_mt, "maximum", "documents")
test_dist(test_mt, "maximum", "features")
})
test_that("test textstat_proxy canberra distance", {
skip_if_not_installed("proxy")
# proxyC and proxy disagree when sparsity is high
test_dist(as.dfm(test_mt + 1), "canberra", "documents")
test_dist(as.dfm(test_mt + 1), "canberra", "features")
})
test_that("test textstat_proxy minkowski distance", {
skip_if_not_installed("proxy")
test_dist(test_mt, "minkowski", "documents", p = 0.1)
test_dist(test_mt, "minkowski", "features", p = 0.1)
test_dist(test_mt, "minkowski", "documents", p = 2)
test_dist(test_mt, "minkowski", "features", p = 2)
test_dist(test_mt, "minkowski", "documents", p = 10)
test_dist(test_mt, "minkowski", "features", p = 10)
})
test_that("as.matrix works as expected", {
txt <- c("Bacon ipsum dolor amet tenderloin hamburger bacon t-bone,",
"Tenderloin turducken corned beef bacon.",
"Burgdoggen venison tail, hamburger filet mignon capicola meatloaf pig pork belly.")
mt <- dfm(tokens(txt))
expect_equivalent(diag(as.matrix(textstat_proxy(mt))),
rep(1, 3))
})
test_that("textstat_proxy stops as expected for methods not supported", {
expect_error(textstat_proxy(test_mt, method = "Yule"))
})
test_that("textstat_proxy works on zero-frequency features", {
d1 <- dfm(tokens(c("a b c", "a b c d")))
d2 <- dfm(tokens(letters[1:6]))
dtest <- dfm_match(d1, featnames(d2))
expect_equal(
textstat_proxy(dtest, method = "cosine")[2, 1], 0.866,
tolerance = 0.001
)
expect_equal(
textstat_proxy(dtest, method = "correlation")[2, 1], 0.707,
tolerance = 0.001
)
})
test_that("textstat_proxy works on zero-feature documents (#952)", {
corp <- corpus(c("a b c c", "b c d", "a"),
docvars = data.frame(grp = factor(c("A", "A", "B"), levels = LETTERS[1:3])))
mt <- dfm(tokens(corp))
mt <- dfm_group(mt, groups = corp$grp, fill = TRUE)
expect_equal(
as.numeric(textstat_proxy(mt, method = "cosine")[1, ]),
c(1, 0.2581, 0),
tolerance = 0.001
)
expect_equal(
as.numeric(textstat_proxy(mt, method = "correlation")[1, ]),
c(1, -0.5222, 0),
tolerance = 0.001
)
})
test_that("textstat_proxy works with non-intersecting documents or features", {
toks <- tokens(c(doc1 = "a b c d e", doc2 = "b c f e", doc3 = "c d f", doc4 = "f g h"), remove_punct = TRUE)
mt <- dfm(toks)
sim1 <- textstat_proxy(mt, margin = "features")
expect_equal(as.matrix(textstat_proxy(mt[, c("a", "b")], mt[, c("c", "d", "e")], margin = "features")),
as.matrix(sim1[c("a", "b"), c("c", "d", "e"), drop = FALSE]))
sim2 <- textstat_proxy(mt, margin = "documents")
expect_equal(as.matrix(textstat_proxy(mt[c("doc1", "doc2"), ], mt[c("doc4"), ], margin = "documents")),
as.matrix(sim2[c("doc1", "doc2"), c("doc4"), drop = FALSE]))
})
test_that("raises error when dfm is empty (#1419)", {
mt <- dfm_trim(data_dfm_lbgexample, 1000)
expect_silent(textstat_proxy(mt))
expect_silent(textstat_proxy(mt, mt))
})
test_that("raises error when p is smaller than 1", {
expect_error(textstat_proxy(test_mt, method = "minkowski", p = 0))
expect_error(textstat_proxy(test_mt, method = "minkowski", p = -1))
})
test_that("sparse objects are of expected class and occur when expected", {
expect_is(textstat_proxy(test_mt),
"dsTMatrix")
expect_is(textstat_proxy(test_mt, min_proxy = 10),
"dsTMatrix")
expect_is(textstat_proxy(test_mt, rank = 2),
"dgTMatrix")
expect_is(textstat_proxy(test_mt, method = "kullback"),
"dgTMatrix")
})
test_that("rank argument is working", {
expect_error(textstat_proxy(test_mt, rank = 0),
"rank must be great than or equal to 1")
expect_equal(as.matrix(textstat_proxy(test_mt)),
as.matrix(textstat_proxy(test_mt, rank = 100)))
expect_equal(as.matrix(textstat_proxy(test_mt, rank = 3)),
apply(as.matrix(textstat_proxy(test_mt)), 2,
function(x) ifelse(x >= sort(x, decreasing = TRUE)[3], x, 0)))
})
test_that("record zeros even in the sparse matrix", {
toks <- tokens(c(doc1 = "a b c", doc2 = "d e f"), remove_punct = TRUE)
mt <- dfm(toks)
expect_true(any(textstat_proxy(mt)@x == 0))
expect_true(any(textstat_proxy(mt, method = "cosine")@x == 0))
expect_true(any(textstat_proxy(mt, method = "cosine", min_proxy = -0.5)@x == 0))
expect_true(any(textstat_proxy(mt, method = "cosine", rank = 2)@x == 0))
expect_true(any(textstat_proxy(mt, method = "dice")@x == 0))
})
test_that("textstat_proxy raises error when documents are different for feature similarity", {
expect_silent(
textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], margin = "features")
)
expect_error(textstat_proxy(test_mt[1:5, ], test_mt[6:10, ], margin = "features"),
"x and y must contain the same documents")
})
test_that("textstat_proxy raises error when y is not a dfm", {
expect_error(textstat_proxy(test_mt[1:5, ], 6:10, margin = "features"),
"y must be a dfm")
})
test_that("use_na is working", {
mt <- as.dfm(matrix(c(rep(0, 4),
rep(1, 4),
c(1, 3, 2, 0)), ncol = 3))
cos1 <- textstat_proxy(mt, margin = "features", method = "cosine", use_na = TRUE)
cor1 <- textstat_proxy(mt, margin = "features", method = "correlation", use_na = TRUE)
euc1 <- textstat_proxy(mt, margin = "features", method = "euclidean", use_na = TRUE)
expect_equal(sum(is.na(cos1)), 5)
expect_equal(sum(is.na(cor1)), 8)
expect_equal(sum(is.na(euc1)), 0)
cos2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "cosine", use_na = TRUE)
cor2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "correlation", use_na = TRUE)
euc2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "euclidean", use_na = TRUE)
expect_equal(sum(is.na(cos2)), 1)
expect_equal(sum(is.na(cor2)), 2)
expect_equal(sum(is.na(euc2)), 0)
})
test_that("no value is greater than 1.0 (#1543)", {
cos1 <- textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], method = "cosine")
expect_equal(sum(cos1 > 1), 0)
cor1 <- textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], method = "correlation")
expect_true(all(cor1 <= 1.000000001))
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.