quanteda.textstat: Textual Statistics for the Quantitative Analysis of Textual Data

library("quanteda")

test_mt <- tokens(corpus_subset(data_corpus_inaugural, Year > 1980)) %>%
    tokens_remove(stopwords("en")) %>%
    tokens_wordstem("en") %>%
    dfm() %>%
    dfm_trim(min_termfreq = 5)

test_simil <- function(x, method, margin, ignore_upper = FALSE, ...) {
    if (margin == "documents") {
        by_rows <- TRUE
        selection <- "1985-Reagan"
        y <- x[selection, ]
    } else {
        by_rows <- FALSE
        selection <- "soviet"
        y <- x[, selection]
    }

    s1 <- as.matrix(textstat_proxy(x, method = method, margin = margin, ...))
    s2 <- as.matrix(proxy::simil(as.matrix(x),
                                method = method, by_rows = by_rows, diag = TRUE, ...))
    diag(s1) <- NA
    diag(s2) <- NA

    if (ignore_upper)
        s1[upper.tri(s1, TRUE)] <- s2[upper.tri(s2, TRUE)] <- 0
    expect_equal(s1, s2, tolerance = 0.001)

    s3 <- as.matrix(textstat_proxy(x, y, method = method, margin = margin, ...))
    s4 <- as.matrix(proxy::simil(as.matrix(x), as.matrix(y),
                                method = method, by_rows = by_rows, diag = TRUE, ...))
    if (ignore_upper)
        s3[upper.tri(s3, TRUE)] <- s4[upper.tri(s4, TRUE)] <- 0
    expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
}

test_dist <- function(x, method, margin, ignore_upper = FALSE, ...) {

    if (margin == "documents") {
        by_rows <- TRUE
        selection <- "1985-Reagan"
        y <- x[selection, ]
    } else {
        by_rows <- FALSE
        selection <- "soviet"
        y <- x[, selection]
    }

    s1 <- as.matrix(textstat_proxy(x, method = method, margin = margin, ...))
    s2 <- as.matrix(proxy::dist(as.matrix(x),
                                method = method, by_rows = by_rows, diag = TRUE, ...))

    if (ignore_upper)
        s1[upper.tri(s1, TRUE)] <- s2[upper.tri(s2, TRUE)] <- 0
    expect_equal(s1, s2, tolerance = 0.001)

    s3 <- as.matrix(textstat_proxy(x, y, method = method, margin = margin, ...))
    s4 <- as.matrix(proxy::dist(as.matrix(x), as.matrix(y),
                                method = method, by_rows = by_rows, diag = TRUE, ...))
    if (ignore_upper)
        s3[upper.tri(s3, TRUE)] <- s4[upper.tri(s4, TRUE)] <- 0
    expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
}

# Similarity measures -------------------------------------------

test_that("test textstat_proxy cosine similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "cosine", "documents")
    test_simil(test_mt, "cosine", "features")
})

test_that("test textstat_proxy correlation similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "correlation", "documents")
    test_simil(test_mt, "correlation", "features")
})

test_that("test textstat_proxy jaccard similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "jaccard", "documents")
    test_simil(test_mt, "jaccard", "features")
})

test_that("test textstat_proxy ejaccard similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "ejaccard", "documents")
    test_simil(test_mt, "ejaccard", "features")
})

test_that("test textstat_proxy dice similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "dice", "documents")
    test_simil(test_mt, "dice", "features")
})

test_that("test textstat_proxy edice similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "edice", "documents")
    test_simil(test_mt, "edice", "features")
})

test_that("test textstat_proxy simple matching similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "simple matching", "documents")
    test_simil(test_mt, "simple matching", "features")
})

test_that("test textstat_proxy hamann similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "hamman", "documents")
    test_simil(test_mt, "hamman", "features")

    expect_identical(
        textstat_simil(test_mt, method = "hamman"),
        textstat_simil(test_mt, method = "hamann")
    )
})


# Distance measures -------------------------------------------

test_that("test textstat_proxy euclidean distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "euclidean", "documents")
    test_dist(test_mt, "euclidean", "features")
})

# test_that("test textstat_proxy chisquared distance on documents", {
#     skip_if_not_installed("ExPosition")
#     s1 <- as.matrix(textstat_proxy(test_mt, method = "chisquared", margin = "documents"))
#     s2 <- as.matrix(ExPosition::chi2Dist(as.matrix(test_mt))$D)
#     names(dimnames(s2)) <- NULL
#     expect_equal(s1, s2, tolerance = 0.001)
#
#     s3 <- as.matrix(textstat_proxy(test_mt, "1985-Reagan", method = "chisquared", margin = "documents"))
#     s4 <- as.matrix(ExPosition::chi2Dist(as.matrix(test_mt))$D[,"1985-Reagan"])
#     names(dimnames(s4)) <- NULL
#     expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
# })
#
# test_that("test textstat_proxy chisquared distance on features", {
#     skip_if_not_installed("ExPosition")
#     s1 <- as.matrix(textstat_proxy(test_mt, method = "chisquared", margin = "features"))
#     s2 <- as.matrix(ExPosition::chi2Dist(t(as.matrix(test_mt)))$D)
#     names(dimnames(s2)) <- NULL
#     expect_equal(s1, s2, tolerance = 0.001)
#
#     s3 <- as.matrix(textstat_proxy(test_mt, "soviet", method = "chisquared", margin = "features"))
#     s4 <- as.matrix(ExPosition::chi2Dist(t(as.matrix(test_mt)))$D[,"soviet"])
#     names(dimnames(s4)) <- NULL
#     expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
# })


test_that("test kullback kullback similarity", {
    skip_if_not_installed("proxy")
    # make dense matrix to avoide Inf in proxy::dist
    test_mt_dense <- test_mt + 1
    # proxy::dist() also incorrectly produces symmetric matrix
    test_dist(test_mt_dense, "kullback", "documents", ignore_upper = TRUE)
    test_dist(test_mt_dense, "kullback", "features", ignore_upper = TRUE)
})

test_that("test textstat_proxy manhattan distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "manhattan", "documents")
    test_dist(test_mt, "manhattan", "features")
})

test_that("test textstat_proxy maximum distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "maximum", "documents")
    test_dist(test_mt, "maximum", "features")
})

test_that("test textstat_proxy canberra distance", {
    skip_if_not_installed("proxy")
    # proxyC and proxy disagree when sparsity is high
    test_dist(as.dfm(test_mt + 1), "canberra", "documents")
    test_dist(as.dfm(test_mt + 1), "canberra", "features")
})

test_that("test textstat_proxy minkowski distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "minkowski", "documents", p = 0.1)
    test_dist(test_mt, "minkowski", "features", p = 0.1)
    test_dist(test_mt, "minkowski", "documents", p = 2)
    test_dist(test_mt, "minkowski", "features", p = 2)
    test_dist(test_mt, "minkowski", "documents", p = 10)
    test_dist(test_mt, "minkowski", "features", p = 10)
})

test_that("as.matrix works as expected", {
    txt <- c("Bacon ipsum dolor amet tenderloin hamburger bacon t-bone,",
             "Tenderloin turducken corned beef bacon.",
             "Burgdoggen venison tail, hamburger filet mignon capicola meatloaf pig pork belly.")
    mt <- dfm(tokens(txt))
    expect_equivalent(diag(as.matrix(textstat_proxy(mt))),
                      rep(1, 3))
})

test_that("textstat_proxy stops as expected for methods not supported", {
    expect_error(textstat_proxy(test_mt, method = "Yule"))
})

test_that("textstat_proxy works on zero-frequency features", {
    d1 <- dfm(tokens(c("a b c", "a b c d")))
    d2 <- dfm(tokens(letters[1:6]))
    dtest <- dfm_match(d1, featnames(d2))

    expect_equal(
        textstat_proxy(dtest, method = "cosine")[2, 1], 0.866,
        tolerance = 0.001
    )
    expect_equal(
        textstat_proxy(dtest, method = "correlation")[2, 1], 0.707,
        tolerance = 0.001
    )
})

test_that("textstat_proxy works on zero-feature documents (#952)", {
    corp <- corpus(c("a b c c", "b c d", "a"),
                             docvars = data.frame(grp = factor(c("A", "A", "B"), levels = LETTERS[1:3])))
    mt <- dfm(tokens(corp))
    mt <- dfm_group(mt, groups = corp$grp, fill = TRUE)

    expect_equal(
        as.numeric(textstat_proxy(mt, method = "cosine")[1, ]),
        c(1, 0.2581, 0),
        tolerance = 0.001
    )
    expect_equal(
        as.numeric(textstat_proxy(mt, method = "correlation")[1, ]),
        c(1, -0.5222, 0),
        tolerance = 0.001
    )
})

test_that("textstat_proxy works with non-intersecting documents or features", {

    toks <- tokens(c(doc1 = "a b c d e", doc2 = "b c f e", doc3 = "c d f", doc4 = "f g h"), remove_punct = TRUE)
    mt <- dfm(toks)
    sim1 <- textstat_proxy(mt, margin = "features")
    expect_equal(as.matrix(textstat_proxy(mt[, c("a", "b")], mt[, c("c", "d", "e")], margin = "features")),
                 as.matrix(sim1[c("a", "b"), c("c", "d", "e"), drop = FALSE]))
    sim2 <- textstat_proxy(mt, margin = "documents")
    expect_equal(as.matrix(textstat_proxy(mt[c("doc1", "doc2"), ], mt[c("doc4"), ], margin = "documents")),
                 as.matrix(sim2[c("doc1", "doc2"), c("doc4"), drop = FALSE]))
})

test_that("raises error when dfm is empty (#1419)", {
    mt <- dfm_trim(data_dfm_lbgexample, 1000)
    expect_silent(textstat_proxy(mt))
    expect_silent(textstat_proxy(mt, mt))
})

test_that("raises error when p is smaller than 1", {
    expect_error(textstat_proxy(test_mt, method = "minkowski", p = 0))
    expect_error(textstat_proxy(test_mt, method = "minkowski", p = -1))
})

test_that("sparse objects are of expected class and occur when expected", {
    expect_is(textstat_proxy(test_mt),
              "dsTMatrix")
    expect_is(textstat_proxy(test_mt, min_proxy = 10),
              "dsTMatrix")
    expect_is(textstat_proxy(test_mt, rank = 2),
              "dgTMatrix")
    expect_is(textstat_proxy(test_mt, method = "kullback"),
              "dgTMatrix")
})

test_that("rank argument is working", {
    expect_error(textstat_proxy(test_mt, rank = 0),
                 "rank must be great than or equal to 1")

    expect_equal(as.matrix(textstat_proxy(test_mt)),
                 as.matrix(textstat_proxy(test_mt, rank = 100)))

    expect_equal(as.matrix(textstat_proxy(test_mt, rank = 3)),
                 apply(as.matrix(textstat_proxy(test_mt)), 2,
                       function(x) ifelse(x >= sort(x, decreasing = TRUE)[3], x, 0)))
})

test_that("record zeros even in the sparse matrix", {
    toks <- tokens(c(doc1 = "a b c", doc2 = "d e f"), remove_punct = TRUE)
    mt <- dfm(toks)
    expect_true(any(textstat_proxy(mt)@x == 0))
    expect_true(any(textstat_proxy(mt, method = "cosine")@x == 0))
    expect_true(any(textstat_proxy(mt, method = "cosine", min_proxy = -0.5)@x == 0))
    expect_true(any(textstat_proxy(mt, method = "cosine", rank = 2)@x == 0))
    expect_true(any(textstat_proxy(mt, method = "dice")@x == 0))
})

test_that("textstat_proxy raises error when documents are different for feature similarity", {
    expect_silent(
        textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], margin = "features")
    )
    expect_error(textstat_proxy(test_mt[1:5, ], test_mt[6:10, ], margin = "features"),
                 "x and y must contain the same documents")
})

test_that("textstat_proxy raises error when y is not a dfm", {
    expect_error(textstat_proxy(test_mt[1:5, ], 6:10, margin = "features"),
                 "y must be a dfm")
})

test_that("use_na is working", {
    mt <- as.dfm(matrix(c(rep(0, 4),
                          rep(1, 4),
                          c(1, 3, 2, 0)), ncol = 3))

    cos1 <- textstat_proxy(mt, margin = "features", method = "cosine", use_na = TRUE)
    cor1 <- textstat_proxy(mt, margin = "features", method = "correlation", use_na = TRUE)
    euc1 <- textstat_proxy(mt, margin = "features", method = "euclidean", use_na = TRUE)
    expect_equal(sum(is.na(cos1)), 5)
    expect_equal(sum(is.na(cor1)), 8)
    expect_equal(sum(is.na(euc1)), 0)

    cos2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "cosine", use_na = TRUE)
    cor2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "correlation", use_na = TRUE)
    euc2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "euclidean", use_na = TRUE)
    expect_equal(sum(is.na(cos2)), 1)
    expect_equal(sum(is.na(cor2)), 2)
    expect_equal(sum(is.na(euc2)), 0)
})

test_that("no value is greater than 1.0 (#1543)", {
    cos1 <- textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], method = "cosine")
    expect_equal(sum(cos1 > 1), 0)
    cor1 <- textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], method = "correlation")
    expect_true(all(cor1 <= 1.000000001))
})
quanteda/quanteda.textstat documentation built on July 16, 2025, 6:04 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
quanteda/quanteda.textstat
Textual Statistics for the Quantitative Analysis of Textual Data

tests/testthat/test-textstat_proxy.R
In quanteda/quanteda.textstat: Textual Statistics for the Quantitative Analysis of Textual Data

R Package Documentation

Browse R Packages

We want your feedback!

quanteda/quanteda.textstat Textual Statistics for the Quantitative Analysis of Textual Data

tests/testthat/test-textstat_proxy.R In quanteda/quanteda.textstat: Textual Statistics for the Quantitative Analysis of Textual Data

R Package Documentation

Browse R Packages

We want your feedback!

quanteda/quanteda.textstat
Textual Statistics for the Quantitative Analysis of Textual Data

tests/testthat/test-textstat_proxy.R
In quanteda/quanteda.textstat: Textual Statistics for the Quantitative Analysis of Textual Data