test-textstat_proxy.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

library("quanteda")

test_mt <- tokens(corpus_subset(data_corpus_inaugural, Year > 1980)) %>%
    tokens_remove(stopwords("en")) %>%
    tokens_wordstem("en") %>%
    dfm() %>%
    dfm_trim(min_termfreq = 5)

test_simil <- function(x, method, margin, ignore_upper = FALSE, ...) {
    if (margin == "documents") {
        by_rows <- TRUE
        selection <- "1985-Reagan"
        y <- x[selection, ]
    } else {
        by_rows <- FALSE
        selection <- "soviet"
        y <- x[, selection]
    }

    s1 <- as.matrix(textstat_proxy(x, method = method, margin = margin, ...))
    s2 <- as.matrix(proxy::simil(as.matrix(x),
                                method = method, by_rows = by_rows, diag = TRUE, ...))
    diag(s1) <- NA
    diag(s2) <- NA

    if (ignore_upper)
        s1[upper.tri(s1, TRUE)] <- s2[upper.tri(s2, TRUE)] <- 0
    expect_equal(s1, s2, tolerance = 0.001)

    s3 <- as.matrix(textstat_proxy(x, y, method = method, margin = margin, ...))
    s4 <- as.matrix(proxy::simil(as.matrix(x), as.matrix(y),
                                method = method, by_rows = by_rows, diag = TRUE, ...))
    if (ignore_upper)
        s3[upper.tri(s3, TRUE)] <- s4[upper.tri(s4, TRUE)] <- 0
    expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
}

test_dist <- function(x, method, margin, ignore_upper = FALSE, ...) {

    if (margin == "documents") {
        by_rows <- TRUE
        selection <- "1985-Reagan"
        y <- x[selection, ]
    } else {
        by_rows <- FALSE
        selection <- "soviet"
        y <- x[, selection]
    }

    s1 <- as.matrix(textstat_proxy(x, method = method, margin = margin, ...))
    s2 <- as.matrix(proxy::dist(as.matrix(x),
                                method = method, by_rows = by_rows, diag = TRUE, ...))

    if (ignore_upper)
        s1[upper.tri(s1, TRUE)] <- s2[upper.tri(s2, TRUE)] <- 0
    expect_equal(s1, s2, tolerance = 0.001)

    s3 <- as.matrix(textstat_proxy(x, y, method = method, margin = margin, ...))
    s4 <- as.matrix(proxy::dist(as.matrix(x), as.matrix(y),
                                method = method, by_rows = by_rows, diag = TRUE, ...))
    if (ignore_upper)
        s3[upper.tri(s3, TRUE)] <- s4[upper.tri(s4, TRUE)] <- 0
    expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
}

# Similarity measures -------------------------------------------

test_that("test textstat_proxy cosine similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "cosine", "documents")
    test_simil(test_mt, "cosine", "features")
})

test_that("test textstat_proxy correlation similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "correlation", "documents")
    test_simil(test_mt, "correlation", "features")
})

test_that("test textstat_proxy jaccard similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "jaccard", "documents")
    test_simil(test_mt, "jaccard", "features")
})

test_that("test textstat_proxy ejaccard similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "ejaccard", "documents")
    test_simil(test_mt, "ejaccard", "features")
})

test_that("test textstat_proxy dice similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "dice", "documents")
    test_simil(test_mt, "dice", "features")
})

test_that("test textstat_proxy edice similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "edice", "documents")
    test_simil(test_mt, "edice", "features")
})

test_that("test textstat_proxy simple matching similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "simple matching", "documents")
    test_simil(test_mt, "simple matching", "features")
})

test_that("test textstat_proxy hamann similarity", {
    skip_if_not_installed("proxy")
    test_simil(test_mt, "hamman", "documents")
    test_simil(test_mt, "hamman", "features")

    expect_identical(
        textstat_simil(test_mt, method = "hamman"),
        textstat_simil(test_mt, method = "hamann")
    )
})


# Distance measures -------------------------------------------

test_that("test textstat_proxy euclidean distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "euclidean", "documents")
    test_dist(test_mt, "euclidean", "features")
})

# test_that("test textstat_proxy chisquared distance on documents", {
#     skip_if_not_installed("ExPosition")
#     s1 <- as.matrix(textstat_proxy(test_mt, method = "chisquared", margin = "documents"))
#     s2 <- as.matrix(ExPosition::chi2Dist(as.matrix(test_mt))$D)
#     names(dimnames(s2)) <- NULL
#     expect_equal(s1, s2, tolerance = 0.001)
#
#     s3 <- as.matrix(textstat_proxy(test_mt, "1985-Reagan", method = "chisquared", margin = "documents"))
#     s4 <- as.matrix(ExPosition::chi2Dist(as.matrix(test_mt))$D[,"1985-Reagan"])
#     names(dimnames(s4)) <- NULL
#     expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
# })
#
# test_that("test textstat_proxy chisquared distance on features", {
#     skip_if_not_installed("ExPosition")
#     s1 <- as.matrix(textstat_proxy(test_mt, method = "chisquared", margin = "features"))
#     s2 <- as.matrix(ExPosition::chi2Dist(t(as.matrix(test_mt)))$D)
#     names(dimnames(s2)) <- NULL
#     expect_equal(s1, s2, tolerance = 0.001)
#
#     s3 <- as.matrix(textstat_proxy(test_mt, "soviet", method = "chisquared", margin = "features"))
#     s4 <- as.matrix(ExPosition::chi2Dist(t(as.matrix(test_mt)))$D[,"soviet"])
#     names(dimnames(s4)) <- NULL
#     expect_equal(as.numeric(s3), as.numeric(s4), tolerance = 0.001)
# })


test_that("test kullback kullback similarity", {
    skip_if_not_installed("proxy")
    # make dense matrix to avoide Inf in proxy::dist
    test_mt_dense <- test_mt + 1
    # proxy::dist() also incorrectly produces symmetric matrix
    test_dist(test_mt_dense, "kullback", "documents", ignore_upper = TRUE)
    test_dist(test_mt_dense, "kullback", "features", ignore_upper = TRUE)
})

test_that("test textstat_proxy manhattan distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "manhattan", "documents")
    test_dist(test_mt, "manhattan", "features")
})

test_that("test textstat_proxy maximum distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "maximum", "documents")
    test_dist(test_mt, "maximum", "features")
})

test_that("test textstat_proxy canberra distance", {
    skip_if_not_installed("proxy")
    # proxyC and proxy disagree when sparsity is high
    test_dist(as.dfm(test_mt + 1), "canberra", "documents")
    test_dist(as.dfm(test_mt + 1), "canberra", "features")
})

test_that("test textstat_proxy minkowski distance", {
    skip_if_not_installed("proxy")
    test_dist(test_mt, "minkowski", "documents", p = 0.1)
    test_dist(test_mt, "minkowski", "features", p = 0.1)
    test_dist(test_mt, "minkowski", "documents", p = 2)
    test_dist(test_mt, "minkowski", "features", p = 2)
    test_dist(test_mt, "minkowski", "documents", p = 10)
    test_dist(test_mt, "minkowski", "features", p = 10)
})

test_that("as.matrix works as expected", {
    txt <- c("Bacon ipsum dolor amet tenderloin hamburger bacon t-bone,",
             "Tenderloin turducken corned beef bacon.",
             "Burgdoggen venison tail, hamburger filet mignon capicola meatloaf pig pork belly.")
    mt <- dfm(tokens(txt))
    expect_equivalent(diag(as.matrix(textstat_proxy(mt))),
                      rep(1, 3))
})

test_that("textstat_proxy stops as expected for methods not supported", {
    expect_error(textstat_proxy(test_mt, method = "Yule"))
})

test_that("textstat_proxy works on zero-frequency features", {
    d1 <- dfm(tokens(c("a b c", "a b c d")))
    d2 <- dfm(tokens(letters[1:6]))
    dtest <- dfm_match(d1, featnames(d2))

    expect_equal(
        textstat_proxy(dtest, method = "cosine")[2, 1], 0.866,
        tolerance = 0.001
    )
    expect_equal(
        textstat_proxy(dtest, method = "correlation")[2, 1], 0.707,
        tolerance = 0.001
    )
})

test_that("textstat_proxy works on zero-feature documents (#952)", {
    corp <- corpus(c("a b c c", "b c d", "a"),
                             docvars = data.frame(grp = factor(c("A", "A", "B"), levels = LETTERS[1:3])))
    mt <- dfm(tokens(corp))
    mt <- dfm_group(mt, groups = corp$grp, fill = TRUE)

    expect_equal(
        as.numeric(textstat_proxy(mt, method = "cosine")[1, ]),
        c(1, 0.2581, 0),
        tolerance = 0.001
    )
    expect_equal(
        as.numeric(textstat_proxy(mt, method = "correlation")[1, ]),
        c(1, -0.5222, 0),
        tolerance = 0.001
    )
})

test_that("textstat_proxy works with non-intersecting documents or features", {

    toks <- tokens(c(doc1 = "a b c d e", doc2 = "b c f e", doc3 = "c d f", doc4 = "f g h"), remove_punct = TRUE)
    mt <- dfm(toks)
    sim1 <- textstat_proxy(mt, margin = "features")
    expect_equal(as.matrix(textstat_proxy(mt[, c("a", "b")], mt[, c("c", "d", "e")], margin = "features")),
                 as.matrix(sim1[c("a", "b"), c("c", "d", "e"), drop = FALSE]))
    sim2 <- textstat_proxy(mt, margin = "documents")
    expect_equal(as.matrix(textstat_proxy(mt[c("doc1", "doc2"), ], mt[c("doc4"), ], margin = "documents")),
                 as.matrix(sim2[c("doc1", "doc2"), c("doc4"), drop = FALSE]))
})

test_that("raises error when dfm is empty (#1419)", {
    mt <- dfm_trim(data_dfm_lbgexample, 1000)
    expect_silent(textstat_proxy(mt))
    expect_silent(textstat_proxy(mt, mt))
})

test_that("raises error when p is smaller than 1", {
    expect_error(textstat_proxy(test_mt, method = "minkowski", p = 0))
    expect_error(textstat_proxy(test_mt, method = "minkowski", p = -1))
})

test_that("sparse objects are of expected class and occur when expected", {
    expect_is(textstat_proxy(test_mt),
              "dsTMatrix")
    expect_is(textstat_proxy(test_mt, min_proxy = 10),
              "dsTMatrix")
    expect_is(textstat_proxy(test_mt, rank = 2),
              "dgTMatrix")
    expect_is(textstat_proxy(test_mt, method = "kullback"),
              "dgTMatrix")
})

test_that("rank argument is working", {
    expect_error(textstat_proxy(test_mt, rank = 0),
                 "rank must be great than or equal to 1")

    expect_equal(as.matrix(textstat_proxy(test_mt)),
                 as.matrix(textstat_proxy(test_mt, rank = 100)))

    expect_equal(as.matrix(textstat_proxy(test_mt, rank = 3)),
                 apply(as.matrix(textstat_proxy(test_mt)), 2,
                       function(x) ifelse(x >= sort(x, decreasing = TRUE)[3], x, 0)))
})

test_that("record zeros even in the sparse matrix", {
    toks <- tokens(c(doc1 = "a b c", doc2 = "d e f"), remove_punct = TRUE)
    mt <- dfm(toks)
    expect_true(any(textstat_proxy(mt)@x == 0))
    expect_true(any(textstat_proxy(mt, method = "cosine")@x == 0))
    expect_true(any(textstat_proxy(mt, method = "cosine", min_proxy = -0.5)@x == 0))
    expect_true(any(textstat_proxy(mt, method = "cosine", rank = 2)@x == 0))
    expect_true(any(textstat_proxy(mt, method = "dice")@x == 0))
})

test_that("textstat_proxy raises error when documents are different for feature similarity", {
    expect_silent(
        textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], margin = "features")
    )
    expect_error(textstat_proxy(test_mt[1:5, ], test_mt[6:10, ], margin = "features"),
                 "x and y must contain the same documents")
})

test_that("textstat_proxy raises error when y is not a dfm", {
    expect_error(textstat_proxy(test_mt[1:5, ], 6:10, margin = "features"),
                 "y must be a dfm")
})

test_that("use_na is working", {
    mt <- as.dfm(matrix(c(rep(0, 4),
                          rep(1, 4),
                          c(1, 3, 2, 0)), ncol = 3))

    cos1 <- textstat_proxy(mt, margin = "features", method = "cosine", use_na = TRUE)
    cor1 <- textstat_proxy(mt, margin = "features", method = "correlation", use_na = TRUE)
    euc1 <- textstat_proxy(mt, margin = "features", method = "euclidean", use_na = TRUE)
    expect_equal(sum(is.na(cos1)), 5)
    expect_equal(sum(is.na(cor1)), 8)
    expect_equal(sum(is.na(euc1)), 0)

    cos2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "cosine", use_na = TRUE)
    cor2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "correlation", use_na = TRUE)
    euc2 <- textstat_proxy(mt, mt[, 3], margin = "features", method = "euclidean", use_na = TRUE)
    expect_equal(sum(is.na(cos2)), 1)
    expect_equal(sum(is.na(cor2)), 2)
    expect_equal(sum(is.na(euc2)), 0)
})

test_that("no value is greater than 1.0 (#1543)", {
    cos1 <- textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], method = "cosine")
    expect_equal(sum(cos1 > 1), 0)
    cor1 <- textstat_proxy(test_mt[1:5, ], test_mt[1:5, ], method = "correlation")
    expect_true(all(cor1 <= 1.000000001))
})
Any scripts or data that you put into this service are public.
quanteda.textstats documentation built on Sept. 11, 2024, 6:39 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
quanteda.textstats
Textual Statistics for the Quantitative Analysis of Textual Data

tests/testthat/test-textstat_proxy.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Try the quanteda.textstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textstats Textual Statistics for the Quantitative Analysis of Textual Data

tests/testthat/test-textstat_proxy.R In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data

Try the quanteda.textstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

quanteda.textstats
Textual Statistics for the Quantitative Analysis of Textual Data

tests/testthat/test-textstat_proxy.R
In quanteda.textstats: Textual Statistics for the Quantitative Analysis of Textual Data