tests/testthat/test-textstat_frequency.R

# Ungrouped counts: the expected table lists features by decreasing total
# frequency, and `n = 2` must return the first two rows of that same table.
test_that("test textstat_frequency without groups", {
    toks <- quanteda::tokens(c("a a b b c d", "a d d d", "a a a"))
    dfm1 <- quanteda::dfm(toks)

    # All four frequencies are distinct, so the expected ranks do not depend
    # on how ties would be broken.
    expected <- data.frame(
        feature = c("a", "d", "b", "c"),
        frequency = c(6, 4, 2, 1),
        rank = 1:4,
        docfreq = c(3, 2, 1, 1),
        group = rep("all", 4),
        stringsAsFactors = FALSE
    )

    expect_equivalent(
        textstat_frequency(dfm1, ties_method = "random"),
        expected
    )
    expect_equivalent(
        textstat_frequency(dfm1, n = 2, ties_method = "random"),
        expected[1:2, ]
    )
})

# Grouped counts. `groups` is supplied three ways -- as a local vector, as a
# docvar extracted with `$`, and as the bare name `grp2` (which exists only as
# a docvar of `corp1`, not as a local variable) -- and all must agree.
test_that("test textstat_frequency with groups", {
    txt <- c("a a b b c d", "a d d d", "a a a")
    grp1 <- c("one", "two", "one")
    corp1 <- quanteda::corpus(txt, docvars = data.frame(grp2 = grp1))
    dfmat <- quanteda::dfm(quanteda::tokens(corp1))

    # The reference result uses the local grouping vector.
    by_vector <- textstat_frequency(dfmat, groups = grp1, ties_method = "max")
    expect_identical(
        by_vector,
        textstat_frequency(dfmat, groups = corp1$grp2, ties_method = "max")
    )
    expect_identical(
        by_vector,
        textstat_frequency(dfmat, groups = grp2, ties_method = "max")
    )

    # "c" and "d" tie at frequency 1 within group "one", so the seed pins the
    # outcome of the random tie-break for the expected ranks below.
    set.seed(10)
    expect_equivalent(
        textstat_frequency(dfmat, groups = grp2, ties_method = "random"),
        data.frame(feature = c("a", "b", "c", "d", "d", "a"),
                   frequency = c(5, 2, 1, 1, 3, 1),
                   rank = c(1:4, 1:2),
                   docfreq = c(2, 1, 1, 1, 1, 1),
                   group = c("one", "one", "one", "one", "two", "two"),
                   stringsAsFactors = FALSE)
    )

    # `n = 2` is expected to keep the top two features of each group.
    expect_equivalent(
        textstat_frequency(dfmat, groups = grp2, n = 2, ties_method = "random"),
        data.frame(feature = c("a", "b", "d", "a"),
                   frequency = c(5, 2, 3, 1),
                   rank = c(1:2, 1:2),
                   docfreq = c(2, 1, 1, 1),
                   group = c("one", "one", "two", "two"),
                   stringsAsFactors = FALSE)
    )
})

# Frequencies of a proportion-weighted dfm are compared against hand-computed
# values within a numeric tolerance.
test_that("test textstat_frequency works with weights", {
    docs <- c("a a b b c d", "a d d d", "a a a")
    labels <- c("one", "two", "one")
    corp <- quanteda::corpus(docs, docvars = data.frame(grp2 = labels))

    counts <- quanteda::dfm(quanteda::tokens(corp))
    weighted <- quanteda::dfm_weight(counts, scheme = "prop")

    # Expected frequencies are rounded, hence the tolerance in the comparison.
    expected <- data.frame(
        feature = c("a", "d", "b", "c"),
        frequency = c(1.58, .916, .333, .1666),
        rank = 1:4,
        docfreq = c(3, 2, 1, 1),
        group = rep("all", 4),
        stringsAsFactors = FALSE
    )

    set.seed(10)
    expect_equivalent(
        textstat_frequency(weighted, ties_method = "random"),
        expected,
        tolerance = .01
    )
})

# Regression test for #1419: calling textstat_frequency on a dfm that has been
# trimmed with a very high minimum term frequency must raise the "dfm_empty"
# error message.
test_that("raises error when dfm is empty (#1419)", {
    # presumably no feature in the example dfm reaches a count of 1000, so the
    # trimmed dfm has no features left -- verify against the dataset
    trimmed <- quanteda::dfm_trim(quanteda::data_dfm_lbgexample,
                                  min_termfreq = 1000)
    expected_msg <- quanteda:::message_error("dfm_empty")

    expect_error(textstat_frequency(trimmed), expected_msg)
})

# With no `ties_method` given, the expected ranks are those of the "min"
# method: "a" and "b" tie on total count and share rank 1, and the next
# feature gets rank 3.
test_that("test textstat_frequency ties methods defaults work (min)", {
    txt <- c("a a b b c d", "b b b d d d", "a a a")
    dfmat <- quanteda::dfm(quanteda::tokens(txt))
    expect_equivalent(
        textstat_frequency(dfmat)[, c("feature", "rank")],
        # The second column holds ranks, so it is labelled `rank` to match the
        # columns selected above (expect_equivalent itself ignores names).
        data.frame(feature = c("a", "b", "d", "c"),
                   rank = c(1, 1, 3, 4),
                   stringsAsFactors = FALSE)
    )
})

# Regression test for #1646: grouping a tf-idf weighted dfm must fail unless
# `force = TRUE` is passed, in which case the grouped table is returned.
test_that("test textstat_frequency with groups and weighted dfm (#1646)", {
    toks <- quanteda::tokens(c("a a b b c d", "a d d d", "a a a"))
    dfmat <- quanteda::dfm_tfidf(quanteda::dfm(toks))

    # Without force, the weighted dfm is rejected with a fixed message.
    expect_error(
        textstat_frequency(dfmat, groups = c(1, 2, 2)),
        "will not group a weighted dfm; use force = TRUE to override",
        fixed = TRUE
    )

    # With force, compare against rounded tf-idf sums per group.
    expected <- data.frame(
        feature = c("b", "c", "d", "a", "d", "a"),
        frequency = c(.95, .48, .18, 0, .53, 0),
        rank = c(1, 2, 3, 4, 1, 2),
        docfreq = c(1, 1, 1, 0, 1, 0),
        group = rep(c("1", "2"), times = c(4, 2)),
        stringsAsFactors = FALSE
    )
    expect_equivalent(
        textstat_frequency(dfmat, groups = c(1, 2, 2), force = TRUE),
        expected,
        tolerance = .01
    )
})

# Asking for more features than exist (n > nfeat) must return only the real
# features: the row count stays at the number of features and no cell is NA.
test_that("textstat_frequency does not return NAs when n > nfeat", {
  dfmat <- quanteda::dfm(quanteda::tokens(c("a a b c d", "a d d e", "a b b")))

  # ungrouped: 5 distinct features (a, b, c, d, e), so n = 6 yields 5 rows
  freq_all <- textstat_frequency(dfmat, n = 6)
  expect_identical(nrow(freq_all), 5L)
  expect_false(anyNA(freq_all))

  # grouped: 4 features occur in each of the two groups, so 8 rows in total
  freq_grouped <- textstat_frequency(dfmat, n = 6, groups = c(1, 2, 2))
  expect_identical(nrow(freq_grouped), 8L)
  expect_false(anyNA(freq_grouped))
})

Try the quanteda.textstats package in your browser

Any scripts or data that you put into this service are public.

quanteda.textstats documentation built on Nov. 2, 2023, 5:07 p.m.