tests/testthat/test-textstat_summary.R

test_that("textstat_summary method works", {
    skip_on_os("solaris")
    corp <- quanteda::data_corpus_inaugural[10:15]
    toks <- quanteda::tokens_tolower(quanteda::tokens(corp))
    dfmt <- quanteda::dfm(toks)

    col_summ <- c("document", "chars", "sents", "tokens", "types",
                  "puncts", "numbers", "symbols", "urls",
                  "tags", "emojis")

    # corpus
    summ_corp <- textstat_summary(corp)
    expect_equal(
        summ_corp$sents,
        unname(quanteda::ntoken(quanteda::tokens(corp, what = "sentence")))
    )
    expect_equal(
        names(summ_corp),
        col_summ
    )

    # tokens
    summ_toks <- textstat_summary(toks)
    expect_equal(
        summ_toks$puncts,
        unname(quanteda::ntoken(quanteda::tokens_select(toks, quanteda:::removals_regex(punct = TRUE)[[1]],
                                    valuetype = "regex")))
    )
    expect_equal(
        summ_toks$numbers,
        unname(quanteda::ntoken(quanteda::tokens_select(toks, quanteda:::removals_regex(numbers = TRUE)[[1]],
                                    valuetype = "regex")))
    )
    expect_equal(
        summ_toks$tokens,
        unname(quanteda::ntoken(toks))
    )
    expect_equal(
        summ_toks$types,
        unname(quanteda::ntype(toks))
    )
    expect_equal(
        summ_toks$sents,
        rep(NA, quanteda::ndoc(toks))
    )
    expect_equal(
        names(summ_toks),
        col_summ
    )

    # dfm
    summ_dfm <- textstat_summary(dfmt)
    expect_equal(
        summ_dfm$puncts,
        unname(quanteda::ntoken(quanteda::dfm_select(dfmt, quanteda:::removals_regex(punct = TRUE)[[1]],
                                 valuetype = "regex")))
    )
    expect_equal(
        summ_dfm$numbers,
        unname(quanteda::ntoken(quanteda::dfm_select(dfmt, quanteda:::removals_regex(numbers = TRUE)[[1]],
                                 valuetype = "regex")))
    )
    expect_equal(
        summ_dfm$tokens,
        unname(quanteda::ntoken(dfmt))
    )
    expect_equal(
        summ_dfm$types,
        unname(quanteda::ntype(dfmt))
    )
    expect_equal(
        summ_dfm$sents,
        rep(NA, quanteda::ndoc(dfmt))
    )
    expect_equal(
        names(summ_dfm),
        col_summ
    )
})

test_that("textstat_summary counts hashtag and emoji correctly", {
    skip_on_os("solaris")
    txt <- c("£ € 👏 Rock on❗🎸",
             "Hi @qi #quanteda https://quanteda.io")
    toks <- quanteda::tokens(txt)
    summ <- textstat_summary(toks)
    expect_identical(summ$tokens, c(7L, 4L))
    expect_identical(summ$tags, c(0L, 2L))
    expect_identical(summ$emojis, c(3L, 0L))
    expect_identical(summ$urls, c(0L, 1L))
})

test_that("textstat_summary counts hashtag and emoji correctly", {
    skip("Until we fix #64")
    txt <- c("£ € 👏 Rock on❗ 💪️🎸",
             "Hi @qi #quanteda https://quanteda.io")
    toks <- quanteda::tokens(txt)
    summ <- textstat_summary(toks)
    expect_identical(summ$emojis, c(4L, 0L))
})

Try the quanteda.textstats package in your browser

Any scripts or data that you put into this service are public.

quanteda.textstats documentation built on Nov. 2, 2023, 5:07 p.m.