library("quanteda")
test_that("textstat_lexdiv computation is correct", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "TTR"),
data.frame(document = c("d1", "d2"), TTR = c(0.25, 0.5),
stringsAsFactors = FALSE)
)
})
test_that("textstat_lexdiv CTTR works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "CTTR")$CTTR,
c(2 / sqrt(2 * 8), 2 / sqrt(2 * 4)),
tolerance = 0.01
)
})
test_that("textstat_lexdiv R works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "R")$R,
c(2 / sqrt(8), 2 / sqrt(4)),
tolerance = 0.01
)
})
test_that("textstat_lexdiv C works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "C")$C,
c(log10(2) / log10(8), log10(2) / log10(4)),
tolerance = 0.01
)
})
test_that("textstat_lexdiv Maas works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "Maas")$Maas[1],
sqrt((log10(8) - log10(2)) / log10(8) ^ 2),
tolerance = 0.01
)
})
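# Added cross-check (a sketch, not part of the original suite): the measures
# tested above all have closed forms in V = ntype(x) and N = ntoken(x),
# so they can be verified directly against quanteda's ntype()/ntoken().
# The formulas below restate the expectations encoded in the tests above.
test_that("closed-form measures match their definitions in V and N", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    V <- as.numeric(ntype(mydfm))
    N <- as.numeric(ntoken(mydfm))
    result <- textstat_lexdiv(mydfm, c("TTR", "CTTR", "R", "C", "Maas"))
    expect_equivalent(result$TTR, V / N)
    expect_equivalent(result$CTTR, V / sqrt(2 * N), tolerance = 0.001)
    expect_equivalent(result$R, V / sqrt(N), tolerance = 0.001)
    expect_equivalent(result$C, log10(V) / log10(N), tolerance = 0.001)
    expect_equivalent(result$Maas,
                      sqrt((log10(N) - log10(V)) / log10(N) ^ 2),
                      tolerance = 0.001)
})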
test_that("textstat_lexdiv Yule's I works correctly", {
mydfm <- dfm(tokens(c(d1 = "a b c",
d2 = "a a b b c")))
expect_equivalent(
textstat_lexdiv(mydfm, "I")$I[1], 0, tolerance = 0.01
)
expect_equivalent(
textstat_lexdiv(mydfm, "I")$I[2], (3^2) / ((1 + 2 * 2^2) - 3), tolerance = 0.01
)
})
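# Added worked arithmetic for the d2 expectation above (hedged: the form
# I = V^2 / (M2 - V), with M2 = sum(i^2 * V_i) over the frequency spectrum,
# is inferred from the expected value rather than from package documentation).
# For d2 = "a a b b c": V = 3; the spectrum is V_1 = 1 ("c") and V_2 = 2
# ("a", "b"), so M2 = 1^2 * 1 + 2^2 * 2 = 9 and I = 3^2 / (9 - 3) = 1.5.
# For d1 every token is a hapax, so M2 = V; the expectation of 0 shows the
# degenerate all-hapax case is special-cased rather than returned as Inf.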
test_that("textstat_lexdiv works with a single document dfm (#706)", {
mytxt <- "one one two one one two one"
mydfm <- dfm(tokens(mytxt))
expect_equivalent(
textstat_lexdiv(mydfm, c("TTR", "C")),
data.frame(document = "text1", TTR = 0.286, C = 0.356,
stringsAsFactors = FALSE),
tolerance = 0.01
)
})
test_that("raises error when dfm is empty (#1419)", {
mx <- dfm_trim(data_dfm_lbgexample, 1000)
expect_error(textstat_lexdiv(mx, c("TTR", "C")),
quanteda.textstats:::message_error("dfm_empty"))
})
test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
# read in Latin version of Ch 1 of the Gospel according to St. John
# example from Table 1 of Miranda-Garcia, A, and J Calle-Martin. 2005.
# “Yule's Characteristic K Revisited.” Language Resources and Evaluation
# 39(4): 287–94.
# text source: http://www.latinvulgate.com/verse.aspx?t=1&b=4&c=1
df <- read.csv("../data/stjohn_latin.csv", stringsAsFactors = FALSE)
data_corpus_stjohn <- df %>%
corpus(text_field = "latin") %>%
corpus_group(groups = df$chapter)
docnames(data_corpus_stjohn) <- paste0("chap", 1:4)
data_dfm_stjohn <- data_corpus_stjohn %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
dfm()
# work with chapter 1
data_dfm_stjohnch1 <- dfm_subset(data_dfm_stjohn, chapter == 1)
freqs <- data_dfm_stjohnch1 %>%
featfreq() %>%
head(n = 331) %>%
sort(decreasing = FALSE)
freqnames <- names(freqs)
# replace the observed counts with the frequency spectrum from Table 1
freqs <- c(rep(1, 212),
rep(2, 51),
rep(3, 26),
rep(4, 13),
rep(5, 6),
rep(6, 6),
rep(7, 3),
rep(8, 4),
rep(10, 1),
rep(11, 1),
rep(13, 3),
rep(16, 1),
rep(17, 1),
rep(19, 1),
rep(21, 1),
rep(59, 1))
names(freqs) <- freqnames
dfmat <- as.dfm(matrix(freqs, nrow = 1, dimnames = list(docnames(data_dfm_stjohnch1),
freqnames)))
expect_identical(
as.integer(ntoken(dfmat)), # 770
755L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
)
expect_identical(
as.integer(ntype(dfmat)), # 329
331L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
)
expect_equivalent(
textstat_lexdiv(dfmat, "K"), # 112.767
# from Miranda-Garcia and Calle-Martin (2005, Table 3)
data.frame(document = "chap1", K = 113.091583, stringsAsFactors = FALSE),
tolerance = 0.5
)
# tests on multiple documents - this is Ch 1 and Chs 1-4 as per the first two rows of
# Table 3 of Miranda-Garcia and Calle-Martin (2005)
data_dfm_stjohncomb <- rbind(data_dfm_stjohnch1,
dfm_group(data_dfm_stjohn, rep(1, 4)))
docnames(data_dfm_stjohncomb)[2] <- "chaps1-4"
expect_equivalent(
textstat_lexdiv(data_dfm_stjohncomb, "K"),
data.frame(document = c("chap1", "chaps1-4"), K = c(113.091583, 109.957455),
stringsAsFactors = FALSE),
tolerance = 1
)
# try also Herdan's Vm and Simpson's D - these are VERY WEAK tests
expect_true(
    all(textstat_lexdiv(data_dfm_stjohncomb, "D")[, "D", drop = TRUE] > 0)
)
expect_true(
    all(textstat_lexdiv(data_dfm_stjohncomb, "Vm")[, "Vm", drop = TRUE] > 0)
)
# test the identity from Tweedie and Baayen (1998, Eq. 19):
# Vm^2 = K / 10^4 + 1/N - 1/V, with N = ntoken and V = ntype
# this needs checking - the tolerance value is a fudge
result <- textstat_lexdiv(data_dfm_stjohncomb, c("K", "Vm"))
K <- result[["K"]]
Vm <- result[["Vm"]]
expect_equal(
    Vm ^ 2,
    as.numeric(K / 10 ^ 4 + (1 / ntoken(data_dfm_stjohncomb) -
                             1 / ntype(data_dfm_stjohncomb))),
    tolerance = 0.0013
)
})
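# Added cross-check (a sketch, not part of the original suite): Yule's K has
# the closed form K = 10^4 * (S2 - N) / N^2, where S2 = sum(f^2) over the
# feature frequencies f and N is the token total. Verifying this on a small
# hand-built spectrum guards both the constant and the normalisation.
test_that("Yule's K matches its closed form on a small frequency spectrum", {
    f <- c(w1 = 5, w2 = 3, w3 = 2, w4 = 1, w5 = 1)
    dfmat <- as.dfm(matrix(f, nrow = 1, dimnames = list("d1", names(f))))
    N <- sum(f)      # 12 tokens
    S2 <- sum(f ^ 2) # 40
    expect_equivalent(
        textstat_lexdiv(dfmat, "K")$K,
        10 ^ 4 * (S2 - N) / N ^ 2,
        tolerance = 0.001
    )
})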
# Tests for multiple static measures of lexical diversity
static_measures <- c("TTR", "C", "R", "CTTR", "U", "S", "K", "D", "Vm", "Maas")
test_that("textstat_lexdiv works similarly for corpus and tokens", {
txt <- c(d1 = "b a b a b a b a",
d2 = "a a b b")
mydfm <- dfm(tokens(txt))
mytokens <- tokens(txt)
expect_identical(
textstat_lexdiv(mydfm, measure = static_measures),
textstat_lexdiv(mytokens, measure = static_measures)
)
})
test_that("textstat_lexdiv supports removal of punctuation, numbers and symbols", {
txt <- c(d1 = "a a b b c c",
d2 = "a a , b b . c c / & ^ *** ### 1 2 3 4")
mt <- dfm(tokens(txt))
toks <- tokens(txt)
expect_identical(
textstat_lexdiv(mt["d1", ], measure = static_measures)[, -1],
textstat_lexdiv(mt["d2", ], measure = static_measures)[, -1]
)
expect_identical(
textstat_lexdiv(toks["d1"], measure = static_measures)[, -1],
textstat_lexdiv(toks["d2"], measure = static_measures)[, -1]
)
})
test_that("textstat_lexdiv supports removal of hyphenation", {
y <- dfm(tokens(c(d1 = "apple-pear orange-fruit elephant-ferrari",
d2 = "alpha-beta charlie-delta echo-foxtrot")))
z <- dfm(tokens(c(d1 = "apple pear orange fruit elephant ferrari",
d2 = "alpha beta charlie delta echo foxtrot")))
expect_identical(
textstat_lexdiv(y, measure = static_measures, remove_hyphens = TRUE),
textstat_lexdiv(z, measure = static_measures, remove_hyphens = TRUE)
)
})
test_that("textstat_lexdiv can handle hyphenated words containing duplicated tokens ", {
dfm_nested <- dfm(tokens(corpus(c(d1 = "have we not-we-have bicycle ! % 123 ^ "))))
# "not-we-have" should be split into three tokens, with the hyphens removed;
# the remaining punctuation, symbols, and numbers are removed by the defaults,
# leaving dfm_nested with 4 types and 6 tokens, the same as dfm_non_nested
dfm_non_nested <- dfm(tokens(corpus(c(d1 = "a b b c c d"))))
expect_identical(textstat_lexdiv(dfm_nested, measure = static_measures, remove_hyphens = TRUE),
textstat_lexdiv(dfm_non_nested, measure = static_measures))
})
test_that("textstat_lexdiv.dfm and .tokens work same with remove_* options", {
txt <- c("There's shrimp-kabobs,
shrimp creole, shrimp gumbo. Pan fried, deep fried, stir-fried. There's
pineapple shrimp, lemon shrimp, coconut shrimp, pepper shrimp, shrimp soup,
shrimp stew, shrimp salad, shrimp and potatoes, shrimp burger, shrimp
sandwich.",
"A shrimp-kabob costs $0.50, shrimp costs $0.25.")
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR", remove_hyphens = TRUE),
textstat_lexdiv(dfm(tokens(txt), tolower = FALSE), measure = "TTR", remove_hyphens = TRUE)
)
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR",
remove_punct = TRUE, remove_hyphens = TRUE),
textstat_lexdiv(dfm(tokens(txt)), measure = "TTR",
remove_punct = TRUE, remove_hyphens = TRUE)
)
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR", remove_punct = TRUE),
textstat_lexdiv(dfm(tokens(txt)), measure = "TTR", remove_punct = TRUE)
)
expect_identical(
textstat_lexdiv(tokens(txt[2]), measure = "TTR", remove_symbols = TRUE),
textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = TRUE)
)
expect_true(
textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = TRUE)[1, "TTR"] !=
textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = FALSE)[1, "TTR"]
)
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR", remove_numbers = TRUE),
textstat_lexdiv(dfm(tokens(txt)), measure = "TTR", remove_numbers = TRUE)
)
})
test_that("textstat_lexdiv does not support dfm for MATTR and MSTTR", {
mytxt <- "one one two one one two one"
mydfm <- dfm(tokens(mytxt))
expect_error(
textstat_lexdiv(mydfm, measure = "MATTR"),
"average-based measures are only available for tokens inputs"
)
expect_error(
textstat_lexdiv(mydfm, measure = "MSTTR"),
"average-based measures are only available for tokens inputs"
)
})
test_that("textstat_lexdiv.tokens raises errors if parameters for moving measures are not specified", {
# skip("defaults may have changed")
mytxt <- "one one two one one two one"
mytoken <- tokens(mytxt)
expect_warning(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 100),
"MATTR_window exceeds some documents' token lengths, resetting to 7"
)
# expect_error(
# textstat_lexdiv(mytoken, measure = "MSTTR"),
# "MSTTR_segment_size must be specified if MSTTR is to be computed"
# )
})
test_that("textstat_lexdiv.tokens MATTR works correctly on its own", {
mytxt <- "one one two one one two one"
mytoken <- tokens(mytxt)
wsize2_MATTR <- (1/2 + 1 + 1 + 1/2 + 1 + 1) / 6
wsize3_MATTR <- (2/3 + 2/3 + 2/3 + 2/3 + 2/3) / 5
wsize4_MATTR <- (2/4 + 2/4 + 2/4 + 2/4) / 4
expect_identical(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 2)[["MATTR"]],
wsize2_MATTR
)
expect_identical(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 3)[["MATTR"]],
wsize3_MATTR
)
expect_identical(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 4)[["MATTR"]],
wsize4_MATTR
)
expect_warning(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 100),
"MATTR_window exceeds some documents' token lengths, resetting to 7"
)
})
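# Added sketch (not part of the original suite): a naive MATTR reference that
# averages the TTR of every overlapping window of the given size, which is
# the rule the hand computations above encode. naive_mattr is a hypothetical
# helper defined here only for this cross-check.
naive_mattr <- function(x, window) {
    starts <- seq_len(length(x) - window + 1)
    ttrs <- vapply(starts, function(i) {
        w <- x[i:(i + window - 1)]
        length(unique(w)) / window
    }, numeric(1))
    mean(ttrs)
}
test_that("MATTR matches a naive overlapping-window reference", {
    mytoken <- tokens("one one two one one two one")
    for (w in 2:4) {
        expect_equal(
            textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = w)[["MATTR"]],
            naive_mattr(mytoken[["text1"]], w)
        )
    }
})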
test_that("textstat_lexdiv.tokens MATTR works correctly in conjunction with static measures", {
mytxt <- "one one two one one two one"
mytoken <- tokens(mytxt)
wsize2_MATTR <- (1/2 + 1 + 1 + 1/2 + 1 + 1) / 6
expect_equivalent(
textstat_lexdiv(mytoken, measure = c("TTR", "MATTR"), MATTR_window = 2),
data.frame(textstat_lexdiv(mytoken, measure = "TTR"), MATTR = wsize2_MATTR)
)
})
test_that("textstat_lexdiv.tokens MSTTR works correctly on its own", {
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4
wsize3_MSTTR <- (2/3 + 2/3) / 2 # the trailing "apple orange" remainder is discarded
wsize4_MSTTR <- (2/4 + 3/4) / 2
# Test segment size = 2
expect_equivalent(
textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 2)[["MSTTR"]],
wsize2_MSTTR
)
# Test segment size = 3
expect_equivalent(
    textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 3)[["MSTTR"]],
    wsize3_MSTTR
)
# Test segment size = 4
expect_equivalent(
    textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 4)[["MSTTR"]],
    wsize4_MSTTR
)
# Test segment size = ntoken: MSTTR reduces to plain TTR
expect_equivalent(
    textstat_lexdiv(mytoken, measure = "MSTTR",
                    MSTTR_segment = length(mytoken[["text1"]]))[["MSTTR"]],
    textstat_lexdiv(mytoken, measure = "TTR")[["TTR"]]
)
})
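# Added sketch (not part of the original suite): a naive MSTTR reference that
# splits the tokens into consecutive non-overlapping segments, drops any short
# trailing remainder, and averages the per-segment TTRs, matching the hand
# computations above. naive_msttr is a hypothetical helper defined here only
# for this cross-check.
naive_msttr <- function(x, segment) {
    nseg <- length(x) %/% segment
    ttrs <- vapply(seq_len(nseg), function(i) {
        seg <- x[((i - 1) * segment + 1):(i * segment)]
        length(unique(seg)) / segment
    }, numeric(1))
    mean(ttrs)
}
test_that("MSTTR matches a naive segmenting reference", {
    mytoken <- tokens("apple orange apple orange pear pear apple orange")
    for (s in 2:4) {
        expect_equal(
            textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = s)[["MSTTR"]],
            naive_msttr(mytoken[["text1"]], s)
        )
    }
})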
test_that("textstat_lexdiv.tokens MSTTR works correctly in conjunction with static measures", {
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4
expect_equivalent(
textstat_lexdiv(mytoken, measure = c("TTR", "MSTTR"), MSTTR_segment = 2),
data.frame(textstat_lexdiv(mytoken, measure = "TTR"), MSTTR = wsize2_MSTTR)
)
})
test_that("compute_MSTTR internal function has working exception handlers", {
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
expect_warning(
quanteda.textstats:::compute_msttr(mytoken, 20),
"MSTTR_segment exceeds some documents' token lengths, resetting to 8"
)
# expect_identical(
# list(compute_msttr(mytoken,segment_size=2, mean_sttr = FALSE, all_segments=TRUE)),
# list(c(MSTTR_tokens1_2 = 2/2, MSTTR_tokens3_4 = 2/2, MSTTR_tokens5_6 = 1/2, MSTTR_tokens7_8 = 2/2))
# )
#
# expect_identical(
# list(compute_msttr(mytoken,segment_size=3 , mean_sttr = FALSE, all_segments=TRUE, discard_remainder = FALSE)),
# list(c(MSTTR_tokens1_3 = 2/3, MSTTR_tokens4_6 = 2/3, MSTTR_tokens7_8 = 1))
# )
# Test misspecification of Segment Size
expect_error(quanteda.textstats:::compute_msttr(mytoken, 0),
"MSTTR_segment must be positive")
# # Case when neither the mean segmental TTR nor the per-segment TTRs are requested
# expect_error(compute_msttr(mytoken,segment_size=2,mean_sttr = FALSE ,all_segments=FALSE),
# quanteda.textstats:::message_error("at least one MSTTR value type to be returned"))
})
test_that("textstat_lexdiv.tokens works right when all measures are requested", {
skip("until all MA measures are made functional")
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
wsize2_MATTR <- (2/2 + 2/2 + 2/2 + 2/2 + 1/2 + 2/2 + 2/2) / 7
wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4 # four full segments, no remainder
static_measures <- c("TTR", "C", "R", "CTTR", "U", "S", "K", "D", "Vm", "Maas")
moving_measures_df <- data.frame(MATTR = wsize2_MATTR, MSTTR = wsize2_MSTTR)
expect_identical(
    textstat_lexdiv(mytoken, measure = "all",
                    MATTR_window = 2, MSTTR_segment = 2),
    cbind(textstat_lexdiv(mytoken, measure = static_measures),
          moving_measures_df)
)
})
test_that("textstat_lexdiv works with measure = 'all'", {
res <- textstat_lexdiv(dfm(tokens("What, oh what, are we doing?")),
measure = "all")
expect_true(
setequal(names(res),
c("document", "TTR", "C", "R", "CTTR", "U", "S", "K", "I", "D", "Vm", "Maas", "lgV0", "lgeV0"))
)
})
test_that("dfm_split_hyphenated_features works as expected", {
dfmat <- dfm(tokens("One-two one two three."))
expect_identical(
featnames(quanteda.textstats:::dfm_split_hyphenated_features(dfmat)),
c("one", "two", "three", ".", "-")
)
})
test_that("Exact tests for Yule's K", {
txt <- c("a b c d d e e f f f",
"a b c d d e e f f f g g g g")
toks <- tokens(txt)
# expected values from koRpus, as reported in issue #46
expect_equal(
round(textstat_lexdiv(toks, "K")$K, 3),
c(1000, 1122.449)
)
})
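# Added worked arithmetic for the expectations above, using
# K = 10^4 * (S2 - N) / N^2:
# doc 1: N = 10, S2 = 3 * 1^2 + 2 * 2^2 + 1 * 3^2 = 20,
#        so K = 10^4 * (20 - 10) / 10^2 = 1000
# doc 2: N = 14, S2 = 20 + 4^2 = 36,
#        so K = 10^4 * (36 - 14) / 14^2 = 1122.449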