# tests/testthat/test-textstat_lexdiv.R

library("quanteda")

test_that("textstat_lexdiv computation is correct", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "TTR"),
        data.frame(document = c("d1", "d2"), TTR = c(0.25, 0.5),
                   stringsAsFactors = FALSE)
        )
})
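
# Added cross-check (a sketch, not in the original suite): TTR is just types
# over tokens, so it should match ntype() / ntoken() computed directly on the dfm.
test_that("textstat_lexdiv TTR equals ntype / ntoken", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "TTR")$TTR,
        as.numeric(ntype(mydfm) / ntoken(mydfm))
    )
})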

test_that("textstat_lexdiv CTTR works correctly", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "CTTR")$CTTR,
        c(2 / sqrt(2 * 8), 2 / sqrt(2 * 4)),
        tolerance = 0.01
    )
})

test_that("textstat_lexdiv R works correctly", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "R")$R,
        c(2 / sqrt(8), 2 / sqrt(4)),
        tolerance = 0.01
    )
})

test_that("textstat_lexdiv C works correctly", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "C")$C,
        c(log10(2) / log10(8), log10(2) / log10(4)),
        tolerance = 0.01
    )
})

test_that("textstat_lexdiv Maas works correctly", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "Maas")$Maas[1],
        sqrt((log10(8) - log10(2)) / log10(8) ^ 2),
        tolerance = 0.01
    )
})
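
# Companion check for d2 (a sketch; the original test only checks d1). Applying
# the same formula, Maas = sqrt((log10(N) - log10(V)) / log10(N)^2) with N = 4
# and V = 2.
test_that("textstat_lexdiv Maas works correctly for d2", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "Maas")$Maas[2],
        sqrt((log10(4) - log10(2)) / log10(4) ^ 2),
        tolerance = 0.01
    )
})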

test_that("textstat_lexdiv Yule's I works correctly", {
    mydfm <- dfm(tokens(c(d1 = "a b c",
                          d2 = "a a b b c")))
    expect_equivalent(
        textstat_lexdiv(mydfm, "I")$I[1], 0, tolerance = 0.01
    )
    expect_equivalent(
        textstat_lexdiv(mydfm, "I")$I[2], (3^2) / ((1 + 2 * 2^2) - 3), tolerance = 0.01
    )
})
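
# Worked derivation for the expectation above: Yule's I = V^2 / (M2 - V), where
# M2 = sum(i^2 * V_i) over frequency classes i. For d2 = "a a b b c", V = 3 and
# M2 = 1 * 1^2 + 2 * 2^2 = 9, so I = 9 / (9 - 3) = 1.5, as coded in the test.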

test_that("textstat_lexdiv works with a single document dfm (#706)", {
    mytxt <- "one one two one one two one"
    mydfm <- dfm(tokens(mytxt))
    expect_equivalent(
        textstat_lexdiv(mydfm, c("TTR", "C")),
        data.frame(document = "text1", TTR = 0.286, C = 0.356,
                   stringsAsFactors = FALSE),
        tolerance = 0.01
    )
})
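
# Where the expected values above come from: the text has N = 7 tokens and
# V = 2 types, so TTR = 2 / 7 ~ 0.286 and Herdan's C = log10(2) / log10(7) ~ 0.356.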

test_that("raises error when dfm is empty (#1419)", {
    mx <- dfm_trim(data_dfm_lbgexample, min_termfreq = 1000)  # removes all features, leaving an empty dfm
    expect_error(textstat_lexdiv(mx, c("TTR", "C")),
                 quanteda.textstats:::message_error("dfm_empty"))
})

test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
    # read in Latin version of Ch 1 of the Gospel according to St. John
    # example from Table 1 of Miranda-Garcia, A, and J Calle-Martin. 2005.
    # “Yule's Characteristic K Revisited.” Language Resources and Evaluation
    # 39(4): 287–94.
    # text source: http://www.latinvulgate.com/verse.aspx?t=1&b=4&c=1
    df <- read.csv("../data/stjohn_latin.csv", stringsAsFactors = FALSE)
    data_corpus_stjohn <- df %>%
        corpus(text_field = "latin") %>%
        corpus_group(groups = df$chapter) # %>%
        # as.character() %>%  # combine verses into a single document
        # corpus(docvars = data.frame(chapter = 1:4))
    docnames(data_corpus_stjohn) <- paste0("chap", 1:4)

    data_dfm_stjohn <- data_corpus_stjohn %>%
        tokens(remove_punct = TRUE) %>%
        tokens_tolower() %>%
        dfm()

    # work with chapter 1
    data_dfm_stjohnch1 <- dfm_subset(data_dfm_stjohn, chapter == 1)

    freqs <- data_dfm_stjohnch1 %>%
        featfreq() %>%
        head(n = 331) %>%
        sort(decreasing = FALSE)
    freqnames <- names(freqs)
    # from Table 1
    freqs <- c(rep(1, 212),
           rep(2, 51),
           rep(3, 26),
           rep(4, 13),
           rep(5, 6),
           rep(6, 6),
           rep(7, 3),
           rep(8, 4),
           rep(10, 1),
           rep(11, 1),
           rep(13, 3),
           rep(16, 1),
           rep(17, 1),
           rep(19, 1),
           rep(21, 1),
           rep(59, 1))
    names(freqs) <- freqnames
    dfmat <- as.dfm(matrix(freqs, nrow = 1, dimnames = list(docnames(data_dfm_stjohnch1),
                                                            freqnames)))
    expect_identical(
        as.integer(ntoken(dfmat)), # 770
        755L     # from Miranda-Garcia and Calle-Martin (2005, Table 1)
    )

    expect_identical(
        as.integer(ntype(dfmat)),  # 329
        331L     # from Miranda-Garcia and Calle-Martin (2005, Table 1)
    )

    expect_equivalent(
        textstat_lexdiv(dfmat, "K"),  # 112.767
        # from Miranda-Garcia and Calle-Martin (2005, Table 3)
        data.frame(document = "chap1", K = 113.091583, stringsAsFactors = FALSE),
        tolerance = 0.5
    )

    # tests on multiple documents - this is Ch 1 and Chs 1-4 as per the first two rows of
    # Table 3 of Miranda-Garcia and Calle-Martin (2005)
    data_dfm_stjohncomb <- rbind(data_dfm_stjohnch1,
                                 dfm_group(data_dfm_stjohn, rep(1, 4)))
    docnames(data_dfm_stjohncomb)[2] <- "chaps1-4"
    expect_equivalent(
        textstat_lexdiv(data_dfm_stjohncomb, "K"),
        data.frame(document = c("chap1", "chaps1-4"), K = c(113.091583, 109.957455),
                   stringsAsFactors = FALSE),
        tolerance = 1
    )

    # try also Herdan's Vm and Simpson's D - these are VERY WEAK tests
    expect_true(
        all(textstat_lexdiv(data_dfm_stjohncomb, "D")[1, "D", drop = TRUE] > 0)
    )
    expect_true(
        all(textstat_lexdiv(data_dfm_stjohncomb, "Vm")[1, "Vm", drop = TRUE] > 0)
    )

    # test equality as per Tweedie and Baayen (1998, Eq. 19)
    # this needs checking - the tol value is a fudge
    result <- textstat_lexdiv(data_dfm_stjohncomb, c("K", "Vm"))
    K <- result[["K"]]
    Vm <- result[["Vm"]]
    expect_equal(
        Vm ^ 2,
        as.numeric(K / 10 ^ 4 + (1 / ntoken(data_dfm_stjohncomb) - 1 /
                                     ntype(data_dfm_stjohncomb))),
        tolerance = .0013
    )
})
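
# Gloss on the Eq. 19 check above (our derivation; worth verifying against
# Tweedie and Baayen 1998): with M2 = sum(i^2 * V_i),
# K = 10^4 * (M2 - N) / N^2 and Herdan's Vm^2 = M2 / N^2 - 1 / V, hence
# Vm^2 = K / 10^4 + (1 / N - 1 / V), which is the identity tested above.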

# Tests for multiple static measures of lexical diversity
static_measures <- c("TTR", "C", "R", "CTTR", "U", "S", "K", "D", "Vm", "Maas")

test_that("textstat_lexdiv works similarly for corpus and tokens", {
    txt <- c(d1 = "b a b a b a b a",
             d2 = "a a b b")
    mydfm <- dfm(tokens(txt))
    mytokens <- tokens(txt)
    expect_identical(
        textstat_lexdiv(mydfm, measure = static_measures),
        textstat_lexdiv(mytokens, measure = static_measures)
    )
})

test_that("textstat_lexdiv supports removal of punctuation, numbers and symbols", {
    txt <- c(d1 = "a a  b b  c c",
             d2 = "a a , b b . c c / & ^ *** ### 1 2 3 4")
    mt <- dfm(tokens(txt))
    toks <- tokens(txt)
    expect_identical(
        textstat_lexdiv(mt["d1", ], measure = static_measures)[, -1],
        textstat_lexdiv(mt["d2", ], measure = static_measures)[, -1]
    )
    expect_identical(
        textstat_lexdiv(toks["d1"], measure = static_measures)[, -1],
        textstat_lexdiv(toks["d2"], measure = static_measures)[, -1]
    )
})

test_that("textstat_lexdiv supports removal of hyphenation", {
    y <- dfm(tokens(c(d1 = "apple-pear orange-fruit elephant-ferrari",
                      d2 = "alpha-beta charlie-delta echo-foxtrot")))
    z <- dfm(tokens(c(d1 = "apple pear orange fruit elephant ferrari",
                      d2 = "alpha beta charlie delta echo foxtrot")))
    expect_identical(
        textstat_lexdiv(y, measure = static_measures, remove_hyphens = TRUE),
        textstat_lexdiv(z, measure = static_measures, remove_hyphens = TRUE)
    )
})

test_that("textstat_lexdiv can handle hyphenated words containing duplicated tokens ", {
    dfm_nested <- dfm(tokens(corpus(c(d1 = "have we not-we-have bicycle ! % 123 ^ "))))
    # not-we-have should be separated into three tokens, with hyphens being removed
    # remaining punctuation, symbols and numbers should also be removed
    # dfm_nested should only have 4 types with 6 tokens
    dfm_non_nested <- dfm(tokens(corpus(c(d1 = "a b b c c d"))))
    expect_identical(textstat_lexdiv(dfm_nested, measure = static_measures, remove_hyphens = TRUE),
                     textstat_lexdiv(dfm_non_nested, measure = static_measures))
})

test_that("textstat_lexdiv.dfm and .tokens work same with remove_* options", {
    txt <- c("There's shrimp-kabobs,
              shrimp creole, shrimp gumbo. Pan fried, deep fried, stir-fried. There's
              pineapple shrimp, lemon shrimp, coconut shrimp, pepper shrimp, shrimp soup,
              shrimp stew, shrimp salad, shrimp and potatoes, shrimp burger, shrimp
              sandwich.",
             "A shrimp-kabob costs $0.50, shrimp costs $0.25.")
    expect_identical(
        textstat_lexdiv(tokens(txt), measure = "TTR", remove_hyphens = TRUE),
        textstat_lexdiv(dfm(tokens(txt), tolower = FALSE), measure = "TTR", remove_hyphens = TRUE)
    )
    expect_identical(
        textstat_lexdiv(tokens(txt), measure = "TTR",
                        remove_punct = TRUE, remove_hyphens = TRUE),
        textstat_lexdiv(dfm(tokens(txt)), measure = "TTR",
                        remove_punct = TRUE, remove_hyphens = TRUE)
    )
    expect_identical(
        textstat_lexdiv(tokens(txt), measure = "TTR", remove_punct = TRUE),
        textstat_lexdiv(dfm(tokens(txt)), measure = "TTR", remove_punct = TRUE)
    )
    expect_identical(
        textstat_lexdiv(tokens(txt[2]), measure = "TTR", remove_symbols = TRUE),
        textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = TRUE)
    )
    expect_true(
        textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = TRUE)[1, "TTR"] !=
        textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = FALSE)[1, "TTR"]
    )
    expect_identical(
        textstat_lexdiv(tokens(txt), measure = "TTR", remove_numbers = TRUE),
        textstat_lexdiv(dfm(tokens(txt)), measure = "TTR", remove_numbers = TRUE)
    )
})


test_that("textstat_lexdiv does not support dfm for MATTR and MSTTR", {
    mytxt <- "one one two one one two one"
    mydfm <- dfm(tokens(mytxt))
    expect_error(
        textstat_lexdiv(mydfm, measure = "MATTR"),
        "average-based measures are only available for tokens inputs"
    )
    expect_error(
        textstat_lexdiv(mydfm, measure = "MSTTR"),
        "average-based measures are only available for tokens inputs"
    )
})

test_that("textstat_lexdiv.tokens raises errors if parameters for moving measures are not specified", {
    # skip("defaults may have changed")
    mytxt <- "one one two one one two one"
    mytoken <- tokens(mytxt)

    expect_warning(
        textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 100),
        "MATTR_window exceeds some documents' token lengths, resetting to 7"
    )
    # expect_error(
    #     textstat_lexdiv(mytoken, measure = "MSTTR"),
    #     "MSTTR_segment_size must be specified if MSTTR is to be computed"
    # )
})

test_that("textstat_lexdiv.tokens MATTR works correctly on its own", {
    mytxt <- "one one two one one two one"
    mytoken <- tokens(mytxt)
    wsize2_MATTR <- (1/2 + 1 + 1 + 1/2 + 1 + 1) / 6
    wsize3_MATTR <- (2/3 + 2/3 + 2/3 + 2/3 + 2/3) / 5
    wsize4_MATTR <- (2/4 + 2/4 + 2/4 + 2/4) / 4

    expect_identical(
        textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 2)[["MATTR"]],
        wsize2_MATTR
    )
    expect_identical(
        textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 3)[["MATTR"]],
        wsize3_MATTR
    )
    expect_identical(
        textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 4)[["MATTR"]],
        wsize4_MATTR
    )

    expect_warning(
        textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 100),
        "MATTR_window exceeds some documents' token lengths, resetting to 7"
    )
})

test_that("textstat_lexdiv.tokens MATTR works correctly in conjunction with static measures", {
    mytxt <- "one one two one one two one"
    mytoken <- tokens(mytxt)
    wsize2_MATTR <- (1/2 + 1 + 1 + 1/2 + 1 + 1) / 6

    expect_equivalent(
        textstat_lexdiv(mytoken, measure = c("TTR", "MATTR"), MATTR_window = 2),
        data.frame(textstat_lexdiv(mytoken, measure = "TTR"), MATTR = wsize2_MATTR)
    )
})

test_that("textstat_lexdiv.tokens MSTTR works correctly on its own", {
    mytxt <- "apple orange apple orange pear pear apple orange"
    mytoken <- tokens(mytxt)
    wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4
    wsize3_MSTTR <- (2/3 + 2/3) / 2 # the trailing "apple orange" remainder is discarded
    wsize4_MSTTR <- (2/4 + 3/4) / 2

    # Test segment size = 2
    expect_equivalent(
        textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 2)[["MSTTR"]],
        wsize2_MSTTR
    )

    # Test segment size = 3
    expect_equivalent(
        textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 3)[["MSTTR"]],
        wsize3_MSTTR
    )

    # Test segment size = 4
    expect_equivalent(
        textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 4)[["MSTTR"]],
        wsize4_MSTTR
    )

    # Test segment size = ntoken
    expect_equivalent(
        textstat_lexdiv(mytoken, measure = "MSTTR",
                        MSTTR_segment = length(mytoken[["text1"]]))[["MSTTR"]],
        textstat_lexdiv(mytoken, measure = "TTR")[["TTR"]]
    )
})

test_that("textstat_lexdiv.tokens MSTTR works correctly in conjunction with static measures", {
    mytxt <- "apple orange apple orange pear pear apple orange"
    mytoken <- tokens(mytxt)
    wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4

    expect_equivalent(
        textstat_lexdiv(mytoken, measure = c("TTR", "MSTTR"), MSTTR_segment = 2),
        data.frame(textstat_lexdiv(mytoken, measure = "TTR"), MSTTR = wsize2_MSTTR)
    )
})


test_that("compute_MSTTR internal function has working exception handlers", {
    mytxt <- "apple orange apple orange pear pear apple orange"
    mytoken <- tokens(mytxt)

    expect_warning(
        quanteda.textstats:::compute_msttr(mytoken, 20),
        "MSTTR_segment exceeds some documents' token lengths, resetting to 8"
    )

    # expect_identical(
    #     list(compute_msttr(mytoken,segment_size=2, mean_sttr = FALSE, all_segments=TRUE)),
    #     list(c(MSTTR_tokens1_2 = 2/2, MSTTR_tokens3_4 =  2/2, MSTTR_tokens5_6 = 1/2, MSTTR_tokens7_8 = 2/2))
    # )
    #
    # expect_identical(
    #     list(compute_msttr(mytoken,segment_size=3 , mean_sttr = FALSE, all_segments=TRUE, discard_remainder = FALSE)),
    #     list(c(MSTTR_tokens1_3 = 2/3, MSTTR_tokens4_6 =  2/3, MSTTR_tokens7_8 = 1))
    # )

    # Test misspecification of Segment Size
    expect_error(quanteda.textstats:::compute_msttr(mytoken, 0),
                 "MSTTR_segment must be positive")

    # # Case when neither mean segmental TTR or each segment TTR is not requested
    # expect_error(compute_msttr(mytoken,segment_size=2,mean_sttr = FALSE ,all_segments=FALSE),
    #              quanteda.textstats:::message_error("at least one MSTTR value type to be returned"))
})

test_that("textstat_lexdiv.tokens works right when all measures are requested", {
    skip("until all MA measures are made functional")
    mytxt <- "apple orange apple orange pear pear apple orange"
    mytoken <- tokens(mytxt)
    wsize2_MATTR <- (2/2 + 2/2 + 2/2 + 2/2 + 1/2 + 2/2 + 2/2) / 7
    wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4 # 8 tokens split evenly into 4 segments, no remainder

    static_measures <- c("TTR", "C", "R", "CTTR", "U", "S", "K", "D", "Vm", "Maas")
    moving_measures_df <- data.frame(MATTR = wsize2_MATTR, MSTTR = wsize2_MSTTR)

    expect_identical(
        textstat_lexdiv(mytoken, measure = "all",
                        MATTR_window = 2, MSTTR_segment = 2),
        cbind(textstat_lexdiv(mytoken, measure = static_measures),
              moving_measures_df)
    )
})

test_that("textstat_lexdiv works with measure = 'all'", {
    res <- textstat_lexdiv(dfm(tokens("What, oh what, are we doing?")),
                           measure = "all")
    expect_true(
        setequal(names(res),
                 c("document", "TTR", "C", "R", "CTTR", "U", "S", "K", "I", "D", "Vm", "Maas", "lgV0", "lgeV0"))
    )
})

test_that("dfm_split_hyphenated_features works as expected", {
    dfmat <- dfm(tokens("One-two one two three."))
    expect_identical(
        featnames(quanteda.textstats:::dfm_split_hyphenated_features(dfmat)),
        c("one", "two", "three", ".", "-")
    )
})

test_that("Exact tests for Yule's K", {
    txt <- c("a b c d d e e f f f",
             "a b c d d e e f f f g g g g")
    toks <- tokens(txt)

    # from koRpus and in issue #46
    expect_equal(
        round(textstat_lexdiv(toks, "K")$K, 3),
        c(1000, 1122.449)
    )
})
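
# Worked values behind the expectations above: K = 10^4 * (M2 - N) / N^2 with
# M2 = sum(i^2 * V_i). Doc 1: N = 10, M2 = 3*1^2 + 2*2^2 + 1*3^2 = 20, so
# K = 10^4 * (20 - 10) / 10^2 = 1000. Doc 2: N = 14, M2 = 20 + 1*4^2 = 36, so
# K = 10^4 * (36 - 14) / 14^2 = 1122.449 (to 3 dp).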
