library("quanteda")
test_that("textstat_lexdiv computation is correct", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "TTR"),
data.frame(document = c("d1", "d2"), TTR = c(0.25, 0.5),
stringsAsFactors = FALSE)
)
})
test_that("textstat_lexdiv CTTR works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "CTTR")$CTTR,
c(2 / sqrt(2 * 8), 2 / sqrt(2 * 4)),
tolerance = 0.01
)
})
test_that("textstat_lexdiv R works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "R")$R,
c(2 / sqrt(8), 2 / sqrt(4)),
tolerance = 0.01
)
})
test_that("textstat_lexdiv C works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "C")$C,
c(log10(2) / log10(8), log10(2) / log10(4)),
tolerance = 0.01
)
})
test_that("textstat_lexdiv Maas works correctly", {
mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
d2 = "a a b b")))
expect_equivalent(
textstat_lexdiv(mydfm, "Maas")$Maas[1],
sqrt((log10(8) - log10(2)) / log10(8) ^ 2),
tolerance = 0.01
)
})
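# Added cross-check (a sketch, not part of the original suite): the measures
# tested above all have closed forms in V = ntype(x) and N = ntoken(x),
# so they can be verified directly against quanteda's ntype()/ntoken().
# The formulas below restate the expectations encoded in the tests above.
test_that("closed-form measures match their definitions in V and N", {
    mydfm <- dfm(tokens(c(d1 = "b a b a b a b a",
                          d2 = "a a b b")))
    V <- as.numeric(ntype(mydfm))
    N <- as.numeric(ntoken(mydfm))
    result <- textstat_lexdiv(mydfm, c("TTR", "CTTR", "R", "C", "Maas"))
    expect_equivalent(result$TTR, V / N)
    expect_equivalent(result$CTTR, V / sqrt(2 * N), tolerance = 0.001)
    expect_equivalent(result$R, V / sqrt(N), tolerance = 0.001)
    expect_equivalent(result$C, log10(V) / log10(N), tolerance = 0.001)
    expect_equivalent(result$Maas,
                      sqrt((log10(N) - log10(V)) / log10(N) ^ 2),
                      tolerance = 0.001)
})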
test_that("textstat_lexdiv Yule's I works correctly", {
mydfm <- dfm(tokens(c(d1 = "a b c",
d2 = "a a b b c")))
expect_equivalent(
textstat_lexdiv(mydfm, "I")$I[1], 0, tolerance = 0.01
)
expect_equivalent(
textstat_lexdiv(mydfm, "I")$I[2], (3^2) / ((1 + 2 * 2^2) - 3), tolerance = 0.01
)
})
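# Added worked arithmetic for the d2 expectation above (hedged: the form
# I = V^2 / (M2 - V), with M2 = sum(i^2 * V_i) over the frequency spectrum,
# is inferred from the expected value rather than from package documentation).
# For d2 = "a a b b c": V = 3; the spectrum is V_1 = 1 ("c") and V_2 = 2
# ("a", "b"), so M2 = 1^2 * 1 + 2^2 * 2 = 9 and I = 3^2 / (9 - 3) = 1.5.
# For d1 every token is a hapax, so M2 = V; the expectation of 0 shows the
# degenerate all-hapax case is special-cased rather than returned as Inf.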
test_that("textstat_lexdiv works with a single document dfm (#706)", {
mytxt <- "one one two one one two one"
mydfm <- dfm(tokens(mytxt))
expect_equivalent(
textstat_lexdiv(mydfm, c("TTR", "C")),
data.frame(document = "text1", TTR = 0.286, C = 0.356,
stringsAsFactors = FALSE),
tolerance = 0.01
)
})
test_that("raises error when dfm is empty (#1419)", {
mx <- dfm_trim(data_dfm_lbgexample, 1000)
expect_error(textstat_lexdiv(mx, c("TTR", "C")),
quanteda.textstats:::message_error("dfm_empty"))
})
test_that("Yule's K and Herndon's Vm correction are (approximately) correct", {
# read in Latin version of Ch 1 of the Gospel according to St. John
# example from Table 1 of Miranda-Garcia, A, and J Calle-Martin. 2005.
# “Yule's Characteristic K Revisited.” Language Resources and Evaluation
# 39(4): 287–94.
# text source: http://www.latinvulgate.com/verse.aspx?t=1&b=4&c=1
df <- read.csv("../data/stjohn_latin.csv", stringsAsFactors = FALSE)
data_corpus_stjohn <- df %>%
corpus(text_field = "latin") %>%
corpus_group(groups = df$chapter)
docnames(data_corpus_stjohn) <- paste0("chap", 1:4)
data_dfm_stjohn <- data_corpus_stjohn %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
dfm()
# work with chapter 1
data_dfm_stjohnch1 <- dfm_subset(data_dfm_stjohn, chapter == 1)
freqs <- data_dfm_stjohnch1 %>%
featfreq() %>%
head(n = 331) %>%
sort(decreasing = FALSE)
freqnames <- names(freqs)
# replace the observed counts with the frequency spectrum from Table 1
freqs <- c(rep(1, 212),
rep(2, 51),
rep(3, 26),
rep(4, 13),
rep(5, 6),
rep(6, 6),
rep(7, 3),
rep(8, 4),
rep(10, 1),
rep(11, 1),
rep(13, 3),
rep(16, 1),
rep(17, 1),
rep(19, 1),
rep(21, 1),
rep(59, 1))
names(freqs) <- freqnames
dfmat <- as.dfm(matrix(freqs, nrow = 1, dimnames = list(docnames(data_dfm_stjohnch1),
freqnames)))
expect_identical(
as.integer(ntoken(dfmat)), # 770
755L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
)
expect_identical(
as.integer(ntype(dfmat)), # 329
331L # from Miranda-Garcia and Calle-Martin (2005, Table 1)
)
expect_equivalent(
textstat_lexdiv(dfmat, "K"), # 112.767
# from Miranda-Garcia and Calle-Martin (2005, Table 3)
data.frame(document = "chap1", K = 113.091583, stringsAsFactors = FALSE),
tolerance = 0.5
)
# tests on multiple documents - this is Ch 1 and Chs 1-4 as per the first two rows of
# Table 3 of Miranda-Garcia and Calle-Martin (2005)
data_dfm_stjohncomb <- rbind(data_dfm_stjohnch1,
dfm_group(data_dfm_stjohn, rep(1, 4)))
docnames(data_dfm_stjohncomb)[2] <- "chaps1-4"
expect_equivalent(
textstat_lexdiv(data_dfm_stjohncomb, "K"),
data.frame(document = c("chap1", "chaps1-4"), K = c(113.091583, 109.957455),
stringsAsFactors = FALSE),
tolerance = 1
)
# try also Herdan's Vm and Simpson's D - these are VERY WEAK tests
expect_true(
    all(textstat_lexdiv(data_dfm_stjohncomb, "D")[, "D", drop = TRUE] > 0)
)
expect_true(
    all(textstat_lexdiv(data_dfm_stjohncomb, "Vm")[, "Vm", drop = TRUE] > 0)
)
# test the identity from Tweedie and Baayen (1998, Eq. 19):
# Vm^2 = K / 10^4 + 1/N - 1/V, with N = ntoken and V = ntype
# this needs checking - the tolerance value is a fudge
result <- textstat_lexdiv(data_dfm_stjohncomb, c("K", "Vm"))
K <- result[["K"]]
Vm <- result[["Vm"]]
expect_equal(
    Vm ^ 2,
    as.numeric(K / 10 ^ 4 + (1 / ntoken(data_dfm_stjohncomb) -
                             1 / ntype(data_dfm_stjohncomb))),
    tolerance = 0.0013
)
})
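# Added cross-check (a sketch, not part of the original suite): Yule's K has
# the closed form K = 10^4 * (S2 - N) / N^2, where S2 = sum(f^2) over the
# feature frequencies f and N is the token total. Verifying this on a small
# hand-built spectrum guards both the constant and the normalisation.
test_that("Yule's K matches its closed form on a small frequency spectrum", {
    f <- c(w1 = 5, w2 = 3, w3 = 2, w4 = 1, w5 = 1)
    dfmat <- as.dfm(matrix(f, nrow = 1, dimnames = list("d1", names(f))))
    N <- sum(f)      # 12 tokens
    S2 <- sum(f ^ 2) # 40
    expect_equivalent(
        textstat_lexdiv(dfmat, "K")$K,
        10 ^ 4 * (S2 - N) / N ^ 2,
        tolerance = 0.001
    )
})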
# Tests for multiple static measures of lexical diversity
static_measures <- c("TTR", "C", "R", "CTTR", "U", "S", "K", "D", "Vm", "Maas")
test_that("textstat_lexdiv works similarly for corpus and tokens", {
txt <- c(d1 = "b a b a b a b a",
d2 = "a a b b")
mydfm <- dfm(tokens(txt))
mytokens <- tokens(txt)
expect_identical(
textstat_lexdiv(mydfm, measure = static_measures),
textstat_lexdiv(mytokens, measure = static_measures)
)
})
test_that("textstat_lexdiv supports removal of punctuation, numbers and symbols", {
txt <- c(d1 = "a a b b c c",
d2 = "a a , b b . c c / & ^ *** ### 1 2 3 4")
mt <- dfm(tokens(txt))
toks <- tokens(txt)
expect_identical(
textstat_lexdiv(mt["d1", ], measure = static_measures)[, -1],
textstat_lexdiv(mt["d2", ], measure = static_measures)[, -1]
)
expect_identical(
textstat_lexdiv(toks["d1"], measure = static_measures)[, -1],
textstat_lexdiv(toks["d2"], measure = static_measures)[, -1]
)
})
test_that("textstat_lexdiv supports removal of hyphenation", {
y <- dfm(tokens(c(d1 = "apple-pear orange-fruit elephant-ferrari",
d2 = "alpha-beta charlie-delta echo-foxtrot")))
z <- dfm(tokens(c(d1 = "apple pear orange fruit elephant ferrari",
d2 = "alpha beta charlie delta echo foxtrot")))
expect_identical(
textstat_lexdiv(y, measure = static_measures, remove_hyphens = TRUE),
textstat_lexdiv(z, measure = static_measures, remove_hyphens = TRUE)
)
})
test_that("textstat_lexdiv can handle hyphenated words containing duplicated tokens ", {
dfm_nested <- dfm(tokens(corpus(c(d1 = "have we not-we-have bicycle ! % 123 ^ "))))
# "not-we-have" should be split into three tokens, with the hyphens removed;
# the remaining punctuation, symbols, and numbers are removed by the defaults,
# leaving dfm_nested with 4 types and 6 tokens, the same as dfm_non_nested
dfm_non_nested <- dfm(tokens(corpus(c(d1 = "a b b c c d"))))
expect_identical(textstat_lexdiv(dfm_nested, measure = static_measures, remove_hyphens = TRUE),
textstat_lexdiv(dfm_non_nested, measure = static_measures))
})
test_that("textstat_lexdiv.dfm and .tokens work same with remove_* options", {
txt <- c("There's shrimp-kabobs,
shrimp creole, shrimp gumbo. Pan fried, deep fried, stir-fried. There's
pineapple shrimp, lemon shrimp, coconut shrimp, pepper shrimp, shrimp soup,
shrimp stew, shrimp salad, shrimp and potatoes, shrimp burger, shrimp
sandwich.",
"A shrimp-kabob costs $0.50, shrimp costs $0.25.")
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR", remove_hyphens = TRUE),
textstat_lexdiv(dfm(tokens(txt), tolower = FALSE), measure = "TTR", remove_hyphens = TRUE)
)
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR",
remove_punct = TRUE, remove_hyphens = TRUE),
textstat_lexdiv(dfm(tokens(txt)), measure = "TTR",
remove_punct = TRUE, remove_hyphens = TRUE)
)
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR", remove_punct = TRUE),
textstat_lexdiv(dfm(tokens(txt)), measure = "TTR", remove_punct = TRUE)
)
expect_identical(
textstat_lexdiv(tokens(txt[2]), measure = "TTR", remove_symbols = TRUE),
textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = TRUE)
)
expect_true(
textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = TRUE)[1, "TTR"] !=
textstat_lexdiv(dfm(tokens(txt[2])), measure = "TTR", remove_symbols = FALSE)[1, "TTR"]
)
expect_identical(
textstat_lexdiv(tokens(txt), measure = "TTR", remove_numbers = TRUE),
textstat_lexdiv(dfm(tokens(txt)), measure = "TTR", remove_numbers = TRUE)
)
})
test_that("textstat_lexdiv does not support dfm for MATTR and MSTTR", {
mytxt <- "one one two one one two one"
mydfm <- dfm(tokens(mytxt))
expect_error(
textstat_lexdiv(mydfm, measure = "MATTR"),
"average-based measures are only available for tokens inputs"
)
expect_error(
textstat_lexdiv(mydfm, measure = "MSTTR"),
"average-based measures are only available for tokens inputs"
)
})
test_that("textstat_lexdiv.tokens raises errors if parameters for moving measures are not specified", {
# skip("defaults may have changed")
mytxt <- "one one two one one two one"
mytoken <- tokens(mytxt)
expect_warning(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 100),
"MATTR_window exceeds some documents' token lengths, resetting to 7"
)
# expect_error(
# textstat_lexdiv(mytoken, measure = "MSTTR"),
# "MSTTR_segment_size must be specified if MSTTR is to be computed"
# )
})
test_that("textstat_lexdiv.tokens MATTR works correctly on its own", {
mytxt <- "one one two one one two one"
mytoken <- tokens(mytxt)
wsize2_MATTR <- (1/2 + 1 + 1 + 1/2 + 1 + 1) / 6
wsize3_MATTR <- (2/3 + 2/3 + 2/3 + 2/3 + 2/3) / 5
wsize4_MATTR <- (2/4 + 2/4 + 2/4 + 2/4) / 4
expect_identical(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 2)[["MATTR"]],
wsize2_MATTR
)
expect_identical(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 3)[["MATTR"]],
wsize3_MATTR
)
expect_identical(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 4)[["MATTR"]],
wsize4_MATTR
)
expect_warning(
textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = 100),
"MATTR_window exceeds some documents' token lengths, resetting to 7"
)
})
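# Added sketch (not part of the original suite): a naive MATTR reference that
# averages the TTR of every overlapping window of the given size, which is
# the rule the hand computations above encode. naive_mattr is a hypothetical
# helper defined here only for this cross-check.
naive_mattr <- function(x, window) {
    starts <- seq_len(length(x) - window + 1)
    ttrs <- vapply(starts, function(i) {
        w <- x[i:(i + window - 1)]
        length(unique(w)) / window
    }, numeric(1))
    mean(ttrs)
}
test_that("MATTR matches a naive overlapping-window reference", {
    mytoken <- tokens("one one two one one two one")
    for (w in 2:4) {
        expect_equal(
            textstat_lexdiv(mytoken, measure = "MATTR", MATTR_window = w)[["MATTR"]],
            naive_mattr(mytoken[["text1"]], w)
        )
    }
})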
test_that("textstat_lexdiv.tokens MATTR works correctly in conjunction with static measures", {
mytxt <- "one one two one one two one"
mytoken <- tokens(mytxt)
wsize2_MATTR <- (1/2 + 1 + 1 + 1/2 + 1 + 1) / 6
expect_equivalent(
textstat_lexdiv(mytoken, measure = c("TTR", "MATTR"), MATTR_window = 2),
data.frame(textstat_lexdiv(mytoken, measure = "TTR"), MATTR = wsize2_MATTR)
)
})
test_that("textstat_lexdiv.tokens MSTTR works correctly on its own", {
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4
wsize3_MSTTR <- (2/3 + 2/3) / 2 # the trailing "apple orange" remainder is discarded
wsize4_MSTTR <- (2/4 + 3/4) / 2
# Test segment size = 2
expect_equivalent(
textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 2)[["MSTTR"]],
wsize2_MSTTR
)
# Test segment size = 3
expect_equivalent(
    textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 3)[["MSTTR"]],
    wsize3_MSTTR
)
# Test segment size = 4
expect_equivalent(
    textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = 4)[["MSTTR"]],
    wsize4_MSTTR
)
# Test segment size = ntoken: MSTTR reduces to plain TTR
expect_equivalent(
    textstat_lexdiv(mytoken, measure = "MSTTR",
                    MSTTR_segment = length(mytoken[["text1"]]))[["MSTTR"]],
    textstat_lexdiv(mytoken, measure = "TTR")[["TTR"]]
)
})
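# Added sketch (not part of the original suite): a naive MSTTR reference that
# splits the tokens into consecutive non-overlapping segments, drops any short
# trailing remainder, and averages the per-segment TTRs, matching the hand
# computations above. naive_msttr is a hypothetical helper defined here only
# for this cross-check.
naive_msttr <- function(x, segment) {
    nseg <- length(x) %/% segment
    ttrs <- vapply(seq_len(nseg), function(i) {
        seg <- x[((i - 1) * segment + 1):(i * segment)]
        length(unique(seg)) / segment
    }, numeric(1))
    mean(ttrs)
}
test_that("MSTTR matches a naive segmenting reference", {
    mytoken <- tokens("apple orange apple orange pear pear apple orange")
    for (s in 2:4) {
        expect_equal(
            textstat_lexdiv(mytoken, measure = "MSTTR", MSTTR_segment = s)[["MSTTR"]],
            naive_msttr(mytoken[["text1"]], s)
        )
    }
})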
test_that("textstat_lexdiv.tokens MSTTR works correctly in conjunction with static measures", {
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4
expect_equivalent(
textstat_lexdiv(mytoken, measure = c("TTR", "MSTTR"), MSTTR_segment = 2),
data.frame(textstat_lexdiv(mytoken, measure = "TTR"), MSTTR = wsize2_MSTTR)
)
})
test_that("compute_MSTTR internal function has working exception handlers", {
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
expect_warning(
quanteda.textstats:::compute_msttr(mytoken, 20),
"MSTTR_segment exceeds some documents' token lengths, resetting to 8"
)
# expect_identical(
# list(compute_msttr(mytoken,segment_size=2, mean_sttr = FALSE, all_segments=TRUE)),
# list(c(MSTTR_tokens1_2 = 2/2, MSTTR_tokens3_4 = 2/2, MSTTR_tokens5_6 = 1/2, MSTTR_tokens7_8 = 2/2))
# )
#
# expect_identical(
# list(compute_msttr(mytoken,segment_size=3 , mean_sttr = FALSE, all_segments=TRUE, discard_remainder = FALSE)),
# list(c(MSTTR_tokens1_3 = 2/3, MSTTR_tokens4_6 = 2/3, MSTTR_tokens7_8 = 1))
# )
# Test misspecification of Segment Size
expect_error(quanteda.textstats:::compute_msttr(mytoken, 0),
"MSTTR_segment must be positive")
# # Case when neither the mean segmental TTR nor the per-segment TTRs are requested
# expect_error(compute_msttr(mytoken,segment_size=2,mean_sttr = FALSE ,all_segments=FALSE),
# quanteda.textstats:::message_error("at least one MSTTR value type to be returned"))
})
test_that("textstat_lexdiv.tokens works right when all measures are requested", {
skip("until all MA measures are made functional")
mytxt <- "apple orange apple orange pear pear apple orange"
mytoken <- tokens(mytxt)
wsize2_MATTR <- (2/2 + 2/2 + 2/2 + 2/2 + 1/2 + 2/2 + 2/2) / 7
wsize2_MSTTR <- (2/2 + 2/2 + 1/2 + 2/2) / 4 # four full segments, no remainder
static_measures <- c("TTR", "C", "R", "CTTR", "U", "S", "K", "D", "Vm", "Maas")
moving_measures_df <- data.frame(MATTR = wsize2_MATTR, MSTTR = wsize2_MSTTR)
expect_identical(
    textstat_lexdiv(mytoken, measure = "all",
                    MATTR_window = 2, MSTTR_segment = 2),
    cbind(textstat_lexdiv(mytoken, measure = static_measures),
          moving_measures_df)
)
})
test_that("textstat_lexdiv works with measure = 'all'", {
res <- textstat_lexdiv(dfm(tokens("What, oh what, are we doing?")),
measure = "all")
expect_true(
setequal(names(res),
c("document", "TTR", "C", "R", "CTTR", "U", "S", "K", "I", "D", "Vm", "Maas", "lgV0", "lgeV0"))
)
})
test_that("dfm_split_hyphenated_features works as expected", {
dfmat <- dfm(tokens("One-two one two three."))
expect_identical(
featnames(quanteda.textstats:::dfm_split_hyphenated_features(dfmat)),
c("one", "two", "three", ".", "-")
)
})
test_that("Exact tests for Yule's K", {
txt <- c("a b c d d e e f f f",
"a b c d d e e f f f g g g g")
toks <- tokens(txt)
# expected values from koRpus, as reported in issue #46
expect_equal(
round(textstat_lexdiv(toks, "K")$K, 3),
c(1000, 1122.449)
)
})
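# Added worked arithmetic for the expectations above, using
# K = 10^4 * (S2 - N) / N^2:
# doc 1: N = 10, S2 = 3 * 1^2 + 2 * 2^2 + 1 * 3^2 = 20,
#        so K = 10^4 * (20 - 10) / 10^2 = 1000
# doc 2: N = 14, S2 = 20 + 4^2 = 36,
#        so K = 10^4 * (36 - 14) / 14^2 = 1122.449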