Nothing
txt <- c(doc1 = "a B c D e",
doc2 = "a BBB c D e",
doc3 = "Aaaa BBB cc")
toks <- tokens(txt)
testdfm <- dfm(toks, tolower = FALSE)
test_that("test dfm_select, fixed", {
expect_equal(
featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "keep", valuetype = "fixed", verbose = FALSE)),
c("a", "B", "c")
)
expect_equal(
featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "remove", valuetype = "fixed", verbose = FALSE)),
setdiff(featnames(testdfm), c("a", "B", "c"))
)
expect_equal(
featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "keep", valuetype = "fixed", case_insensitive = FALSE, verbose = FALSE)),
c("a", "c")
)
expect_equal(
featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "remove", valuetype = "fixed", case_insensitive = FALSE, verbose = FALSE)),
setdiff(featnames(testdfm), c("a", "c"))
)
expect_equal(
featnames(dfm_select(testdfm, c("aaaa", "bbb", "cc"), selection = "keep", valuetype = "fixed", min_nchar = 3, verbose = FALSE)),
c("BBB", "Aaaa")
)
expect_equal(
featnames(dfm_select(testdfm, c("bbb"), selection = "remove", valuetype = "fixed", min_nchar = 3, verbose = FALSE)),
c("Aaaa")
)
expect_equal(
featnames(dfm_select(testdfm, c("aaaa", "bbb", "cc"), selection = "keep", valuetype = "fixed", min_nchar = 3, max_nchar = 3, verbose = FALSE)),
c("BBB")
)
expect_equal(
featnames(dfm_select(testdfm, c("bbb"), selection = "remove", valuetype = "fixed", min_nchar = 3, max_nchar = 3, verbose = FALSE)),
character()
)
})
test_that("test dfm_select, glob", {
feats <- c("a*", "B*", "c")
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "glob", verbose = FALSE)),
c("a", "B", "c", "BBB", "Aaaa")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "glob", verbose = FALSE)),
c("D", "e", "cc")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "glob", case_insensitive = FALSE, verbose = FALSE)),
c("a", "B", "c", "BBB")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "glob", case_insensitive = FALSE, verbose = FALSE)),
c("D", "e", "Aaaa", "cc")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "glob", min_nchar = 3, verbose = FALSE)),
c("BBB", "Aaaa")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "glob", min_nchar = 3, verbose = FALSE)),
character()
)
})
test_that("test dfm_select, regex", {
feats <- c("[A-Z].*", "c.+")
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "regex", verbose = FALSE)),
c("a", "B", "c", "D", "e", "BBB", "Aaaa", "cc")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "regex", verbose = FALSE)),
character()
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "regex", case_insensitive = FALSE, verbose = FALSE)),
c("B", "D", "BBB", "Aaaa", "cc")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "regex", case_insensitive = FALSE, verbose = FALSE)),
c("a", "c", "e")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "regex", min_nchar = 3, verbose = FALSE)),
c("BBB", "Aaaa")
)
expect_equal(
featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "regex", min_nchar = 3, verbose = FALSE)),
character()
)
})
test_that("glob works if results in no features", {
expect_equal(featnames(dfm_select(testdfm, "notthere")), character())
})
test_that("selection that is out of bounds", {
expect_equal(dfm_select(testdfm), testdfm)
expect_equal(
featnames(dfm_select(testdfm, selection = "keep", min_nchar = 5)),
character()
)
expect_equal(
featnames(dfm_select(testdfm, selection = "remove", min_nchar = 5)),
character()
)
})
test_that("longer selection than longer than features that exist (related to #447)", {
dfmtest <- dfm(tokens(c(d1 = "a b", d2 = "a b c d e")))
feat <- c("b", "c", "d", "e", "f", "g")
expect_message(dfm_select(dfmtest, feat, verbose = TRUE),
"kept 4 features")
expect_message(dfm_remove(dfmtest, feat, verbose = TRUE),
"removed 4 features")
expect_equivalent(
as.matrix(dfm_select(dfmtest, feat)),
matrix(c(1, 1, 0, 1, 0, 1, 0, 1), nrow = 2)
)
})
test_that("test dfm_select with ngrams #589", {
ngramdfm <- dfm(tokens(c("of_the", "in_the", "to_the", "of_our", "and_the", " it_is", "by_the", "for_the")))
expect_equal(featnames(dfm_select(ngramdfm, pattern = c("of_the", "in_the"), valuetype = "fixed")),
c("of_the", "in_the"))
expect_equal(featnames(dfm_select(ngramdfm, pattern = "*_the", valuetype = "glob")),
c("of_the", "in_the", "to_the", "and_the", "by_the", "for_the"))
})
test_that("test dfm_select with ngrams concatenated with whitespace", {
ngramdfm <- dfm(tokens(c("of_the", "in_the", "to_the", "of_our", "and_the", " it_is", "by_the", "for_the")))
colnames(ngramdfm) <- stringi::stri_replace_all_fixed(colnames(ngramdfm), "_", " ")
expect_equal(
featnames(dfm_select(ngramdfm, pattern = c("of the", "in the"), valuetype = "fixed")),
c("of the", "in the")
)
expect_equal(
featnames(dfm_select(ngramdfm, pattern = "* the", valuetype = "glob")),
c("of the", "in the", "to the", "and the", "by the", "for the")
)
})
test_that("dfm_select on a dfm returns equal feature sets", {
txts <- c(d1 = "This is text one", d2 = "The second text", d3 = "This is text three")
dfmt1 <- dfm(tokens(txts[1:2]))
dfmt2 <- dfm(tokens(txts[2:3]))
expect_error({
dfmt3 <- dfm_select(dfmt1, dfmt2)
}, "dfm cannot be used as pattern; use 'dfm_match' instead")
})
test_that("dfm_select removes padding", {
txts <- c(d1 = "This is text one", d2 = "The second text", d3 = "This is text three")
toks <- tokens(txts)
toks <- tokens_remove(toks, stopwords(), padding = TRUE)
testdfm <- dfm(toks)
expect_true("" %in% featnames(testdfm))
testdfm <- dfm_remove(dfm(toks), "")
expect_false("" %in% featnames(testdfm))
})
# test_that("dfm_select raises warning when padding = TRUE but not valuetype = fixed", {
#
# expect_warning(dfm_select(testdfm, c("z", "d", "e"), padding = TRUE),
# "padding is used only when valuetype is 'fixed'")
#
# })
test_that("dfm_select returns empty dfm when not maching features", {
expect_equal(dim(dfm_select(testdfm, pattern = c("x", "y", "z"))),
c(3, 0))
})
test_that("dfm_remove works even when it does not remove anything, issue 711", {
txts <- c(d1 = "This is text one", d2 = "The second text", d3 = "This is text three")
testdfm <- dfm(tokens(txts))
expect_silent(dfm_remove(testdfm, c("xxx", "yyy", "x y")))
expect_equal(featnames(dfm_remove(testdfm, c("xxx", "yyy", "x y"))),
featnames(testdfm))
})
test_that("dfm_select errors when dictionary has multi-word features, issue 775", {
dfm_inaug <- dfm(tokens(data_corpus_inaugural[50:58]))
testdict1 <- dictionary(list(eco = c("compan*", "factory worker*"),
pol = c("political part*", "election*")),
separator = " ")
testdict2 <- dictionary(list(eco = c("compan*", "factory_worker"),
pol = c("political_part*", "election*")),
separator = "_")
expect_equal(
featnames(dfm_select(dfm_inaug, pattern = testdict1, valuetype = "glob")),
c("election", "elections", "company", "companies")
)
expect_equal(
featnames(dfm_select(dfm_inaug, pattern = phrase(testdict1), valuetype = "glob")),
c("political", "election", "part", "parties", "elections", "partisan", "company", "participation", "party", "partisanship", "partial", "companies")
)
expect_equal(
featnames(dfm_select(dfm_inaug, pattern = testdict2, valuetype = "glob")),
c("election", "elections", "company", "companies")
)
expect_equal(
featnames(dfm_select(dfm_inaug, pattern = phrase(testdict2), valuetype = "glob")),
c("political", "election", "part", "parties", "elections", "partisan", "company", "participation", "party", "partisanship", "partial", "companies")
)
})
# test_that("dfm_select works when selecting on collocations", {
# txt <- c(d1 = "a b c d e g h", d2 = "a b e g h i j")
# toks_uni <- tokens(txt)
# dfm_uni <- dfm(toks_uni)
# toks_bi <- tokens(txt) %>% tokens_ngrams(n = 2, concatenator = " ")
# dfm_bi <- dfm(toks_bi)
# coll_bi <- textstat_collocations(toks_uni, size = 2, min_count = 2)
# coll_tri <- textstat_collocations(toks_uni, size = 3, min_count = 2)
#
# expect_equal(
# dim(dfm_select(dfm_uni, coll_bi)),
# c(2, 0)
# )
# expect_equal(
# dim(dfm_select(dfm_uni, coll_tri)),
# c(2, 0)
# )
#
# expect_equal(sum(dfm_select(dfm_bi, coll_bi)), 6)
# expect_equal(featnames(dfm_select(dfm_bi, coll_bi)), c("a b", "e g", "g h"))
#
# # wrong
# expect_equal(dim(dfm_select(dfm_bi, coll_tri)), c(2, 0))
# expect_equal(featnames(dfm_select(dfm_bi, coll_tri)), character())
# })
test_that("shortcut functions works", {
testdfm <- dfm(tokens(data_corpus_inaugural[1:5]))
expect_equal(dfm_select(testdfm, stopwords("english"), selection = "keep"),
dfm_keep(testdfm, stopwords("english")))
expect_equal(dfm_select(testdfm, stopwords("english"), selection = "remove"),
dfm_remove(testdfm, stopwords("english")))
})
test_that("dfm_remove/keep fail if selection argument is used", {
dfmt <- dfm(tokens(c("a b c d d", "a a b c d")))
expect_error(
dfm_remove(dfmt, c("b", "c"), selection = "remove"),
"dfm_remove cannot include selection argument"
)
expect_error(
dfm_keep(dfmt, c("b", "c"), selection = "keep"),
"dfm_keep cannot include selection argument"
)
})
test_that("dfm_remove works when selection is a dfm (#1320)", {
d1 <- dfm(tokens("a b b c c c d d d d"))
d2 <- dfm(tokens("d d d a a"))
expect_error({
d3 <- dfm_remove(d1, pattern = d2)
}, "dfm cannot be used as pattern; use 'dfm_match' instead")
expect_error({
d4 <- dfm_select(d1, pattern = d2, selection = "remove")
}, "dfm cannot be used as pattern; use 'dfm_match' instead")
})
test_that("really long words are not removed in tokens() (#1713)", {
dfmat <- dfm(tokens("one two DonaudampfschiffahrtselektrizittenhauptbetriebswerkbauunterbeamtengesellschaftXXX"))
expect_equivalent(nfeat(dfmat), 3)
})
test_that("padding in dfm_select works in the same way as tokens_select (#2152)", {
corp <- corpus(c("a b c d d", "a a b c d"),
docvars = data.frame(var1 = c(1, 2), var2 = c(TRUE, FALSE)))
toks1 <- tokens(corp)
dfmt1 <- dfm(toks1)
expect_equal(
as.vector(dfm_remove(dfmt1, c("a", "b"), padding = TRUE)[,1]),
c(2, 3)
)
expect_equal(
dim(dfm_remove(dfmt1, c("a", "b"), padding = TRUE)),
c(2, 3)
)
expect_equal(
dim(dfm_remove(dfmt1, c("z"), padding = TRUE)),
c(2, 4)
)
expect_equal(dfm(tokens_select(toks1, "d", padding = TRUE)),
dfm_select(dfmt1, "d", padding = TRUE))
expect_equal(dfm(tokens_select(toks1, "z", padding = TRUE)),
dfm_select(dfmt1, "z", padding = TRUE))
expect_equal(dfm(tokens_remove(toks1, "d", padding = TRUE)),
dfm_remove(dfmt1, "d", padding = TRUE))
expect_equal(dfm(tokens_remove(toks1, "z", padding = TRUE)),
dfm_remove(dfmt1, "z", padding = TRUE))
# objects that already have padding
toks2 <- tokens_remove(toks1, "c", padding = TRUE)
dfmt2 <- dfm(toks2, remove_padding = FALSE)
expect_equal(
as.vector(dfm_remove(dfmt2, c("a", "b"), padding = TRUE)[,1]),
c(3, 4)
)
expect_equal(
dim(dfm_remove(dfmt2, c("a", "b"), padding = TRUE)),
c(2, 2)
)
expect_equal(
dim(dfm_remove(dfmt2, c("z"), padding = TRUE)),
c(2, 4)
)
expect_equal(dfm(tokens_select(toks2, "d", padding = TRUE)),
dfm_select(dfmt2, "d", padding = TRUE))
expect_equal(dfm(tokens_select(toks2, "z", padding = TRUE)),
dfm_select(dfmt2, "z", padding = TRUE))
expect_equal(dfm(tokens_remove(toks2, "d", padding = TRUE)),
dfm_remove(dfmt2, "d", padding = TRUE))
expect_equal(dfm(tokens_remove(toks2, "z", padding = TRUE)),
dfm_remove(dfmt2, "z", padding = TRUE))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.