Nothing
test_that("test c.corpus", {
suppressWarnings({
expect_equal(
matrix(dfm(corpus(c("What does the fox say?", "What does the fox say?", "")),
remove_punct = TRUE)),
matrix(rep(c(1, 1, 0), 5), nrow = 15, ncol = 1)
)
})
})
## rbind.dfm
# TODO: Add function for testing the equality of dfms
test_that("test rbind.dfm with different columns", {
dfmt1 <- dfm(tokens(c(text1 = "What does the fox?"), remove_punct = TRUE))
dfmt2 <- dfm(tokens(c(text2 = "fox say"), remove_punct = TRUE))
dfmt3 <- rbind(dfmt1, dfmt2)
dfmt4 <- as.dfm(matrix(c(1, 0, 1, 1, 0, 1, 1, 0, 1, 0), nrow = 2,
dimnames = list(c("text1", "text2"),
c("does", "fox", "say", "the", "what"))))
expect_true(
setequal(featnames(dfmt3), featnames(dfmt4))
)
expect_that(
rbind(dfmt1, dfmt2),
is_a("dfm")
)
})
test_that("test rbind.dfm with different columns, three args and repeated words", {
dfmt1 <- dfm(tokens("What does the?", remove_punct = TRUE))
dfmt2 <- dfm(tokens("fox say fox", remove_punct = TRUE))
dfmt3 <- dfm(tokens("The quick brown fox", remove_punct = TRUE))
dfmt4 <- rbind(dfmt1, dfmt2, dfmt3)
dfmt5 <- as.dfm(matrix(
c(0, 0, 1, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0),
nrow = 3,
dimnames = list(
c("text1", "text1", "text1"),
c("brown", "does", "fox", "quick", "say", "the", "what")
)
))
expect_true(
setequal(featnames(dfmt4), featnames(dfmt5))
)
expect_that(
rbind(dfmt1, dfmt2, dfmt3),
is_a("dfm")
)
})
test_that("test rbind.dfm with a single argument returns the same dfm", {
fox <- "What does the fox say?"
expect_true(
all(
rbind(dfm(tokens(fox))) == dfm(tokens(fox))
)
)
expect_that(
rbind(dfm(tokens(fox, remove_punct = TRUE))),
is_a("dfm")
)
})
test_that("test rbind.dfm with the same features, but in a different order", {
fox <- "What does the fox say?"
xof <- "say fox the does What??"
foxdfm <- rep(1, 20)
dim(foxdfm) <- c(4, 5)
colnames(foxdfm) <- c("does", "fox", "say", "the", "what")
rownames(foxdfm) <- rep(c("text1", "text2"), 2)
dfm1 <- dfm(tokens(c(fox, xof), remove_punct = TRUE))
expect_true(
all(rbind(dfm1, dfm1) == foxdfm)
)
})
test_that("dfm keeps all types with > 10,000 documents (#438) (a)", {
generate_testdfm <- function(n) {
dfm(tokens(paste("X", seq_len(n), sep = "")))
}
expect_equal(nfeat(generate_testdfm(10000)), 10000)
expect_equal(nfeat(generate_testdfm(20000)), 20000)
})
test_that("dfm keeps all types with > 10,000 documents (#438) (b)", {
set.seed(10)
generate_testdfm <- function(n) {
dfm(tokens(paste(sample(letters, n, replace = TRUE), 1:n)))
}
expect_equal(nfeat(generate_testdfm(10000)), 10026)
expect_equal(nfeat(generate_testdfm(10001)), 10027)
})
test_that("dfm.dfm works as expected", {
corp <- data_corpus_inaugural
toks <- tokens(corp)
dfmt <- dfm(toks, tolower = FALSE)
expect_identical(dfm(toks, tolower = FALSE), dfm(dfmt, tolower = FALSE))
expect_identical(dfm(toks, tolower = TRUE), dfm(dfmt, tolower = TRUE))
expect_identical(dfmt, dfm(dfmt, tolower = FALSE))
expect_identical(dfm_tolower(dfmt), dfm(dfmt, tolower = TRUE))
# REMOVED in v3
# expect_true({
# sum(suppressWarnings(dfm(tokens(corp), select = c("The", "a", "an")))) >
# sum(suppressWarnings(dfm(tokens(corp), select = c("The", "a", "an"), case_insensitive = FALSE)))
# })
# expect_identical(dfm(dfmt, remove = c("The", "a", "an"), case_insensitive = FALSE, tolower = FALSE),
# dfm_remove(dfmt, c("The", "a", "an"), case_insensitive = FALSE))
# expect_identical(dfm(dfmt, remove = c("The", "a", "an"), case_insensitive = TRUE, tolower = FALSE),
# dfm_remove(dfmt, c("The", "a", "an"), case_insensitive = TRUE))
# expect_identical(dfm(dfmt, remove = c("The", "a", "an"), case_insensitive = FALSE),
# dfm(tokens_remove(toks, c("The", "a", "an"), case_insensitive = FALSE)))
# expect_identical(dfm(dfmt, remove = c("The", "a", "an"), case_insensitive = TRUE),
# dfm(tokens_remove(toks, c("The", "a", "an"), case_insensitive = TRUE)))
# DEPRECATED
dfmt_group <- suppressWarnings(dfm(dfmt,
groups = ifelse(docvars(data_corpus_inaugural, "Party") %in%
c("Democratic", "Republican"), "Mainstream", "Minor"),
tolower = FALSE))
expect_identical(colSums(dfmt_group), colSums(dfmt_group))
expect_identical(docnames(dfmt_group), c("Mainstream", "Minor"))
dict <- dictionary(list(articles = c("The", "a", "an"),
preps = c("of", "for", "In")), tolower = FALSE)
# REMOVED in v3
# expect_true({
# sum(dfm(tokens(corp), dictionary = dict)) >
# sum(dfm(tokens(corp), dictionary = dict, case_insensitive = FALSE))
# })
# DEPRECATED
expect_equivalent(
suppressWarnings(dfm(corp, dictionary = dict)),
suppressWarnings(dfm(dfmt, dictionary = dict))
)
# DEPRECATED
expect_equivalent(
suppressWarnings(dfm(dfmt, dictionary = dict)),
dfm(tokens_lookup(toks, dict))
)
# REMOVED
# expect_equivalent(
# suppressWarnings(dfm(corp, dictionary = dict, case_insensitive = FALSE)),
# dfm(dfmt, dictionary = dict, case_insensitive = FALSE)
# )
# REMOVED
# expect_equivalent(
# dfm(dfmt, dictionary = dict, case_insensitive = FALSE),
# dfm(tokens_lookup(toks, dict, case_insensitive = FALSE))
# )
# DEPRECATED
expect_identical(
suppressWarnings(dfm(tokens(corp), stem = TRUE)),
suppressWarnings(dfm(dfmt, stem = TRUE))
)
expect_identical(
suppressWarnings(dfm(tokens(corp), stem = TRUE)),
suppressWarnings(dfm(dfmt, stem = TRUE))
)
})
test_that("cbind.dfm works as expected", {
dfm1 <- dfm(tokens("This is one sample text sample"))
dfm2 <- dfm(tokens("More words here"))
dfm12 <- cbind(dfm1, dfm2)
expect_equal(nfeat(dfm12), 8)
expect_equal(names(dimnames(dfm12)),
c("docs", "features"))
})
test_that("cbind.dfm works with non-dfm objects", {
dfm1 <- dfm(tokens(c("a b c", "c d e")))
vec <- c(10, 20)
expect_equal(
as.matrix(cbind(dfm1, vec)),
matrix(c(1, 1, 1, 0, 0, 10, 0, 0, 1, 1, 1, 20), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "feat1")))
)
expect_equal(
as.matrix(cbind(vec, dfm1)),
matrix(c(10, 1, 1, 1, 0, 0, 20, 0, 0, 1, 1, 1), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c("feat1", letters[1:5])))
)
mat <- matrix(1:4, nrow = 2, dimnames = list(NULL, c("f1", "f2")))
expect_equal(
as.matrix(cbind(dfm1, mat)),
matrix(c(1,1,1,0,0,1,3, 0,0,1,1,1,2,4), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "f1", "f2")))
)
expect_equal(
as.matrix(cbind(mat, dfm1)),
matrix(c(1,3,1,1,1,0,0, 2,4,0,0,1,1,1), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c("f1", "f2", letters[1:5])))
)
expect_equal(
as.matrix(cbind(dfm1, vec, mat)),
matrix(c(1,1,1,0,0,10,1,3, 0,0,1,1,1,20,2,4), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"),
features = c(letters[1:5], "feat1", "f1", "f2")))
)
expect_equal(
suppressWarnings(as.matrix(cbind(vec, dfm1, vec))),
matrix(c(10,1,1,1,0,0,10, 20,0,0,1,1,1,20), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"),
features = c("feat1", letters[1:5], "feat1")))
)
expect_warning(
cbind(vec, dfm1, vec),
"cbinding dfms with overlapping features"
)
expect_warning(
cbind(dfm1, dfm1),
"cbinding dfms with overlapping features"
)
expect_equal(
as.matrix(cbind(dfm1, 100)),
matrix(c(1, 1, 1, 0, 0, 100, 0, 0, 1, 1, 1, 100), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "feat1")))
)
})
test_that("more cbind tests for dfms", {
txts <- c("a b c d", "b c d e")
mydfm <- dfm(tokens(txts))
expect_equal(
as.matrix(cbind(mydfm, as.dfm(cbind("ALL" = ntoken(mydfm))))),
matrix(c(1,1,1,1,0,4, 0,1,1,1,1,4), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "ALL")))
)
expect_equal(
as.matrix(cbind(mydfm, cbind("ALL" = ntoken(mydfm)))),
matrix(c(1,1,1,1,0,4, 0,1,1,1,1,4), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "ALL")))
)
expect_equal(
as.matrix(cbind(mydfm, "ALL" = ntoken(mydfm))),
matrix(c(1,1,1,1,0,4, 0,1,1,1,1,4), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "ALL")))
)
expect_equal(
as.matrix(cbind(mydfm, ntoken(mydfm))),
matrix(c(1,1,1,1,0,4, 0,1,1,1,1,4), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = c(letters[1:5], "feat1")))
)
})
test_that("cbind.dfm keeps attributes of the dfm", {
mx1 <- as.dfm(matrix(c(0, 0, 0, 0, 1, 2), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("aa", "bb", "cc"))))
mx2 <- as.dfm(matrix(c(2, 3, 0, 0, 0, 0), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("dd", "ee", "ff"))))
meta(mx1, "settings") <- list(somesetting = "somevalue")
mx3 <- cbind(mx1, mx2)
expect_equal(meta(mx3), list(settings = list(somesetting = "somevalue")))
})
test_that("rbind.dfm works as expected", {
dfm1 <- dfm(tokens("This is one sample text sample"))
dfm2 <- dfm(tokens("More words here"))
dfm12 <- rbind(dfm1, dfm2)
expect_equal(nfeat(dfm12), 8)
expect_equal(ndoc(dfm12), 2)
expect_equal(names(dimnames(dfm12)),
c("docs", "features"))
})
test_that("dfm(x, dictionary = mwvdict) works with multi-word values", {
mwvdict <- dictionary(list(sequence1 = "a b", sequence2 = "x y", notseq = c("d", "e")))
txt <- c(d1 = "a b c d e f g x y z",
d2 = "a c d x z",
d3 = "x y",
d4 = "f g")
toks <- tokens(txt)
# as dictionary
dfm1 <- suppressWarnings(dfm(toks, dictionary = mwvdict, verbose = TRUE))
expect_identical(
as.matrix(dfm1),
matrix(c(1, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 0),
nrow = 4,
dimnames = list(docs = paste0("d", 1:4),
features = c("sequence1", "sequence2", "notseq")))
)
# as thesaurus
dfm2 <- suppressWarnings(dfm(toks, thesaurus = mwvdict, verbose = TRUE))
expect_identical(
as.matrix(dfm2),
matrix(c(1, 0, 0, 0, 1, 1, 0, 0, 2, 1, 0, 0, 1, 0, 0, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0),
nrow = 4,
dimnames = list(docs = paste0("d", 1:4),
features = c("SEQUENCE1", "c", "NOTSEQ", "f", "g",
"SEQUENCE2", "z", "a", "x")))
)
})
test_that("dfm works with relational operators", {
testdfm <- dfm(tokens(c("This is an example.", "This is a second example.")))
expect_is(testdfm == 0, "lgCMatrix")
expect_is(testdfm >= 0, "lgCMatrix")
expect_is(testdfm <= 0, "lgCMatrix")
expect_is(testdfm < 0, "lgCMatrix")
expect_is(testdfm < 1, "lgCMatrix")
expect_is(testdfm > 0, "lgCMatrix")
expect_is(testdfm > 1, "lgCMatrix")
expect_is(testdfm > -1, "lgCMatrix")
expect_is(testdfm < -1, "lgCMatrix")
})
test_that("dfm addition (+) keeps attributes #1279", {
dfmt <- head(data_dfm_lbgexample, 4)
# @settings slot
meta(dfmt, "testsetting") <- list(test = 1)
expect_equal(
meta(dfmt + 1)["testsetting"],
list(testsetting = list(test = 1))
)
expect_equal(
meta(1 + dfmt)["testsetting"],
list(testsetting = list(test = 1))
)
# @weightTf slot
dfmt@meta$object$weight_tf <- list(scheme = "prop", base = exp(1), K = 2)
expect_equal(
(dfmt + 1)@meta$object$weight_tf,
list(scheme = "prop", base = exp(1), K = 2)
)
expect_equal(
(1 + dfmt)@meta$object$weight_tf,
list(scheme = "prop", base = exp(1), K = 2)
)
# @weightDf slot
weight <- list(scheme = "idf", base = NULL, c = NULL,
smoothing = NULL, threshold = NULL)
dfmt@meta$object$weight_df <- weight
expect_equal(
(dfmt + 1)@meta$object$weight_df,
weight
)
expect_equal(
(1 + dfmt)@meta$object$weight_df,
weight
)
# @smooth slot
dfmt@meta$object$smooth <- 5.5
expect_equal(
(dfmt + 1)@meta$object$smooth,
5.5
)
expect_equal(
(1 + dfmt)@meta$object$smooth,
5.5
)
# @ngrams slot
dfmt@meta$object$ngram <- 5L
expect_equal(
(dfmt + 1)@meta$object$ngram,
5L
)
expect_equal(
(1 + dfmt)@meta$object$ngram,
5L
)
# @skip slot
dfmt@meta$object$skip <- 3L
expect_equal(
(dfmt + 1)@meta$object$skip,
3L
)
expect_equal(
(1 + dfmt)@meta$object$skip,
3L
)
# @concatenator slot
dfmt@meta$object$concatenator <- "+-+"
expect_equal(
(dfmt + 1)@meta$object$concatenator,
"+-+"
)
expect_equal(
(1 + dfmt)@meta$object$concatenator,
"+-+"
)
# @version slot
dfmt@meta$system$`package-version` <- as.package_version("10.5.1")
expect_equal(
(dfmt + 1)@meta$system$`package-version`,
as.package_version("10.5.1")
)
expect_equal(
(1 + dfmt)@meta$system$`package-version`,
as.package_version("10.5.1")
)
# @docvars slot
dfmt@docvars <- data.frame(test = letters[1:ndoc(dfmt)])
expect_equal(
(dfmt + 1)@docvars,
data.frame(test = letters[1:ndoc(dfmt)])
)
expect_equal(
(1 + dfmt)@docvars,
data.frame(test = letters[1:ndoc(dfmt)])
)
})
test_that("dfm's document counts in verbose message is correct", {
txt <- c(d1 = "a b c d e f g x y z",
d2 = "a c d x z",
d3 = "x y",
d4 = "f g")
expect_message(suppressWarnings(dfm(tokens(txt), remove = c("a", "f"), verbose = TRUE)),
"removed 2 features")
expect_message(suppressWarnings(dfm(tokens(txt), select = c("a", "f"), verbose = TRUE)),
"kept 2 features")
})
test_that("dfm print works with options as expected", {
dfmt <- dfm(tokens(data_corpus_inaugural[1:14],
remove_punct = FALSE, remove_numbers = FALSE, split_hyphens = TRUE))
expect_output(
print(dfmt, max_ndoc = 6, max_nfeat = 10, show_summary = TRUE),
paste0("^Document-feature matrix of: 14 documents, 4,452 features \\(81\\.97% sparse\\) and 4 docvars",
".*",
"\\[ reached max_ndoc \\.\\.\\. 8 more documents, reached max_nfeat \\.\\.\\. 4,442 more features \\]$")
)
expect_output(
print(dfmt[1:5, 1:5], max_ndoc = 6, max_nfeat = 10, show_summary = TRUE),
paste0("^Document-feature matrix of: 5 documents, 5 features \\(4\\.00% sparse\\) and 4 docvars\\.",
".*",
"1789-Washington\\s+3\\s+2\\s+5\\s+71\\s+116")
)
expect_output(
print(dfmt[1:5, 1:5], max_ndoc = -1, max_nfeat = -1, show_summary = TRUE),
paste0("^Document-feature matrix of: 5 documents, 5 features \\(4\\.00% sparse\\) and 4 docvars\\.",
".*",
"1805-Jefferson\\s+8\\s+1\\s+10\\s+101\\s+143")
)
expect_output(
print(dfmt[1:5, 1:5], max_ndoc = 0, max_nfeat = -1, show_summary = TRUE),
"^Document-feature matrix of: 5 documents, 5 features \\(4\\.00% sparse\\) and 4 docvars\\.$"
)
expect_output(
print(dfmt[1:5, 1:5], max_ndoc = -1, max_nfeat = 0, show_summary = TRUE),
paste0("^Document-feature matrix of: 5 documents, 5 features \\(4\\.00% sparse\\) and 4 docvars\\.",
"\\n",
"\\[ reached max_nfeat \\.\\.\\. 5 more features ]$")
)
expect_output(
print(dfmt, max_ndoc = 6, max_nfeat = 10, show_summary = FALSE),
paste0("^\\s+features",
".*",
"\\[ reached max_ndoc \\.\\.\\. 8 more documents, reached max_nfeat \\.\\.\\. 4,442 more features \\]$")
)
expect_error(print(dfmt, max_ndoc = -2),
"The value of max_ndoc must be between -1 and Inf")
expect_error(print(dfmt, max_nfeat = -2),
"The value of max_nfeat must be between -1 and Inf")
})
test_that("cannot supply remove and select in one call (#793)", {
txt <- c(d1 = "one two three", d2 = "two three four", d3 = "one three four")
corp <- corpus(txt, docvars = data.frame(grp = c(1, 1, 2)))
toks <- tokens(corp)
expect_error(
suppressWarnings(dfm(txt, select = "one", remove = "two")),
"only one of select and remove may be supplied at once"
)
expect_error(
suppressWarnings(dfm(corp, select = "one", remove = "two")),
"only one of select and remove may be supplied at once"
)
expect_error(
dfm(toks, select = "one", remove = "two"),
"only one of select and remove may be supplied at once"
)
expect_error(
dfm(dfm(toks), select = "one", remove = "two"),
"only one of select and remove may be supplied at once"
)
})
test_that("dfm with selection options produces correct output", {
txt <- c(d1 = "a b", d2 = "a b c d e")
toks <- tokens(txt)
dfmt <- dfm(toks)
feat <- c("b", "c", "d", "e", "f", "g")
expect_message(
suppressWarnings(dfm(txt, remove = feat, verbose = TRUE)),
"removed 4 features"
)
expect_message(
suppressWarnings(dfm(toks, remove = feat, verbose = TRUE)),
"removed 4 features"
)
expect_message(
suppressWarnings(dfm(dfmt, remove = feat, verbose = TRUE)),
"removed 4 features"
)
})
test_that("dfm works with stem options", {
txt_english <- "running ran runs"
txt_french <- "courant courir cours"
quanteda_options(language_stemmer = "english")
expect_equal(
as.character(tokens_wordstem(tokens(txt_english))),
c("run", "ran", "run")
)
expect_equal(
featnames(dfm(tokens(txt_english))),
c("running", "ran", "runs")
)
expect_equal(
featnames(suppressWarnings(dfm(tokens(txt_english), stem = TRUE))),
c("run", "ran")
)
expect_error(
suppressWarnings(dfm(tokens(txt_english), stem = c(TRUE, FALSE))),
"The length of stem must be 1"
)
quanteda_options(language_stemmer = "french")
expect_equal(
as.character(tokens_wordstem(tokens(txt_french))),
rep("cour", 3)
)
expect_equal(
featnames(dfm(tokens(txt_french))),
c("courant", "courir", "cours")
)
expect_equal(
featnames(suppressWarnings(dfm(tokens(txt_french), stem = TRUE))),
"cour"
)
quanteda_options(reset = TRUE)
})
test_that("dfm verbose option prints correctly", {
txt <- c(d1 = "a b c d e", d2 = "a a b c c c")
corp <- corpus(txt)
toks <- tokens(txt)
mydfm <- dfm(toks)
expect_message(suppressWarnings(dfm(txt, verbose = TRUE)), "Creating a dfm from a character input")
expect_message(suppressWarnings(dfm(corp, verbose = TRUE)), "Creating a dfm from a corpus input")
expect_message(dfm(toks, verbose = TRUE), "Creating a dfm from a tokens input")
expect_message(dfm(mydfm, verbose = TRUE), "Creating a dfm from a dfm input")
})
test_that("dfm works with purrr::map (#928)", {
skip_if_not_installed("purrr")
a <- "a b"
b <- "a a a b b"
suppressWarnings(expect_identical(
vapply(purrr::map(list(a, b), dfm), is.dfm, logical(1)),
c(TRUE, TRUE)
))
suppressWarnings(expect_identical(
vapply(purrr::map(list(corpus(a), corpus(b)), dfm), is.dfm, logical(1)),
c(TRUE, TRUE)
))
expect_identical(
vapply(purrr::map(list(tokens(a), tokens(b)), dfm), is.dfm, logical(1)),
c(TRUE, TRUE)
)
expect_identical(
vapply(purrr::map(list(dfm(tokens(a)), dfm(tokens(b))), dfm), is.dfm, logical(1)),
c(TRUE, TRUE)
)
})
test_that("dfm works when features are created (#946", {
dfm1 <- as.dfm(matrix(1:6, nrow = 2,
dimnames = list(c("doc1", "doc2"), c("a", "b", "c"))))
dfm2 <- as.dfm(matrix(1:6, nrow = 2,
dimnames = list(c("doc1", "doc2"), c("b", "c", "feat_2"))))
expect_equal(
as.matrix(dfm_match(dfm1, featnames(dfm2))),
matrix(c(3, 4, 5, 6, 0, 0), nrow = 2,
dimnames = list(docs = c("doc1", "doc2"), features = c("b", "c", "feat_2")))
)
expect_equal(
as.matrix(cbind(dfm(tokens("a b")), dfm(tokens("feat_1")))),
matrix(c(1, 1, 1), nrow = 1, dimnames = list(docs = "text1", features = c("a", "b", "feat_1")))
)
})
test_that("dfm warns of argument not used", {
txt <- c(d1 = "a b c d e", d2 = "a a b c c c")
corp <- corpus(txt)
toks <- tokens(txt)
mx <- dfm(toks)
expect_warning(dfm(txt, xxxxx = "something", yyyyy = "else"),
"^xxxxx, yyyyy arguments are not used")
expect_warning(dfm(corp, xxxxx = "something", yyyyy = "else"),
"^xxxxx, yyyyy arguments are not used")
expect_warning(dfm(toks, xxxxx = "something", yyyyy = "else"),
"^xxxxx, yyyyy arguments are not used")
expect_warning(dfm(mx, xxxxx = "something", yyyyy = "else"),
"^xxxxx, yyyyy arguments are not used")
})
test_that("dfm pass arguments to tokens, issue #1121", {
txt <- data_char_sampletext
corp <- corpus(txt)
suppressWarnings({
expect_equal(dfm(txt, what = "character"),
dfm(tokens(corp, what = "character")))
expect_equivalent(dfm(txt, what = "character"),
dfm(tokens(txt, what = "character")))
expect_equal(dfm(txt, remove_punct = TRUE),
dfm(tokens(corp, remove_punct = TRUE)))
expect_equivalent(dfm(txt, remove_punct = TRUE),
dfm(tokens(txt, remove_punct = TRUE)))
})
})
test_that("dfm error when a dfm is given to for feature selection when x is not a dfm, #1067", {
txt <- c(d1 = "a b c d e", d2 = "a a b c c c")
corp <- corpus(txt)
toks <- tokens(txt)
mx <- dfm(toks)
mx2 <- dfm(tokens(c("a b", "c")))
expect_error(suppressWarnings(dfm(txt, select = mx2)),
"dfm cannot be used as pattern")
expect_error(suppressWarnings(dfm(corp, select = mx2)),
"dfm cannot be used as pattern")
expect_error(suppressWarnings(dfm(toks, select = mx2)),
"dfm cannot be used as pattern")
expect_error(suppressWarnings(dfm(mx, select = mx2)),
"dfm cannot be used as pattern; use 'dfm_match' instead")
})
test_that("test topfeatures", {
expect_identical(
topfeatures(dfm(tokens("a a a a b b b c c d")), scheme = "count"),
c(a = 4, b = 3, c = 2, d = 1)
)
expect_error(
topfeatures(dfm(tokens("a a a a b b b c c d")), "count"),
"n must be a number"
)
dfmat <- corpus(c("a b b c", "b d", "b c"),
docvars = data.frame(numdv = c(1, 2, 1))) %>%
tokens() %>%
dfm()
expect_identical(
topfeatures(dfmat, groups = numdv),
list("1" = c(b = 3, c = 2, a = 1, d = 0),
"2" = c(b = 1, d = 1, a = 0, c = 0))
)
expect_identical(
topfeatures(dfmat, scheme = "docfreq"),
c(b = 3L, c = 2L, a = 1L, d = 1L)
)
expect_identical(
topfeatures(dfm_weight(dfmat, scheme = "prop"), groups = numdv),
list("1" = c(b = 1.00, c = 0.75, a = 0.25, d = 0.00),
"2" = c(b = 0.5, d = 0.5, a = 0, c = 0))
)
})
test_that("test sparsity", {
expect_equal(
sparsity(dfm(tokens(c("a a a a c c d", "b b b")))),
0.5
)
})
test_that("test null dfm is handled properly", {
mx <- quanteda:::make_null_dfm()
# constructor
expect_equal(dfm(mx), mx)
# selection and grouping
expect_equal(dfm_select(mx), mx)
expect_equal(dfm_select(mx, "a"), mx)
expect_equal(dfm_trim(mx), mx)
expect_equal(dfm_sample(mx), mx)
expect_equal(dfm_subset(mx), mx)
expect_equal(dfm_compress(mx, "both"), mx)
expect_equal(dfm_compress(mx, "features"), mx)
expect_equal(dfm_compress(mx, "documents"), mx)
expect_equal(dfm_sort(mx, margin = "both"), mx)
expect_equal(dfm_sort(mx, margin = "features"), mx)
expect_equal(dfm_sort(mx, margin = "documents"), mx)
expect_equal(dfm_lookup(mx, dictionary(list(A = "a"))), mx)
expect_equal(dfm_group(mx), mx)
expect_equal(dfm_replace(mx, "A", "a"), mx)
expect_equal(head(mx), mx)
expect_equal(tail(mx), mx)
# weighting
expect_equal(topfeatures(mx), numeric())
expect_equal(dfm_weight(mx, "count"), mx)
expect_equal(dfm_weight(mx, "prop"), mx)
expect_equal(dfm_weight(mx, "propmax"), mx)
expect_equal(dfm_weight(mx, "logcount"), mx)
expect_equal(dfm_weight(mx), mx)
expect_equal(dfm_weight(mx, "augmented"), mx)
expect_equal(dfm_weight(mx, "boolean"), mx)
expect_equal(dfm_weight(mx, "logave"), mx)
expect_equal(dfm_tfidf(mx), mx)
expect_equal(docfreq(mx), numeric())
expect_equal(dfm_smooth(mx), mx)
# transformation
expect_equal(dfm_tolower(mx), mx)
expect_equal(dfm_toupper(mx), mx)
expect_equal(dfm_wordstem(mx), mx)
# binding
expect_equal(rbind(mx, mx), mx)
expect_equal(cbind(mx, mx), mx)
expect_output(print(mx),
"Document-feature matrix of: 0 documents, 0 features (0.00% sparse) and 0 docvars.", fixed = TRUE)
})
test_that("test empty dfm is handled properly (#1419)", {
mx <- dfm_trim(data_dfm_lbgexample, 1000)
docvars(mx) <- data.frame(var = c(1, 5, 3, 6, 6, 4))
# constructor
expect_equal(dfm(mx), mx)
# selection and grouping
expect_equal(dfm_select(mx), mx)
expect_equal(dfm_select(mx, "a"), mx)
expect_equal(dfm_trim(mx), mx)
expect_equal(ndoc(dfm_sample(mx)), ndoc(mx))
expect_equal(dfm_subset(mx, var > 5), mx[4:5, ])
expect_equal(dfm_compress(mx, "both"), mx)
expect_equal(dfm_compress(mx, "features"), mx)
expect_equal(dfm_compress(mx, "documents"), mx)
expect_equal(dfm_sort(mx, margin = "both"), mx)
expect_equal(dfm_sort(mx, margin = "features"), mx)
expect_equal(dfm_sort(mx, margin = "documents"), mx)
expect_equal(dfm_lookup(mx, dictionary(list(A = "a"))), mx)
expect_equal(dfm_group(mx), mx)
expect_equal(dfm_replace(mx, "A", "a"), mx)
expect_equal(head(mx), mx)
expect_equal(tail(mx), mx)
# weighting
expect_equal(topfeatures(mx), numeric())
expect_equal(dfm_weight(mx, "count"), mx)
expect_equal(dfm_weight(mx, "prop"), mx)
expect_equal(dfm_weight(mx, "propmax"), mx)
expect_equal(dfm_weight(mx, "logcount"), mx)
expect_equal(dfm_weight(mx), mx)
expect_equal(dfm_weight(mx, "augmented"), mx)
expect_equal(dfm_weight(mx, "boolean"), mx)
expect_equal(dfm_weight(mx, "logave"), mx)
expect_equal(dfm_tfidf(mx), mx)
expect_equal(docfreq(mx), numeric())
expect_equal(dfm_smooth(mx), mx)
# transformation
expect_equal(dfm_tolower(mx), mx)
expect_equal(dfm_toupper(mx), mx)
expect_equal(dfm_wordstem(mx), mx)
# binding
expect_equal(ndoc(rbind(mx, mx)), ndoc(mx) * 2)
expect_equal(ndoc(cbind(mx, mx)), ndoc(mx))
expect_output(print(mx),
"Document-feature matrix of: 6 documents, 0 features (0.00% sparse) and 1 docvar.", fixed = TRUE)
})
test_that("dfm raise nicer error message, #1267", {
txt <- c(d1 = "one two three", d2 = "two three four", d3 = "one three four")
mx <- dfm(tokens(txt))
expect_error(mx["d4"], "Subscript out of bounds")
expect_error(mx["d4", ], "Subscript out of bounds")
expect_error(mx[4], "Subscript out of bounds")
expect_error(mx[4, ], "Subscript out of bounds")
expect_error(mx["d4", , TRUE], "Subscript out of bounds")
expect_error(mx[4, , TRUE], "Subscript out of bounds")
expect_error(mx[1:4, , TRUE], "Subscript out of bounds")
expect_error(mx[1:4, , TRUE], "Subscript out of bounds")
expect_error(mx["five"], "Subscript out of bounds")
expect_error(mx[, "five"], "Subscript out of bounds")
expect_error(mx[5], "Subscript out of bounds")
expect_error(mx[, 5], "Subscript out of bounds")
expect_error(mx[, 1:5], "Subscript out of bounds")
expect_error(mx["d4", "five"], "Subscript out of bounds")
expect_error(mx[, "five", TRUE], "Subscript out of bounds")
expect_error(mx[, 5, TRUE], "Subscript out of bounds")
expect_error(mx[, 1:5, TRUE], "Subscript out of bounds")
expect_error(mx["d4", "five", TRUE], "Subscript out of bounds")
expect_error(mx[4, 5], "Subscript out of bounds")
expect_error(mx[4:5], "Subscript out of bounds")
expect_error(mx[1:4, 1:5], "Subscript out of bounds")
expect_error(mx[4, 5, TRUE], "Subscript out of bounds")
expect_error(mx[1:4, 1:5, TRUE], "Subscript out of bounds")
})
test_that("dfm keeps non-existent types, #1278", {
toks <- tokens("a b c")
dict <- dictionary(list(A = "a", B = "b", Z = "z"))
toks_key <- tokens_lookup(toks, dict)
expect_equal(types(toks_key), c("A", "B", "Z"))
expect_equal(featnames(dfm(toks_key, tolower = TRUE)),
c("a", "b", "z"))
expect_equal(featnames(dfm(toks_key, tolower = FALSE)),
c("A", "B", "Z"))
})
test_that("arithmetic/linear operation works with dfm", {
mt <- dfm(tokens(c(d1 = "a a b", d2 = "a b b c", d3 = "c c d")))
expect_true(is.dfm(mt + 2))
expect_true(is.dfm(mt - 2))
expect_true(is.dfm(mt * 2))
expect_true(is.dfm(mt / 2))
expect_true(is.dfm(mt ^ 2))
expect_true(is.dfm(2 + mt))
expect_true(is.dfm(2 - mt))
expect_true(is.dfm(2 * mt))
expect_true(is.dfm(2 / mt))
expect_true(is.dfm(2 ^ mt))
expect_true(is.dfm(t(mt)))
expect_equal(rowSums(mt), colSums(t(mt)))
})
test_that("rbind and cbind wokrs with empty dfm", {
mt <- dfm(tokens(c(d1 = "a a b", d2 = "a b b c", d3 = "c c d")))
expect_identical(docnames(rbind(mt, quanteda:::make_null_dfm())),
docnames(mt))
expect_identical(docnames(mt),
docnames(rbind(mt, quanteda:::make_null_dfm())))
expect_identical(docnames(cbind(mt, quanteda:::make_null_dfm())),
docnames(mt))
expect_identical(docnames(mt),
docnames(cbind(mt, quanteda:::make_null_dfm())))
})
test_that("format_sparsity works correctly", {
expect_error(
quanteda:::format_sparsity(-1),
"The value of x must be between 0 and 1"
)
expect_identical(
quanteda:::format_sparsity(sparsity(as.dfm(Matrix::rsparsematrix(1000, 1000, density = 0.5)))),
"50.00%"
)
expect_identical(
quanteda:::format_sparsity(sparsity(as.dfm(Matrix::rsparsematrix(1000, 1000, density = 0.1)))),
"90.00%"
)
expect_identical(
quanteda:::format_sparsity(sparsity(as.dfm(Matrix::rsparsematrix(1000, 1000, density = 0.99)))),
"1.00%"
)
expect_identical(quanteda:::format_sparsity(1.0), "100.00%")
expect_identical(quanteda:::format_sparsity(0.9999), "99.99%")
expect_identical(quanteda:::format_sparsity(0.99991), ">99.99%")
expect_identical(quanteda:::format_sparsity(0.0001), "0.01%")
expect_identical(quanteda:::format_sparsity(0.00001), "<0.01%")
expect_identical(quanteda:::format_sparsity(0.00011), "0.01%")
expect_identical(quanteda:::format_sparsity(0.0), "0.00%")
expect_identical(quanteda:::format_sparsity(NA), "0.00%")
})
test_that("unused argument warning only happens only once (#1509)", {
expect_warning(
dfm(tokens("some text"), NOTARG = TRUE),
"^NOTARG argument is not used\\.$"
)
expect_warning(
dfm(corpus("some text"), NOTARG = TRUE),
"^NOTARG argument is not used\\.$"
)
expect_warning(
dfm(tokens("some text"), NOTARG = TRUE),
"^NOTARG argument is not used\\.$"
)
expect_warning(
dfm(tokens("some text"), NOTARG = TRUE, NOTARG2 = FALSE),
"^NOTARG, NOTARG2 arguments are not used\\.$"
)
})
test_that("dfm.tokens() with groups works as expected", {
x <- tokens(data_corpus_inaugural)
groupeddfm <- suppressWarnings(dfm(tokens(x),
groups = c("FF", "FF", rep("non-FF", ndoc(x) - 2))))
expect_equal(ndoc(groupeddfm), 2)
expect_equal(docnames(groupeddfm), c("FF", "non-FF"))
expect_equal(featnames(groupeddfm), featnames(dfm(x)))
})
test_that("dimnames are always character vectors", {
mt <- data_dfm_lbgexample
expect_identical(dimnames(mt[, character()]),
list(docs = rownames(mt), features = character()))
expect_identical(dimnames(mt[, FALSE]),
list(docs = rownames(mt), features = character()))
expect_identical(dimnames(mt[character(), ]),
list(docs = character(), features = colnames(mt)))
expect_identical(dimnames(mt[FALSE, ]),
list(docs = character(), features = colnames(mt)))
})
test_that("set_dfm_dimnames etc functions work", {
x <- dfm(tokens(c("a a b b c", "b b b c")))
quanteda:::set_dfm_featnames(x) <- paste0("feature", 1:3)
expect_identical(featnames(x), c("feature1", "feature2", "feature3"))
quanteda:::set_dfm_docnames(x) <- paste0("DOC", 1:2)
expect_identical(docnames(x), c("DOC1", "DOC2"))
quanteda:::set_dfm_dimnames(x) <- list(c("docA", "docB"), LETTERS[1:3])
expect_identical(docnames(x), c("docA", "docB"))
expect_identical(featnames(x), c("A", "B", "C"))
})
test_that("dfm feature and document names have encoding", {
mt <- dfm(tokens(c("文書1" = "あ い い う", "文書2" = "え え え お")))
expect_true(all(Encoding(colnames(mt)) == "UTF-8"))
#expect_true(all(Encoding(rownames(mt)) == "UTF-8")) fix in new corpus
mt1 <- dfm_sort(mt)
expect_true(all(Encoding(colnames(mt1)) == "UTF-8"))
#expect_true(all(Encoding(rownames(mt1)) == "UTF-8")) fix in new corpus
mt2 <- dfm_group(mt, c("文書3", "文書3"))
expect_true(all(Encoding(colnames(mt2)) == "UTF-8"))
#expect_true(all(Encoding(rownames(mt2)) == "UTF-8")) fix in new corpus
mt3 <- dfm_remove(mt, c("あ"))
expect_true(all(Encoding(colnames(mt3)) == "UTF-8"))
#expect_true(all(Encoding(rownames(mt3)) == "UTF-8")) fix in new corpus
mt4 <- dfm_trim(mt, min_termfreq = 2)
expect_true(all(Encoding(colnames(mt4)) == "UTF-8"))
#expect_true(all(Encoding(rownames(mt4)) == "UTF-8")) fix in new corpus
})
test_that("dfm verbose = TRUE works as expected", {
expect_message(
tmp <- suppressWarnings(dfm(data_corpus_inaugural[1:3], verbose = TRUE)),
"Creating a dfm from a corpus input"
)
expect_message(
tmp <- dfm(tokens(data_corpus_inaugural[1:3]), verbose = TRUE),
"Finished constructing a 3 x 1,\\d{3} sparse dfm"
)
dict <- dictionary(list(pos = "good", neg = "bad", neg_pos = "not good", neg_neg = "not bad"))
expect_message(
tmp <- suppressWarnings(dfm(tokens(data_corpus_inaugural[1:3]), dictionary = dict, verbose = TRUE)),
"applying a dictionary consisting of 4 keys"
)
expect_message(
tmp <- suppressWarnings(dfm(dfm(tokens(data_corpus_inaugural[1:3])), dictionary = dict, verbose = TRUE)),
"applying a dictionary consisting of 4 keys"
)
expect_message(
tmp <- suppressWarnings(dfm(tokens(data_corpus_inaugural[1:3]),
groups = data_corpus_inaugural$President[1:3],
verbose = TRUE)),
"grouping texts"
)
expect_message(
tmp <- suppressWarnings(dfm(tokens(data_corpus_inaugural[1:2]), stem = TRUE, verbose = TRUE)),
"stemming types \\(English\\)"
)
expect_message(
tmp <- suppressWarnings(dfm(dfm(tokens(data_corpus_inaugural[1:2])), stem = TRUE, verbose = TRUE)),
"stemming features \\(English\\)"
)
expect_message(
tmp <- suppressWarnings(dfm(dfm(tokens(data_corpus_inaugural[1:3])),
groups = data_corpus_inaugural$President[1:3],
verbose = TRUE)),
"grouping texts"
)
expect_error(
dfm(tokens("one two three"), remove = "one", select = "three"),
"only one of select and remove may be supplied at once"
)
toks <- tokens(c("one two", "two three four"))
attributes(toks)$types[4] <- NA
dfm(toks)
})
test_that("dfm_sort works as expected", {
dfmat <- dfm(tokens(c(d1 = "z z x y a b", d3 = "x y y y c", d2 = "a z")))
expect_identical(
featnames(dfm_sort(dfmat, margin = "features", decreasing = TRUE)),
c("y", "z", "x", "a", "b", "c")
)
expect_identical(
featnames(dfm_sort(dfmat, margin = "features", decreasing = FALSE)),
c("b", "c", "x", "a", "z", "y")
)
expect_identical(
docnames(dfm_sort(dfmat, margin = "documents", decreasing = TRUE)),
c("d1", "d3", "d2")
)
expect_identical(
docnames(dfm_sort(dfmat, margin = "documents", decreasing = FALSE)),
rev(c("d1", "d3", "d2"))
)
})
test_that("test dfm transpose for #1903", {
dfmat <- dfm(tokens(c(d1 = "one two three", d2 = "two two three")))
dfmat_t <- t(dfmat)
expect_equal(
names(dimnames(dfmat_t)),
c("features", "docs")
)
expect_equal(
docnames(dfmat_t),
c("one", "two", "three")
)
expect_equal(
dfmat_t@docvars$docname_,
c("one", "two", "three")
)
expect_equal(
names(dfmat_t@meta),
c("system", "object", "user")
)
})
test_that("dfm deprecations work as expected", {
txt <- c("a a b b c", "a a b c c d d")
corp <- corpus(txt)
toks <- tokens(corp)
dfmat <- dfm(toks)
# deprecated methods
expect_warning(
dfm(txt),
"'dfm.character()' is deprecated. Use 'tokens()' first.",
fixed = TRUE
)
expect_warning(
dfm(corp),
"'dfm.corpus()' is deprecated. Use 'tokens()' first.",
fixed = TRUE
)
# old arguments
expect_warning(
dfm(txt, stem = TRUE),
"'stem' is deprecated; use dfm_wordstem() instead",
fixed = TRUE
)
expect_warning(
dfm(txt, select = "a"),
"'select' is deprecated; use dfm_select() instead",
fixed = TRUE
)
expect_warning(
dfm(txt, remove = "a"),
"'remove' is deprecated; use dfm_remove() instead",
fixed = TRUE
)
expect_warning(
dfm(txt, dictionary = dictionary(list(one = "b"))),
"'dictionary' and 'thesaurus' are deprecated; use dfm_lookup() instead",
fixed = TRUE
)
expect_warning(
dfm(txt, groups = c(1, 1)),
"'groups' is deprecated; use dfm_group() instead",
fixed = TRUE
)
expect_warning(
dfm(txt, remove = "a", valuetype = "regex"),
"valuetype is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(txt, remove = "a", case_insensitive = FALSE),
"case_insensitive is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(corp, stem = TRUE),
"'stem' is deprecated; use dfm_wordstem() instead",
fixed = TRUE
)
expect_warning(
dfm(corp, select = "a"),
"'select' is deprecated; use dfm_select() instead",
fixed = TRUE
)
expect_warning(
dfm(corp, remove = "a"),
"'remove' is deprecated; use dfm_remove() instead",
fixed = TRUE
)
expect_warning(
dfm(corp, dictionary = dictionary(list(one = "b"))),
"'dictionary' and 'thesaurus' are deprecated; use dfm_lookup() instead",
fixed = TRUE
)
expect_warning(
dfm(corp, groups = c(1, 1)),
"'groups' is deprecated; use dfm_group() instead",
fixed = TRUE
)
expect_warning(
dfm(corp, remove = "a", valuetype = "regex"),
"valuetype is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(corp, remove = "a", case_insensitive = FALSE),
"case_insensitive is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(toks, stem = TRUE),
"'stem' is deprecated; use dfm_wordstem() instead",
fixed = TRUE
)
expect_warning(
dfm(toks, select = "a"),
"'select' is deprecated; use dfm_select() instead",
fixed = TRUE
)
expect_warning(
dfm(toks, remove = "a"),
"'remove' is deprecated; use dfm_remove() instead",
fixed = TRUE
)
expect_warning(
dfm(toks, dictionary = dictionary(list(one = "b"))),
"'dictionary' and 'thesaurus' are deprecated; use dfm_lookup() instead",
fixed = TRUE
)
expect_warning(
dfm(toks, groups = c(1, 1)),
"'groups' is deprecated; use dfm_group() instead",
fixed = TRUE
)
expect_warning(
dfm(toks, remove = "a", valuetype = "regex"),
"valuetype is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(toks, remove = "a", case_insensitive = FALSE),
"case_insensitive is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(dfmat, stem = TRUE),
"'stem' is deprecated; use dfm_wordstem() instead",
fixed = TRUE
)
expect_warning(
dfm(dfmat, select = "a"),
"'select' is deprecated; use dfm_select() instead",
fixed = TRUE
)
expect_warning(
dfm(dfmat, remove = "a"),
"'remove' is deprecated; use dfm_remove() instead",
fixed = TRUE
)
expect_warning(
dfm(dfmat, dictionary = dictionary(list(one = "b"))),
"'dictionary' and 'thesaurus' are deprecated; use dfm_lookup() instead",
fixed = TRUE
)
expect_warning(
dfm(dfmat, groups = c(1, 1)),
"'groups' is deprecated; use dfm_group() instead",
fixed = TRUE
)
expect_warning(
dfm(dfmat, remove = "a", valuetype = "regex"),
"valuetype is deprecated in dfm()", fixed = TRUE
)
expect_warning(
dfm(dfmat, remove = "a", case_insensitive = FALSE),
"case_insensitive is deprecated in dfm()", fixed = TRUE
)
})
test_that("valuetype and case_insensitive are still working", {
txt <- c("a a b b c", "A A b C C d d")
corp <- corpus(txt)
toks <- tokens(corp)
dfmat <- dfm(toks, tolower = FALSE)
# for txt
expect_identical(
featnames(suppressWarnings(dfm(txt, tolower = FALSE, remove = "a|c",
valuetype = "regex"))),
c("b", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(txt, tolower = FALSE, remove = "a|c",
valuetype = "regex", case_insensitive = TRUE))),
c("b", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(txt, tolower = FALSE, remove = "a|c",
valuetype = "regex", case_insensitive = FALSE))),
c("b", "A", "C", "d")
)
# for corpus
expect_identical(
featnames(suppressWarnings(dfm(corp, tolower = FALSE, remove = "a|c",
valuetype = "regex"))),
c("b", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(corp, tolower = FALSE, remove = "a|c",
valuetype = "regex", case_insensitive = TRUE))),
c("b", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(corp, tolower = FALSE, remove = "a|c",
valuetype = "regex", case_insensitive = FALSE))),
c("b", "A", "C", "d")
)
# for dfm
expect_identical(
featnames(suppressWarnings(dfm(dfmat, tolower = FALSE, remove = "a|c",
valuetype = "regex"))),
c("b", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(dfmat, tolower = FALSE, remove = "a|c",
valuetype = "regex", case_insensitive = TRUE))),
c("b", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(dfmat, tolower = FALSE, remove = "a|c",
valuetype = "regex", case_insensitive = FALSE))),
c("b", "A", "C", "d")
)
})
test_that("remove_padding argument works", {
txt <- c("a a b b c", "a a b c c d d")
toks <- tokens(txt) %>% tokens_remove("b", padding = TRUE)
dfmat <- dfm(toks)
expect_identical(
featnames(suppressWarnings(dfm(txt, remove_padding = TRUE))),
c("a", "b", "c", "d")
)
expect_identical(
featnames(suppressWarnings(dfm(txt, remove_padding = FALSE))),
c("a", "b", "c", "d")
)
expect_identical(
featnames(dfm(toks, remove_padding = FALSE)),
c("", "a", "c", "d")
)
expect_identical(
featnames(dfm(dfmat, remove_padding = TRUE)),
c("a", "c", "d")
)
expect_identical(
featnames(dfm(dfmat, remove_padding = FALSE)),
c("", "a", "c", "d")
)
})
test_that("features of DFM are always in the same order (#2100)", {
toks1 <- quanteda:::build_tokens(list(c(1, 0, 2, 3, 4)), types = c("a", "b", "c", "d"),
padding = TRUE,
docvars = quanteda:::make_docvars(1L))
toks2 <- quanteda:::build_tokens(list(c(1, 0, 3, 2, 4)), types = c("a", "c", "b", "d"),
padding = TRUE,
docvars = quanteda:::make_docvars(1L))
toks3 <- quanteda:::build_tokens(list(c(1, 2, 3, 4)), types = c("a", "b", "c", "d"),
padding = FALSE,
docvars = quanteda:::make_docvars(1L))
dfmat1 <- dfm(toks1)
dfmat2 <- dfm(toks2)
dfmat3 <- dfm(toks3)
expect_identical(c("", "a", "b", "c", "d"), featnames(dfmat1))
expect_identical(c("", "a", "b", "c", "d"), featnames(dfmat2))
expect_identical(c("a", "b", "c", "d"), featnames(dfmat3))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.