Nothing
test_that("test dfm_group", {
dfmt <- dfm(tokens(c("a b c c", "b c d", "a")))
expect_equivalent(
as.matrix(dfm_group(dfmt, c("doc1", "doc1", "doc2"))),
matrix(c(1, 1, 2, 0, 3, 0, 1, 0), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("a", "b", "c", "d")))
)
expect_equivalent(
as.matrix(dfm_group(dfmt, c(1, 1, 2))),
matrix(c(1, 1, 2, 0, 3, 0, 1, 0), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("a", "b", "c", "d")))
)
})
test_that("dfm_group works with empty documents", {
dfmt <- dfm(tokens(c("a b c c", "b c d", "")))
expect_equivalent(
as.matrix(dfm_group(dfmt, c("doc1", "doc1", "doc2"))),
matrix(c(1, 0, 2, 0, 3, 0, 1, 0), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("a", "b", "c", "d")))
)
expect_equivalent(
as.matrix(dfm_group(dfmt, c(1, 1, 2))),
matrix(c(1, 0, 2, 0, 3, 0, 1, 0), nrow = 2,
dimnames = list(c("doc1", "doc2"), c("a", "b", "c", "d")))
)
expect_equivalent(
as.matrix(dfm_group(dfm_remove(dfmt, "*"), c(1, 1, 2))),
matrix(numeric(), nrow = 2,
dimnames = list(c("doc1", "doc2"), NULL))
)
})
test_that("dfm.character groups works (#794)", {
grp <- c(1, 1, 2)
txt <- c(d1 = "one two three", d2 = "two three four", d3 = "one three four")
corp <- corpus(txt)
toks <- tokens(corp)
expect_equal(
suppressWarnings(dfm(txt, groups = grp)),
suppressWarnings(dfm(toks, groups = grp))
)
expect_equal(
suppressWarnings(dfm(txt, groups = grp)),
suppressWarnings(dfm(tokens(corp), groups = grp))
)
})
test_that("test dfm_group with factor levels, fill = TRUE and FALSE, #854", {
corp <- corpus(c("a b c c", "b c d", "a"),
docvars = data.frame(grp = factor(c("A", "A", "B"), levels = LETTERS[1:4])))
dfmt <- dfm(tokens(corp))
dfmt1 <- dfm_group(dfmt, groups = grp, fill = FALSE)
expect_equal(
as.matrix(dfmt1),
matrix(c(1,2,3,1, 1,0,0,0), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("A", "B"), features = letters[1:4]))
)
expect_equal(
docvars(dfmt1, "grp"),
factor(c("A", "B"))
)
dfmt2 <- dfm_group(dfmt, groups = grp, fill = TRUE)
expect_equal(
as.matrix(dfmt2),
matrix(c(1,2,3,1, 1,0,0,0, 0,0,0,0, 0,0,0,0), byrow = TRUE, nrow = 4 ,
dimnames = list(docs = c("A", "B", "C", "D"), features = letters[1:4]))
)
expect_equal(
docvars(dfmt2, "grp"),
factor(c("A", "B", "C", "D"))
)
})
test_that("test dfm_group with factor levels, fill = TRUE and FALSE", {
dfmt <- dfm(tokens(c("a b c c", "b c d", "a")))
grpvar <- factor(c("text1", "text1", "text2"),
levels = paste0("text", 0:3))
expect_equal(
as.matrix(dfm_group(dfmt, groups = grpvar, fill = FALSE)),
matrix(c(1,2,3,1, 1,0,0,0), byrow = TRUE, nrow = 2,
dimnames = list(docs = c("text1", "text2"), features = letters[1:4]))
)
expect_equal(
as.matrix(dfm_group(dfmt, groups = grpvar, fill = TRUE)),
matrix(c(0,0,0,0, 1,2,3,1, 1,0,0,0, 0,0,0,0), byrow = TRUE, nrow = 4,
dimnames = list(docs = paste0("text", 0:3), features = letters[1:4]))
)
# new documents in factor order
expect_equal(
as.matrix(dfm_group(dfmt, groups = factor(c(1, 1, 2), levels = 4:1), fill = TRUE)),
matrix(c(rep(0, 8), 1,0,0,0, 1,2,3,1), byrow = TRUE, nrow = 4,
dimnames = list(docs = 4:1, features = letters[1:4]))
)
# should this also be ordered? (here the expectation is that it is)
expect_equal(
as.matrix(dfm_group(dfmt, groups = factor(c(3, 3, 1), levels = 4:1), fill = FALSE)),
matrix(c(1,2,3,1, 1,0,0,0), byrow = TRUE, nrow = 2,
dimnames = list(docs = c(3, 1), features = letters[1:4]))
)
})
test_that("test dfm_group with non-factor grouping variable, with fill", {
grpvar <- c("D", "D", "A", "C")
corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
docvars = data.frame(grp = grpvar, stringsAsFactors = FALSE))
dfmt <- dfm(tokens(corp))
expect_equal(
as.matrix(dfm_group(dfmt, groups = grp, fill = FALSE)),
matrix(c(1,0,0,0, 0,1,0,2, 1,2,3,1), byrow = TRUE, nrow = 3,
dimnames = list(docs = c("A", "C", "D"), features = letters[1:4]))
)
expect_equal(
dfm_group(dfmt, groups = grpvar, fill = FALSE),
dfm_group(dfmt, groups = grpvar, fill = TRUE)
)
expect_equal(
dfm_group(dfmt, groups = grp, fill = FALSE),
dfm_group(dfmt, groups = grp, fill = TRUE)
)
expect_equal(
as.matrix(dfm_group(dfmt, groups = grp, fill = FALSE)),
matrix(c(1,0,0,0, 0,1,0,2, 1,2,3,1), byrow = TRUE, nrow = 3,
dimnames = list(docs = c("A", "C", "D"), features = letters[1:4]))
)
expect_equal(
dfm_group(dfmt, groups = grpvar, fill = FALSE),
dfm_group(dfmt, groups = grp, fill = FALSE)
)
})
test_that("test dfm_group with wrongly dimensioned groups variables", {
grpvar <- c("D", "D", "A", "C")
corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
docvars = data.frame(grp = grpvar, stringsAsFactors = FALSE))
dfmt <- dfm(tokens(corp))
expect_error(
dfm_group(dfmt, groups = c(1, 1, 2, 3, 3), fill = FALSE),
"groups must have length ndoc(x)", fixed = TRUE
)
expect_error(
dfm_group(dfmt, groups = c(1, 1, 2, 3, 3), fill = TRUE),
"groups must have length ndoc(x)", fixed = TRUE
)
expect_error(
dfm_group(dfmt, groups = c(1, 1, 2, 3, 4), fill = TRUE),
"groups must have length ndoc(x)", fixed = TRUE
)
})
test_that("test dfm_group keeps group-level variables", {
corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
docvars = data.frame(grp = c("D", "D", "A", "C"),
var1 = c(1, 1, 2, 2),
var2 = c(1, 2, 2, 3),
var3 = c("x", "x", "y", NA),
var4 = c("x", "y", "y", "x"),
var5 = as.Date(c("2018-01-01", "2018-01-01", "2015-03-01", "2012-12-15")),
var6 = as.Date(c("2018-01-01", "2015-03-01", "2015-03-01", "2012-12-15")),
stringsAsFactors = FALSE))
dfmt <- dfm(tokens(corp))
grp1 <- c("D", "D", "A", "C")
expect_equal(
dfm_group(dfmt, grp1)@docvars,
data.frame("docname_" = c("A", "C", "D"),
"docid_" = factor(c("A", "C", "D"),
levels = c("A", "C", "D")),
"segid_" = c(1L, 1L, 1L),
grp = c("A", "C", "D"),
var1 = c(2, 2, 1),
var3 = c("y", NA, "x"),
var5 = as.Date(c("2015-03-01", "2012-12-15", "2018-01-01")),
stringsAsFactors = FALSE)
)
grp2 <- factor(c("D", "D", "A", "C"), levels = c("A", "B", "C", "D"))
expect_equal(
dfm_group(dfmt, grp2, fill = TRUE)@docvars,
data.frame("docname_" = c("A", "B", "C", "D"),
"docid_" = factor(c("A", "B", "C", "D"),
levels = c("A", "B", "C", "D")),
"segid_" = c(1L, 1L, 1L, 1L),
grp = c("A", NA, "C", "D"),
var1 = c(2, NA, 2, 1),
var3 = c("y", NA, NA, "x"),
var5 = as.Date(c("2015-03-01", NA, "2012-12-15", "2018-01-01")),
stringsAsFactors = FALSE)
)
grp3 <- c(NA, NA, "A", "C")
expect_equal(
dfm_group(dfmt, grp3)@docvars,
data.frame("docname_" = c("A", "C"),
"docid_" = factor(c("A", "C"),
levels = c("A", "C")),
"segid_" = c(1L, 1L),
grp = c("A", "C"),
var1 = c(2, 2),
var2 = c(2, 3),
var3 = c("y", NA),
var4 = c("y", "x"),
var5 = as.Date(c("2015-03-01", "2012-12-15")),
var6 = as.Date(c("2015-03-01", "2012-12-15")),
stringsAsFactors = FALSE)
)
})
test_that("is_grouped is working", {
expect_false(quanteda:::is_grouped(c(1, 2, 3, 4),
c(1L, 1L, 2L, 2L)))
expect_false(quanteda:::is_grouped(c(1, 2, 2, 2),
c(1L, 1L, 2L, 2L)))
expect_false(quanteda:::is_grouped(as.factor(c(1, 2, 2, 2)),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(numeric(),
integer()))
expect_true(quanteda:::is_grouped(c(1, 1, 2, 2),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(c(0, 0, 1, 1),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(c(0, 0, 0, 0),
c(1L, 1L, 2L, 2L)))
expect_false(quanteda:::is_grouped(c("a", "b", "c", "d"),
c(1L, 1L, 2L, 2L)))
expect_false(quanteda:::is_grouped(c("a", "b", "b", "b"),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(c("a", "a", "b", "b"),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(character(),
integer()))
expect_true(quanteda:::is_grouped(c("a", "a", "a", "a"),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(c("", "", "b", "b"),
c(1L, 1L, 2L, 2L)))
expect_true(quanteda:::is_grouped(c("", "", "", ""),
c(1L, 1L, 2L, 2L)))
})
test_that("dfm_group resets weighting scheme to count (#1545)", {
mt1 <- dfm_weight(dfm(tokens(c("a b c c", "b c d", "a"))), "boolean")
expect_equal(mt1@meta$object$weight_tf$scheme, "boolean")
mt2 <- dfm_group(mt1, c("doc1", "doc1", "doc2"))
expect_equal(mt2@meta$object$weight_tf$scheme, "boolean")
mt3 <- dfm_weight(mt2, "logcount", force = TRUE)
expect_equal(mt3@meta$object$weight_tf$scheme, "logcount")
})
test_that("force argument works as expected (#1545)", {
corp <- corpus(c("He went out to buy a car",
"He went out and bought pickles and onions",
"He stayed home instead."),
docvars = data.frame(grp = c(1, 1, 2)))
dfmat <- dfm(tokens(corp))
dfmat_tfprop <- dfm_weight(dfmat, "prop")
dfmat_tfidf <- dfm_tfidf(dfmat)
expect_is(dfm_group(dfmat_tfprop, groups = grp, force = FALSE), "dfm")
expect_is(dfm_group(dfmat_tfprop, groups = grp, force = TRUE), "dfm")
expect_error(
dfm_group(dfm_group(dfmat_tfidf, groups = grp, force = FALSE)),
"will not group a weighted dfm; use force = TRUE to override"
)
expect_is(dfm_group(dfmat_tfidf, groups = grp, force = TRUE), "dfm")
})
test_that("group_docvar drops list column (#1553)", {
data <- data.frame("docname_" = c("A", "B", "C", "D"),
"docid_" = factor(c("text1", "text2", "text2", "text3")),
"segid_" = c(1L, 1L, 1L, 1L),
vec1 = c(1, 3, 3, 6),
vec2 = c("a", "b", "b", "c"),
stringsAsFactors = FALSE)
data$lis <- list(1:3, -5, 3:4, 1)
expect_equal(quanteda:::group_docvars(data, factor(c(1, 2, 2, 3))),
data.frame("docname_" = c("1", "2", "3"),
"docid_" = factor(c("1", "2", "3")),
"segid_" = c(1L, 1L, 1L),
vec1 = c(1, 3, 6),
vec2 = c("a", "b", "c"),
stringsAsFactors = FALSE))
corp <- corpus(c("a a c d", "s i k e", "k a i e", "z o p"),
docvars = data.frame(vec1 = c(1, 3, 3, 6),
vec2 = c("a", "b", "b", "c"),
stringsAsFactors = FALSE))
mt <- dfm(tokens(corp))
expect_equal(docvars(dfm_group(mt, c(1, 2, 2, 3))),
data.frame(data.frame(vec1 = c(1, 3, 6),
vec2 = c("a", "b", "c"),
stringsAsFactors = FALSE)))
})
test_that("restore original unit when groups = NULL", {
corp <- corpus(data_corpus_inaugural) # normalize hyphens
corp <- head(corp, 2)
corp_sent <- corpus_reshape(corp)
dfmt_sent <- dfm(tokens(corp_sent))
dfmt <- dfm_group(dfmt_sent)
expect_equal(ndoc(corp), ndoc(dfmt))
expect_equal(ndoc(corp_sent), ndoc(dfmt_sent))
expect_equal(as.matrix(dfmt), as.matrix(dfm(tokens(corp))))
})
test_that("dfm_group works with NA group labels", {
corp <- corpus(c("Doc 1", "Doc 1b", "Doc2", "Doc 3 with NA", "Doc 4, more NA"),
docvars = data.frame(factorvar = c("Yes", "Yes", "No", NA, NA)))
expect_identical(
suppressWarnings(dfm(tokens(corp), groups = corp$factorvar)),
suppressWarnings(dfm(tokens(corp[1:3]), groups = corp$factorvar[1:3]))
)
# expect_identical(
# dfm(tokens(corp)) %>% dfm_group(groups = "factorvar"),
# dfm(tokens(corp[1:3])) %>% dfm_group(groups = "factorvar")
# )
})
test_that("displayed matrix rownames are correct after dfm_group (#1949)", {
expect_identical(
docnames(suppressWarnings(dfm(tokens(letters[1:3]), groups = c("x", "x", "y")))),
c("x", "y")
)
expect_output(
print(suppressWarnings(dfm(tokens(letters[1:3]), groups = c("x", "x", "y")))),
paste0("Document-feature matrix of: 2 documents, 3 features (50.00% sparse) and 0 docvars.\n",
" features\n",
"docs a b c\n",
" x 1 1 0\n",
" y 0 0 1", collapse = ""), fixed = TRUE
)
})
test_that("dfm_group save grouping variable (#2037)", {
corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
docvars = data.frame(grp = factor(c("D", "D", "A", "C"), levels = c("A", "B", "C", "D")),
var1 = c(1, 1, 2, 2),
var2 = c(1, 1, 2, 2),
var3 = c("x", "x", "y", NA),
var4 = c("x", "y", "y", "x"),
var5 = as.Date(c("2018-01-01", "2018-01-01", "2015-03-01", "2012-12-15")),
var6 = as.Date(c("2018-01-01", "2015-03-01", "2015-03-01", "2012-12-15")),
stringsAsFactors = FALSE))
dfmat <- dfm(tokens(corp))
grpvar <- factor(c("E", "E", "F", "G"), levels = c("E", "F", "G", "H"))
dfmat_grp1 <- dfm_group(dfmat, grp)
dfmat_grp2 <- dfm_group(dfmat, grpvar)
dfmat_grp3 <- dfm_group(dfmat, var1)
dfmat_grp4 <- dfm_group(dfmat, grp, fill = TRUE)
dfmat_grp5 <- dfm_group(dfmat, grpvar, fill = TRUE)
dfmat_grp6 <- dfm_group(dfmat, var1, fill = TRUE)
expect_equal(
docvars(dfmat_grp1, "grp"),
factor(c("A", "C", "D"), levels = c("A", "C", "D"))
)
expect_equal(docvars(dfmat_grp1)$var1, c(2, 2, 1))
expect_null(docvars(dfmat_grp2)$grpvar)
expect_equal(docvars(dfmat_grp2)$var1, c(1, 2, 2))
expect_equal(docvars(dfmat_grp3)$var1, c(1, 2))
expect_equal(
docvars(dfmat_grp4, "grp"),
factor(c("A", "B", "C", "D"), levels = c("A", "B", "C", "D"))
)
expect_equal(docvars(dfmat_grp4)$var1, c(2, NA, 2, 1))
expect_null(docvars(dfmat_grp5)$grpvar)
expect_equal(docvars(dfmat_grp5)$var1, c(1, 2, 2, NA))
expect_equal(docvars(dfmat_grp6)$var1, c(1, 2))
})
test_that("tokens_group drop document for NA", {
corp <- corpus(c("a b c c", "b c d", "a", "b d d"),
docvars = data.frame(grp = factor(c(NA, NA, "A", "C"), levels = c("A", "B", "C", "D")),
var1 = c(1, 1, 2, 2),
var2 = c("x", "x", "y", NA),
stringsAsFactors = FALSE))
toks <- tokens(corp)
dfmat <- dfm(toks)
expect_equal(dfm_group(dfmat, grp)@docvars,
data.frame(docname_ = c("A", "C"),
docid_ = factor(c("A", "C"), levels = c("A", "C")),
segid_ = c(1L, 1L),
grp = factor(c("A", "C"), levels = c("A", "C")),
var1 = c(2, 2),
var2 = c("y", NA),
stringsAsFactors = FALSE))
expect_equal(dfm_group(dfmat, grp, fill = TRUE)@docvars,
data.frame(docname_ = c("A", "B", "C", "D"),
docid_ = factor(c("A", "B", "C", "D"), levels = c("A", "B", "C", "D")),
segid_ = c(1L, 1L, 1L, 1L),
grp = factor(c("A", "B", "C", "D"), levels = c("A", "B", "C", "D")),
var1 = c(2, NA, 2, NA),
var2 = c("y", NA, NA, NA),
stringsAsFactors = FALSE))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.