test_that("compare the output feature co-occurrence matrix to that of the text2vec package", {
skip_if_not_installed("text2vec")
library("text2vec")
txt <- "A D A C E A D F E B A C E D"
tokens <- txt |>
tolower() |>
word_tokenizer()
it <- itoken(tokens)
v <- create_vocabulary(it)
vectorizer <- vocab_vectorizer(v)
tcm <- create_tcm(itoken(tokens), vectorizer, skip_grams_window = 3L)
# convert to a symmetric matrix to facilitate the sorting
tcm <- as.matrix(tcm)
ttcm <- tcm
# diag(ttcm) <- 0
tcm <- tcm + t(ttcm)
# sort the matrix according to rowname-colname and convert back to a upper triangle matrix
tcm <- tcm[order(rownames(tcm)), order(colnames(tcm))]
tcm[lower.tri(tcm, diag = FALSE)] <- 0
toks <- tokens(char_tolower(txt), remove_punct = TRUE)
fcm <- fcm(toks, context = "window", count = "weighted", weights = 1 / seq_len(3),
window = 3)
fcm <- fcm_sort(fcm)
expect_equivalent(as.matrix(fcm), tcm, tol = .00001)
})
test_that("fcm works with character and tokens in the same way", {
txt <- "A D A C E A D F E B A C E D"
fcmt_char <- fcm(tokens(txt), context = "window", count = "weighted",
weights = c(3, 2, 1), window = 3)
toks <- tokens(txt)
fcmt_toks <- fcm(toks, context = "window", count = "weighted",
weights = c(3, 2, 1), window = 3)
expect_equivalent(round(as.matrix(fcmt_char), 2),
round(as.matrix(fcmt_toks), 2))
})
test_that("fcm works with dfm and tokens in the same way", {
txt <- c("b a b c", "a a c b e", "a c e f g")
toks <- tokens(txt)
fcmat_toks_doc <- fcm(toks, context = "document")
fcmat_dfm_doc <- fcm(dfm(toks), context = "document")
expect_equal(as.matrix(fcmat_toks_doc), as.matrix(fcmat_dfm_doc))
# same as document context diagonal
fcmat_toks_win_ord <- fcm(toks, context = "window", window = 1000, ordered = TRUE)
expect_equivalent(diag(as.matrix(fcmat_toks_doc)),
diag(as.matrix(fcmat_toks_win_ord)))
})
# Testing weighting function
test_that("not weighted", {
txt <- "A D A C E A D F E B A C E D"
fcmt <- fcm(tokens(txt), context = "window", window = 3)
mat <- matrix(c(4, 1, 4, 4, 5, 2,
0, 0, 1, 1, 2, 1,
0, 0, 0, 3, 3, 0,
0, 0, 0, 0, 4, 1,
0, 0, 0, 0, 0, 2,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
fcmt <- fcm_sort(fcmt)
expect_equivalent(as.matrix(fcmt), mat)
})
test_that("weighted by default", {
txt <- "A D A C E A D F E B A C E D"
fcmt <- fcm(tokens(txt), context = "window", count = "weighted", window = 3)
mat <- matrix(c(1.67, 1, 2.83, 3.33, 2.83, 0.83,
0, 0, 0.5, 0.33, 1.33, 0.50,
0, 0, 0, 1.33, 2.33, 0,
0, 0, 0, 0, 2.33, 1.00,
0, 0, 0, 0, 0, 1.33,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
fcmt <- fcm_sort(fcmt)
expect_equivalent(mat, round(as.matrix(fcmt), 2))
})
test_that("customized weighting function", {
txt <- "A D A C E A D F E B A C E D"
fcmt <- fcm(tokens(txt), context = "window",
count = "weighted", weights = c(3, 2, 1), window = 3)
mat <- matrix(c(6, 3, 9, 10, 10, 3,
0, 0, 2, 1, 4, 2,
0, 0, 0, 5, 7, 0,
0, 0, 0, 0, 8, 3,
0, 0, 0, 0, 0, 4,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
fcmt <- fcm_sort(fcmt)
expect_equivalent(mat, round(as.matrix(fcmt), 2))
})
test_that("ordered setting: window", {
txt <- "A D A C E A D F E B A C E D"
toks <- tokens(txt)
fcmat <- fcm(toks, context = "window", window = 3, ordered = TRUE, tri = FALSE)
fcmat <- fcm_sort(fcmat)
mat <- matrix(c(2, 0, 3, 3, 3, 1,
1, 0, 1, 0, 1, 0,
1, 0, 0, 2, 2, 0,
1, 1, 1, 0, 2, 1,
2, 1, 1, 2, 0, 1,
1, 1, 0, 0, 1, 0),
nrow = 6, ncol = 6, byrow = TRUE)
expect_true(all(round(fcmat, 2) == round(mat, 2)))
expect_true(fcmat@meta$object$ordered)
fcmat_nord <- fcm(toks, context = "window", window = 3, ordered = FALSE, tri = FALSE)
fcmat_nord <- fcm_sort(fcmat_nord)
mat <- matrix(c(4, 1, 4, 4, 5, 2,
1, 0, 1, 1, 2, 1,
4, 1, 0, 3, 3, 0,
4, 1, 3, 0, 4, 1,
5, 2, 3, 4, 0, 2,
2, 1, 0, 1, 2, 0),
nrow = 6, ncol = 6, byrow = TRUE)
expect_true(all(round(fcmat_nord, 2) == round(mat, 2)))
expect_false(fcmat_nord@meta$object$ordered)
})
test_that("ordered setting: boolean", {
txt <- c("b a b c", "a a c b e", "a c e f g")
toks <- tokens(txt)
fcmat1 <- fcm(toks, context = "window", count = "boolean", window = 2,
ordered = TRUE, tri = TRUE)
fcmat1 <- fcm_sort(fcmat1)
mat1 <- matrix(c(1, 2, 3, 1, 0, 0,
1, 1, 1, 1, 0, 0,
0, 1, 0, 2, 1, 0,
0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
expect_equivalent(mat1, as.matrix(fcmat1))
fcmat2 <- fcm(toks, context = "window", count = "boolean", window = 2,
ordered = FALSE, tri = TRUE)
fcmat2 <- fcm_sort(fcmat2)
mat2 <- matrix(c(2, 2, 3, 1, 0, 0,
0, 2, 2, 1, 0, 0,
0, 0, 0, 2, 1, 0,
0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
expect_equivalent(mat2, as.matrix(fcmat2))
fcmat3 <- fcm(dfm(toks), count = "boolean", tri = TRUE)
fcmat3 <- fcm_sort(fcmat3)
mat3 <- matrix(c(1, 2, 3, 2, 1, 1,
0, 1, 2, 1, 0, 0,
0, 0, 0, 2, 1, 1,
0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
expect_equivalent(mat3, as.matrix(fcmat3))
})
test_that("window = 2", {
txt <- c("a a a b b c", "a a c e", "a c e f g")
fcm <- fcm(tokens(txt), context = "window", count = "boolean", window = 2)
mat <- matrix(c(4, 1, 2, 2, 0, 0,
0, 2, 1, 0, 0, 0,
0, 0, 0, 2, 1, 0,
0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
fcm <- fcm_sort(fcm)
expect_equivalent(mat, as.matrix(fcm))
})
test_that("window = 3", {
txt <- c("a a a b b c", "a a c e", "a c e f g")
fcm <- fcm(tokens(txt), context = "window", count = "boolean", window = 3)
fcm <- fcm_sort(fcm)
mat <- matrix(c(4, 1, 3, 2, 1, 0,
0, 2, 1, 0, 0, 0,
0, 0, 0, 2, 1, 1,
0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0),
nrow = 6, ncol = 6, byrow = TRUE)
expect_equivalent(mat, as.matrix(fcm))
})
test_that("fcm.dfm works same as fcm.tokens", {
txt <- c("The quick brown fox jumped over the lazy dog.",
"The dog jumped and ate the fox.")
toks <- tokens(char_tolower(txt), remove_punct = TRUE)
dfmat <- dfm(toks)
expect_equal(fcm(toks, context = "document"),
fcm(dfmat, context = "document"))
fcmat <- fcm(dfm_weight(dfmat, scheme = "boolean"))
expect_equal(as.vector(fcmat[1, ]), c(0, 1, 1, 2, 2, 1, 1, 2, 1, 1))
})
test_that("fcm.dfm only works for context = \"document\"", {
txt <- c("The quick brown fox jumped over the lazy dog.",
"The dog jumped and ate the fox.")
toks <- tokens(char_tolower(txt), remove_punct = TRUE)
expect_error(fcm(dfm(toks), context = "window"),
"fcm.dfm only works on context = \"document\"")
})
test_that("fcm.dfm does works for context = \"document\" with weighted counts", {
txt <- c("The quick brown fox jumped over the lazy dog.",
"The dog jumped and ate the fox.")
toks <- tokens(char_tolower(txt), remove_punct = TRUE)
expect_error(fcm(dfm(toks), context = "document", count = "weighted"),
"Cannot have weighted counts with context = \"document\"")
})
test_that("fcm works as expected for tokens_hashed", {
txt <- c("The quick brown fox jumped over the lazy dog.",
"The dog jumped and ate the fox.")
toks <- tokens(char_tolower(txt), remove_punct = TRUE)
toksh <- tokens(char_tolower(txt), remove_punct = TRUE)
classic <- fcm(toks, context = "window", window = 3)
hashed <- fcm(toksh, context = "window", window = 3)
expect_equivalent(classic, hashed)
})
test_that("fcm print works as expected", {
dfmt <- dfm(tokens(data_corpus_inaugural[1:2],
remove_punct = FALSE, remove_numbers = FALSE, split_hyphens = TRUE))
fcmt <- fcm(dfmt)
expect_output(print(fcmt, max_nfeat = 6, show_summary = TRUE),
paste0("^Feature co-occurrence matrix of: 634 by 634 features\\.",
".*",
"\\[ reached max_feat \\.\\.\\. 628 more features, reached max_nfeat \\.\\.\\. 628 more features \\]$")
)
expect_output(print(fcmt[1:5, 1:5], max_nfeat = 6, show_summary = TRUE),
paste0("^Feature co-occurrence matrix of: 5 by 5 features\\.",
".*",
"fellow\\s+3\\s+6\\s+16\\s+224\\s+361")
)
expect_output(print(fcmt[1:10, 1:2], max_nfeat = 6, show_summary = TRUE),
paste0("^Feature co-occurrence matrix of: 10 by 2 features\\.",
".*",
"\\[ reached max_feat \\.\\.\\. 4 more features \\]$")
)
expect_output(print(fcmt[1:5, 1:5], max_nfeat = -1, show_summary = TRUE),
paste0("^Feature co-occurrence matrix of: 5 by 5 features\\.",
".*",
"the\\s+0\\s+0\\s+0\\s+0\\s+6748$")
)
expect_output(print(fcmt[1:10, 1:2], max_nfeat = -1, show_summary = TRUE),
paste0("^Feature co-occurrence matrix of: 10 by 2 features\\.",
".*",
":\\s+0\\s+0$")
)
expect_output(print(fcmt, max_nfeat = 6, show_summary = FALSE),
paste0("^\\s+features",
".*",
"\\[ reached max_feat \\.\\.\\. 628 more features, reached max_nfeat \\.\\.\\. 628 more features \\]$")
)
})
test_that("fcm expects error for wrong weight or window", {
txt <- c("a a a b b c", "a a c e", "a c e f g")
toks <- tokens(txt)
expect_error(fcm(toks, context = "window", window = 0),
"The value of window must be between 1 and Inf")
expect_error(fcm(toks, context = "window", window = integer()),
"The length of window must be 1")
expect_error(fcm(toks, context = "window", window = 2,
count = "weighted", weights = c(1, 2, 3)),
"The length of weights must be equal to the window size")
expect_error(fcm(toks, context = "window", window = 2,
count = "weighted", weights = c(1, 2, 3)),
"The length of weights must be equal to the window size")
})
test_that("fcm works tokens with paddings, #788", {
txt <- c("The quick brown fox jumped over the lazy dog.",
"The dog jumped and ate the fox.")
toks <- tokens(txt, remove_punct = TRUE)
toks <- tokens_remove(toks, pattern = stopwords(), padding = TRUE)
fcmt <- fcm(toks, context = "window", window = 3)
expect_equal(sort(colnames(fcmt)), sort(attr(toks, "types")))
})
test_that("test empty object is handled properly", {
mat <- quanteda:::make_null_dfm()
expect_equal(dim(fcm(mat)), c(0, 0))
expect_true(is.fcm(fcm(mat)))
toks <- tokens(c("", ""))
expect_equal(dim(fcm(toks)), c(0, 0))
expect_true(is.fcm(fcm(toks)))
})
test_that("arithmetic/linear operation works with dfm", {
mt <- fcm(dfm(tokens(c(d1 = "a a b", d2 = "a b b c", d3 = "c c d"))))
expect_true(is.fcm(mt + 2))
expect_true(is.fcm(mt - 2))
expect_true(is.fcm(mt * 2))
expect_true(is.fcm(mt / 2))
expect_true(is.fcm(mt ^ 2))
expect_true(is.fcm(2 + mt))
expect_true(is.fcm(2 - mt))
expect_true(is.fcm(2 * mt))
expect_true(is.fcm(2 / mt))
expect_true(is.fcm(2 ^ mt))
expect_true(is.fcm(t(mt)))
expect_equal(rowSums(mt), colSums(t(mt)))
})
test_that("ordered is working correctly (#1413)", {
expect_equivalent(
as.matrix(fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE)),
matrix(c(0, 2, 0, 0, 0, 2, 0, 0, 0),
nrow = 3, ncol = 3, byrow = TRUE))
expect_equivalent(
as.matrix(fcm(tokens(c("a b c", "a b c")), "window", window = 2, ordered = TRUE)),
matrix(c(0, 2, 2, 0, 0, 2, 0, 0, 0),
nrow = 3, ncol = 3, byrow = TRUE))
expect_equivalent(
as.matrix(fcm(tokens(c("a b c", "c b a")), "window", window = 1, ordered = TRUE)),
matrix(c(0, 1, 0, 1, 0, 1, 0, 1, 0),
nrow = 3, ncol = 3, byrow = TRUE))
expect_equivalent(
as.matrix(fcm(tokens(c("a b c", "c b a")), "window", window = 2, ordered = TRUE)),
matrix(c(0, 1, 1, 1, 0, 1, 1, 1, 0),
nrow = 3, ncol = 3, byrow = TRUE))
expect_equal(fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE, tri = TRUE),
fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE, tri = FALSE))
})
test_that("dimnames are always character vectors", {
mt <- fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE)
expect_identical(dimnames(mt[, character()]),
list(features = rownames(mt), features = character()))
expect_identical(dimnames(mt[, FALSE]),
list(features = rownames(mt), features = character()))
expect_identical(dimnames(mt[character(), ]),
list(features = character(), features = colnames(mt)))
expect_identical(dimnames(mt[FALSE, ]),
list(features = character(), features = colnames(mt)))
})
test_that("fcm_setnames works", {
x <- fcm(tokens(c("a b c", "a b c")), "window", window = 1)
quanteda:::set_fcm_featnames(x) <- paste0("feature", 1:3)
expect_identical(featnames(x), c("feature1", "feature2", "feature3"))
quanteda:::set_fcm_dimnames(x) <- list(paste0("feature", 1:3), paste0("ALTFEAT", 1:3))
})
test_that("fcm feature names have encoding", {
mt <- fcm(tokens(c("文書1" = "あ い い う", "文書2" = "え え え お")))
expect_true(all(Encoding(colnames(mt)) == "UTF-8"))
expect_true(all(Encoding(rownames(mt)) == "UTF-8"))
mt1 <- fcm_sort(mt)
expect_true(all(Encoding(colnames(mt1)) == "UTF-8"))
expect_true(all(Encoding(rownames(mt1)) == "UTF-8"))
mt2 <- fcm_remove(mt, c("あ"))
expect_true(all(Encoding(colnames(mt2)) == "UTF-8"))
expect_true(all(Encoding(rownames(mt2)) == "UTF-8"))
})
test_that("fcm raise nicer error message, #1267", {
txt <- c(d1 = "one two three", d2 = "two three four", d3 = "one three four")
mx <- fcm(dfm(tokens(txt)))
expect_silent(mx[])
expect_error(mx["five"], "Subscript out of bounds")
expect_error(mx[, "five"], "Subscript out of bounds")
expect_error(mx[5], "Subscript out of bounds")
expect_error(mx[, 5], "Subscript out of bounds")
expect_error(mx[, 1:5], "Subscript out of bounds")
expect_error(mx["d4", "five"], "Subscript out of bounds")
expect_error(mx[, "five", TRUE], "Subscript out of bounds")
expect_error(mx[, 5, TRUE], "Subscript out of bounds")
expect_error(mx[, 1:5, TRUE], "Subscript out of bounds")
expect_error(mx["d4", "five", TRUE], "Subscript out of bounds")
expect_error(mx[4, 5], "Subscript out of bounds")
expect_error(mx[4:5], "Subscript out of bounds")
expect_error(mx[1:4, 1:5], "Subscript out of bounds")
expect_error(mx[4, 5, TRUE], "Subscript out of bounds")
expect_error(mx[1:4, 1:5, TRUE], "Subscript out of bounds")
})
test_that("margin is correct (#2176)" , {
txt <- "A D a C e A D f E b A C E D"
toks <- tokens(txt)
fcmt <- fcm(toks)
expect_identical(
fcmt@meta$object$margin,
featfreq(dfm(toks, tolower = FALSE))
)
expect_identical(
fcm(dfm(toks, tolower = TRUE))@meta$object$margin,
featfreq(dfm(toks, tolower = TRUE))
)
expect_identical(
fcm(dfm(toks, tolower = FALSE))@meta$object$margin,
featfreq(dfm(toks, tolower = FALSE))
)
toks2 <- tokens_tolower(toks)
fcmt2 <- fcm(toks2)
expect_identical(
fcmt2@meta$object$margin,
featfreq(dfm(toks2, tolower = FALSE))
)
})
test_that("unused arguments works for fcm (#2103)", {
toks <- tokens(c("a b c d e", "a a b n"))
expect_warning(
fcm(toks, notanargument = TRUE),
"notanargument argument is not used"
)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.