tests/testthat/test-fcm.R

test_that("compare the output feature co-occurrence matrix to that of the text2vec package", {
    skip_if_not_installed("text2vec")
    library("text2vec")

    txt <- "A D A C E A D F E B A C E D"
    tokens <- txt |> 
        tolower() |> 
        word_tokenizer()
    it <- itoken(tokens)
    v <- create_vocabulary(it)
    vectorizer <- vocab_vectorizer(v)
    tcm <- create_tcm(itoken(tokens), vectorizer, skip_grams_window = 3L)

    # convert to a symmetric matrix to facilitate the sorting
    tcm <- as.matrix(tcm)
    ttcm <- tcm
    # diag(ttcm) <- 0
    tcm <- tcm + t(ttcm)

    # sort the matrix according to rowname-colname and convert back to a upper triangle matrix
    tcm <- tcm[order(rownames(tcm)), order(colnames(tcm))]
    tcm[lower.tri(tcm, diag = FALSE)] <- 0

    toks <- tokens(char_tolower(txt), remove_punct = TRUE)
    fcm <- fcm(toks, context = "window", count = "weighted", weights = 1 / seq_len(3),
               window = 3)
    fcm <- fcm_sort(fcm)

    expect_equivalent(as.matrix(fcm), tcm, tol = .00001)
})

test_that("fcm works with character and tokens in the same way", {
    txt <- "A D A C E A D F E B A C E D"
    fcmt_char <- fcm(tokens(txt), context = "window", count = "weighted",
                     weights = c(3, 2, 1), window = 3)
    toks <- tokens(txt)
    fcmt_toks <- fcm(toks, context = "window", count = "weighted",
                    weights = c(3, 2, 1), window = 3)
    expect_equivalent(round(as.matrix(fcmt_char), 2),
                      round(as.matrix(fcmt_toks), 2))
})

test_that("fcm works with dfm and tokens in the same way", {
    txt <- c("b a b c", "a a c b e", "a c e f g")
    toks <- tokens(txt)
    
    fcmat_toks_doc <- fcm(toks, context = "document")
    fcmat_dfm_doc <- fcm(dfm(toks), context = "document")
    expect_equal(as.matrix(fcmat_toks_doc), as.matrix(fcmat_dfm_doc))

    # same as document context diagonal
    fcmat_toks_win_ord <- fcm(toks, context = "window", window = 1000, ordered = TRUE)
    expect_equivalent(diag(as.matrix(fcmat_toks_doc)), 
                       diag(as.matrix(fcmat_toks_win_ord)))
})

# Testing weighting function

test_that("not weighted", {
    txt <- "A D A C E A D F E B A C E D"
    fcmt <- fcm(tokens(txt), context = "window", window = 3)

    mat <- matrix(c(4, 1, 4, 4, 5, 2,
                     0, 0, 1, 1, 2, 1,
                     0, 0, 0, 3, 3, 0,
                     0, 0, 0, 0, 4, 1,
                     0, 0, 0, 0, 0, 2,
                     0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    fcmt <- fcm_sort(fcmt)
    expect_equivalent(as.matrix(fcmt), mat)
})

test_that("weighted by default", {
    txt <- "A D A C E A D F E B A C E D"
    fcmt <- fcm(tokens(txt), context = "window", count = "weighted", window = 3)

    mat <- matrix(c(1.67, 1, 2.83, 3.33, 2.83, 0.83,
                     0, 0, 0.5, 0.33, 1.33, 0.50,
                     0, 0, 0, 1.33, 2.33, 0,
                     0, 0, 0, 0, 2.33, 1.00,
                     0, 0, 0, 0, 0, 1.33,
                     0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    fcmt <- fcm_sort(fcmt)
    expect_equivalent(mat, round(as.matrix(fcmt), 2))
})

test_that("customized weighting function", {
    txt <- "A D A C E A D F E B A C E D"
    fcmt <- fcm(tokens(txt), context = "window", 
                count = "weighted", weights = c(3, 2, 1), window = 3)

    mat <- matrix(c(6, 3, 9, 10, 10, 3,
                     0, 0, 2, 1, 4, 2,
                     0, 0, 0, 5, 7, 0,
                     0, 0, 0, 0, 8, 3,
                     0, 0, 0, 0, 0, 4,
                     0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    fcmt <- fcm_sort(fcmt)
    expect_equivalent(mat, round(as.matrix(fcmt), 2))
})

test_that("ordered setting: window", {
    txt <- "A D A C E A D F E B A C E D"
    toks <- tokens(txt)
    fcmat <- fcm(toks, context = "window", window = 3, ordered = TRUE, tri = FALSE)
    fcmat <- fcm_sort(fcmat)
    mat <- matrix(c(2, 0, 3, 3, 3, 1,
                     1, 0, 1, 0, 1, 0,
                     1, 0, 0, 2, 2, 0,
                     1, 1, 1, 0, 2, 1,
                     2, 1, 1, 2, 0, 1,
                     1, 1, 0, 0, 1, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    expect_true(all(round(fcmat, 2) == round(mat, 2)))
    expect_true(fcmat@meta$object$ordered)

    fcmat_nord <- fcm(toks, context = "window", window = 3, ordered = FALSE, tri = FALSE)
    fcmat_nord <- fcm_sort(fcmat_nord)
    mat <- matrix(c(4, 1, 4, 4, 5, 2,
                     1, 0, 1, 1, 2, 1,
                     4, 1, 0, 3, 3, 0,
                     4, 1, 3, 0, 4, 1,
                     5, 2, 3, 4, 0, 2,
                     2, 1, 0, 1, 2, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    expect_true(all(round(fcmat_nord, 2) == round(mat, 2)))
    expect_false(fcmat_nord@meta$object$ordered)
})

test_that("ordered setting: boolean", {
    txt <- c("b a b c", "a a c b e", "a c e f g")
    toks <- tokens(txt)
    fcmat1 <- fcm(toks, context = "window", count = "boolean", window = 2,
               ordered = TRUE, tri = TRUE)
    fcmat1 <- fcm_sort(fcmat1)
    mat1 <- matrix(c(1, 2, 3, 1, 0, 0,
                     1, 1, 1, 1, 0, 0,
                     0, 1, 0, 2, 1, 0,
                     0, 0, 0, 0, 1, 1,
                     0, 0, 0, 0, 0, 1,
                     0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    expect_equivalent(mat1, as.matrix(fcmat1))

    fcmat2 <- fcm(toks, context = "window", count = "boolean", window = 2,
               ordered = FALSE, tri = TRUE)
    fcmat2 <- fcm_sort(fcmat2)
    mat2 <- matrix(c(2, 2, 3, 1, 0, 0,
                     0, 2, 2, 1, 0, 0,
                     0, 0, 0, 2, 1, 0,
                     0, 0, 0, 0, 1, 1,
                     0, 0, 0, 0, 0, 1,
                     0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    expect_equivalent(mat2, as.matrix(fcmat2))
    
    fcmat3 <- fcm(dfm(toks), count = "boolean", tri = TRUE)
    fcmat3 <- fcm_sort(fcmat3)
    mat3 <- matrix(c(1, 2, 3, 2, 1, 1,
                    0, 1, 2, 1, 0, 0,
                    0, 0, 0, 2, 1, 1,
                    0, 0, 0, 0, 1, 1,
                    0, 0, 0, 0, 0, 1,
                    0, 0, 0, 0, 0, 0),
                  nrow = 6, ncol = 6, byrow = TRUE)
    expect_equivalent(mat3, as.matrix(fcmat3))
})

test_that("window = 2", {
    txt <- c("a a a b b c", "a a c e", "a c e f g")
    fcm <- fcm(tokens(txt), context = "window", count = "boolean", window = 2)
    mat <- matrix(c(4, 1, 2, 2, 0, 0,
                   0, 2, 1, 0, 0, 0,
                   0, 0, 0, 2, 1, 0,
                   0, 0, 0, 0, 1, 1,
                   0, 0, 0, 0, 0, 1,
                   0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    fcm <- fcm_sort(fcm)
    expect_equivalent(mat, as.matrix(fcm))
})

test_that("window = 3", {
    txt <- c("a a a b b c", "a a c e", "a c e f g")
    fcm <- fcm(tokens(txt), context = "window", count = "boolean", window = 3)
    fcm <- fcm_sort(fcm)
    mat <- matrix(c(4, 1, 3, 2, 1, 0,
                    0, 2, 1, 0, 0, 0,
                    0, 0, 0, 2, 1, 1,
                    0, 0, 0, 0, 1, 1,
                    0, 0, 0, 0, 0, 1,
                    0, 0, 0, 0, 0, 0),
                   nrow = 6, ncol = 6, byrow = TRUE)
    expect_equivalent(mat, as.matrix(fcm))
})

test_that("fcm.dfm works same as fcm.tokens", {
    txt <- c("The quick brown fox jumped over the lazy dog.",
             "The dog jumped and ate the fox.")
    toks <- tokens(char_tolower(txt), remove_punct = TRUE)
    dfmat <- dfm(toks)
    expect_equal(fcm(toks, context = "document"),
                 fcm(dfmat, context = "document"))

    fcmat <- fcm(dfm_weight(dfmat, scheme = "boolean"))
    expect_equal(as.vector(fcmat[1, ]), c(0, 1, 1, 2, 2, 1, 1, 2, 1, 1))
})

test_that("fcm.dfm only works for context = \"document\"", {
    txt <- c("The quick brown fox jumped over the lazy dog.",
             "The dog jumped and ate the fox.")
    toks <- tokens(char_tolower(txt), remove_punct = TRUE)
    expect_error(fcm(dfm(toks), context = "window"),
                 "fcm.dfm only works on context = \"document\"")
})

test_that("fcm.dfm does works for context = \"document\" with weighted counts", {
    txt <- c("The quick brown fox jumped over the lazy dog.",
             "The dog jumped and ate the fox.")
    toks <- tokens(char_tolower(txt), remove_punct = TRUE)
    expect_error(fcm(dfm(toks), context = "document", count = "weighted"),
                 "Cannot have weighted counts with context = \"document\"")
})

test_that("fcm works as expected for tokens_hashed", {
    txt <- c("The quick brown fox jumped over the lazy dog.",
             "The dog jumped and ate the fox.")
    toks <- tokens(char_tolower(txt), remove_punct = TRUE)
    toksh <- tokens(char_tolower(txt), remove_punct = TRUE)
    classic <- fcm(toks, context = "window", window = 3)
    hashed <- fcm(toksh, context = "window", window = 3)
    expect_equivalent(classic, hashed)
})

test_that("fcm print works as expected", {
    dfmt <- dfm(tokens(data_corpus_inaugural[1:2],
                remove_punct = FALSE, remove_numbers = FALSE, split_hyphens = TRUE))
    fcmt <- fcm(dfmt)
    expect_output(print(fcmt, max_nfeat = 6, show_summary = TRUE),
                  paste0("^Feature co-occurrence matrix of: 634 by 634 features\\.",
                         ".*",
                         "\\[ reached max_feat \\.\\.\\. 628 more features, reached max_nfeat \\.\\.\\. 628 more features \\]$")
    )
    expect_output(print(fcmt[1:5, 1:5], max_nfeat = 6, show_summary = TRUE),
                  paste0("^Feature co-occurrence matrix of: 5 by 5 features\\.",
                         ".*",
                         "fellow\\s+3\\s+6\\s+16\\s+224\\s+361")
    )
    expect_output(print(fcmt[1:10, 1:2], max_nfeat = 6, show_summary = TRUE),
                  paste0("^Feature co-occurrence matrix of: 10 by 2 features\\.",
                         ".*",
                         "\\[ reached max_feat \\.\\.\\. 4 more features \\]$")
    )
    expect_output(print(fcmt[1:5, 1:5], max_nfeat = -1, show_summary = TRUE),
                  paste0("^Feature co-occurrence matrix of: 5 by 5 features\\.",
                         ".*",
                         "the\\s+0\\s+0\\s+0\\s+0\\s+6748$")
    )
    expect_output(print(fcmt[1:10, 1:2], max_nfeat = -1, show_summary = TRUE),
                  paste0("^Feature co-occurrence matrix of: 10 by 2 features\\.",
                         ".*",
                         ":\\s+0\\s+0$")
    )
    expect_output(print(fcmt, max_nfeat = 6, show_summary = FALSE),
                  paste0("^\\s+features",
                         ".*",
                         "\\[ reached max_feat \\.\\.\\. 628 more features, reached max_nfeat \\.\\.\\. 628 more features \\]$")
    )
})

test_that("fcm expects error for wrong weight or window", {
    txt <- c("a a a b b c", "a a c e", "a c e f g")
    toks <- tokens(txt)
    expect_error(fcm(toks, context = "window", window = 0),
                 "The value of window must be between 1 and Inf")
    expect_error(fcm(toks, context = "window", window = integer()),
                 "The length of window must be 1")
    expect_error(fcm(toks, context = "window", window = 2,
                     count = "weighted", weights = c(1, 2, 3)),
                 "The length of weights must be equal to the window size")
    expect_error(fcm(toks, context = "window", window = 2,
                     count = "weighted", weights = c(1, 2, 3)),
                 "The length of weights must be equal to the window size")
})

test_that("fcm works tokens with paddings, #788", {
    txt <- c("The quick brown fox jumped over the lazy dog.",
             "The dog jumped and ate the fox.")
    toks <- tokens(txt, remove_punct = TRUE)
    toks <- tokens_remove(toks, pattern = stopwords(), padding = TRUE)
    fcmt <- fcm(toks, context = "window", window = 3)
    expect_equal(sort(colnames(fcmt)), sort(attr(toks, "types")))
})

test_that("test empty object is handled properly", {

    mat <- quanteda:::make_null_dfm()
    expect_equal(dim(fcm(mat)), c(0, 0))
    expect_true(is.fcm(fcm(mat)))

    toks <- tokens(c("", ""))
    expect_equal(dim(fcm(toks)), c(0, 0))
    expect_true(is.fcm(fcm(toks)))
})

test_that("arithmetic/linear operation works with dfm", {
    mt <- fcm(dfm(tokens(c(d1 = "a a b", d2 = "a b b c", d3 = "c c d"))))
    expect_true(is.fcm(mt + 2))
    expect_true(is.fcm(mt - 2))
    expect_true(is.fcm(mt * 2))
    expect_true(is.fcm(mt / 2))
    expect_true(is.fcm(mt ^ 2))
    expect_true(is.fcm(2 + mt))
    expect_true(is.fcm(2 - mt))
    expect_true(is.fcm(2 * mt))
    expect_true(is.fcm(2 / mt))
    expect_true(is.fcm(2 ^ mt))
    expect_true(is.fcm(t(mt)))
    expect_equal(rowSums(mt), colSums(t(mt)))
})

test_that("ordered is working correctly (#1413)", {
    expect_equivalent(
        as.matrix(fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE)),
        matrix(c(0, 2, 0, 0, 0, 2, 0, 0, 0),
               nrow = 3, ncol = 3, byrow = TRUE))

    expect_equivalent(
        as.matrix(fcm(tokens(c("a b c", "a b c")), "window", window = 2, ordered = TRUE)),
        matrix(c(0, 2, 2, 0, 0, 2, 0, 0, 0),
               nrow = 3, ncol = 3, byrow = TRUE))

    expect_equivalent(
        as.matrix(fcm(tokens(c("a b c", "c b a")), "window", window = 1, ordered = TRUE)),
        matrix(c(0, 1, 0, 1, 0, 1, 0, 1, 0),
               nrow = 3, ncol = 3, byrow = TRUE))

    expect_equivalent(
        as.matrix(fcm(tokens(c("a b c", "c b a")), "window", window = 2, ordered = TRUE)),
        matrix(c(0, 1, 1, 1, 0, 1, 1, 1, 0),
               nrow = 3, ncol = 3, byrow = TRUE))

    expect_equal(fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE, tri = TRUE),
                 fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE, tri = FALSE))
})

test_that("dimnames are always character vectors", {
    mt <- fcm(tokens(c("a b c", "a b c")), "window", window = 1, ordered = TRUE)
    expect_identical(dimnames(mt[, character()]),
                     list(features = rownames(mt), features = character()))
    expect_identical(dimnames(mt[, FALSE]),
                     list(features = rownames(mt), features = character()))
    expect_identical(dimnames(mt[character(), ]),
                     list(features = character(), features = colnames(mt)))
    expect_identical(dimnames(mt[FALSE, ]),
                     list(features = character(), features = colnames(mt)))
})

test_that("fcm_setnames works", {
    x <- fcm(tokens(c("a b c", "a b c")), "window", window = 1)

    quanteda:::set_fcm_featnames(x) <- paste0("feature", 1:3)
    expect_identical(featnames(x), c("feature1", "feature2", "feature3"))

    quanteda:::set_fcm_dimnames(x) <- list(paste0("feature", 1:3), paste0("ALTFEAT", 1:3))
})

test_that("fcm feature names have encoding", {
    mt <- fcm(tokens(c("文書1" = "あ い い う", "文書2" = "え え え お")))
    expect_true(all(Encoding(colnames(mt)) == "UTF-8"))
    expect_true(all(Encoding(rownames(mt)) == "UTF-8"))

    mt1 <- fcm_sort(mt)
    expect_true(all(Encoding(colnames(mt1)) == "UTF-8"))
    expect_true(all(Encoding(rownames(mt1)) == "UTF-8"))

    mt2 <- fcm_remove(mt, c("あ"))
    expect_true(all(Encoding(colnames(mt2)) == "UTF-8"))
    expect_true(all(Encoding(rownames(mt2)) == "UTF-8"))
})

test_that("fcm raise nicer error message, #1267", {
    txt <- c(d1 = "one two three", d2 = "two three four", d3 = "one three four")
    mx <- fcm(dfm(tokens(txt)))
    expect_silent(mx[])
    expect_error(mx["five"], "Subscript out of bounds")
    expect_error(mx[, "five"], "Subscript out of bounds")
    expect_error(mx[5], "Subscript out of bounds")
    expect_error(mx[, 5], "Subscript out of bounds")
    expect_error(mx[, 1:5], "Subscript out of bounds")
    expect_error(mx["d4", "five"], "Subscript out of bounds")
    expect_error(mx[, "five", TRUE], "Subscript out of bounds")
    expect_error(mx[, 5, TRUE], "Subscript out of bounds")
    expect_error(mx[, 1:5, TRUE], "Subscript out of bounds")
    expect_error(mx["d4", "five", TRUE], "Subscript out of bounds")

    expect_error(mx[4, 5], "Subscript out of bounds")
    expect_error(mx[4:5], "Subscript out of bounds")
    expect_error(mx[1:4, 1:5], "Subscript out of bounds")
    expect_error(mx[4, 5, TRUE], "Subscript out of bounds")
    expect_error(mx[1:4, 1:5, TRUE], "Subscript out of bounds")
})

test_that("margin is correct (#2176)" , {
  
  txt <- "A D a C e A D f E b A C E D"

  toks <- tokens(txt)
  fcmt <- fcm(toks)
  expect_identical(
    fcmt@meta$object$margin,
    featfreq(dfm(toks, tolower = FALSE))
  )
  
  expect_identical(
    fcm(dfm(toks, tolower = TRUE))@meta$object$margin,
    featfreq(dfm(toks, tolower = TRUE))
  )
  
  expect_identical(
    fcm(dfm(toks, tolower = FALSE))@meta$object$margin,
    featfreq(dfm(toks, tolower = FALSE))
  )
  
  toks2 <- tokens_tolower(toks)
  fcmt2 <- fcm(toks2)
  expect_identical(
    fcmt2@meta$object$margin,
    featfreq(dfm(toks2, tolower = FALSE))
  )
  
})

test_that("unused arguments works for fcm (#2103)", {
    toks <- tokens(c("a b c d e", "a a b n"))
    expect_warning(
        fcm(toks, notanargument = TRUE),
        "notanargument argument is not used"
    )
})
quanteda/quanteda documentation built on May 5, 2024, 8:33 p.m.