tests/testthat/test-textmodel_nb.R

library("quanteda")

## Example from 13.1 of _An Introduction to Information Retrieval_
txt <- c(d1 = "Chinese Beijing Chinese",
         d2 = "Chinese Chinese Shanghai",
         d3 = "Chinese Macao",
         d4 = "Tokyo Japan Chinese",
         d5 = "Chinese Chinese Chinese Tokyo Japan")
nb_dfm <- dfm(tokens(txt), tolower = FALSE)
nb_class <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)

nb_multi_smooth <-
    textmodel_nb(nb_dfm, nb_class, prior = "docfreq",
                 distribution = "multinomial", smooth = 1)
nb_multi_nosmooth <-
    textmodel_nb(nb_dfm, nb_class, prior = "docfreq",
                 distribution = "multinomial", smooth = 0)
nb_bern_smooth <-
    textmodel_nb(nb_dfm, nb_class, prior = "docfreq",
                 distribution = "Bernoulli", smooth = 1)

# test_that("class priors are preserved in correct order", {
#     expect_equal(textmodel_nb(nb_dfm, nb_class, prior = "uniform")$Pc,
#                  c(N = 0.5, Y = 0.5))
#     expect_equal(textmodel_nb(nb_dfm, nb_class, prior = "docfreq")$Pc,
#                  c(N = 0.25, Y = 0.75))
#     expect_equal(round(textmodel_nb(nb_dfm, nb_class, prior = "termfreq")$Pc, 2),
#                  c(N = 0.27, Y = 0.73))
# })

test_that("bernoulli diff from multinomial model (#776)", {
    expect_true(
        !identical(nb_multi_smooth$param[1, ], nb_bern_smooth$param[1, ])
    )
})

test_that("multinomial likelihoods and class posteriors are correct", {
    # test for results from p261, https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    # with smoothing
    expect_identical(nb_multi_smooth$param["Y", "Chinese"], 3/7)
    expect_identical(nb_multi_smooth$param["Y", "Tokyo"], 1/14)
    expect_identical(nb_multi_smooth$param["Y", "Japan"], 1/14)
    expect_identical(nb_multi_smooth$param["N", "Chinese"], 2/9)
    expect_identical(nb_multi_smooth$param["N", "Tokyo"], 2/9)
    expect_identical(nb_multi_smooth$param["N", "Japan"], 2/9)

    # without smoothing
    expect_identical(nb_multi_nosmooth$param["Y", "Chinese"], 5/8)
    expect_identical(nb_multi_nosmooth$param["Y", "Tokyo"], 0/8)
    expect_identical(nb_multi_nosmooth$param["Y", "Japan"], 0/8)
    expect_identical(nb_multi_nosmooth$param["N", "Chinese"], 1/3)
    expect_identical(nb_multi_nosmooth$param["N", "Tokyo"], 1/3)
    expect_identical(nb_multi_nosmooth$param["N", "Japan"], 1/3)
})

test_that("Bernoulli likelihoods and class posteriors are correct", {
    # test for results from p261, https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    # with smoothing
    expect_identical(nb_bern_smooth$param["Y", "Chinese"], 4/5)
    expect_identical(nb_bern_smooth$param["Y", "Japan"], 1/5)
    expect_identical(nb_bern_smooth$param["Y", "Tokyo"], 1/5)
    expect_identical(nb_bern_smooth$param["Y", "Beijing"], 2/5)
    expect_identical(nb_bern_smooth$param["Y", "Macao"], 2/5)
    expect_identical(nb_bern_smooth$param["Y", "Shanghai"], 2/5)
    expect_identical(nb_bern_smooth$param["N", "Chinese"], 2/3)
    expect_identical(nb_bern_smooth$param["N", "Japan"], 2/3)
    expect_identical(nb_bern_smooth$param["N", "Tokyo"], 2/3)
    expect_identical(nb_bern_smooth$param["N", "Beijing"], 1/3)
    expect_identical(nb_bern_smooth$param["N", "Macao"], 1/3)
    expect_identical(nb_bern_smooth$param["N", "Shanghai"], 1/3)
})

test_that("Bernoulli nb predicted values are correct", {
    book_lik_Y <- 3/4 * 4/5 * 1/5 * 1/5 * (1-2/5) * (1-2/5) * (1-2/5)  # 0.005184
    book_lik_N <- 1/4 * 2/3 * 2/3 * 2/3 * (1-1/3) * (1-1/3) * (1-1/3)  # 0.02194787
    nb_bern_smooth_pred <- predict(nb_bern_smooth, nb_dfm, type = "prob")
    expect_equal(
        book_lik_Y / (book_lik_Y + book_lik_N),
        nb_bern_smooth_pred["d5", "Y"]
    )
    expect_equal(
        book_lik_N / (book_lik_Y + book_lik_N),
        nb_bern_smooth_pred["d5", "N"]
    )
})

test_that("Works with newdata with different features from the model (#1329 and #1322)", {
    mt1 <- dfm(tokens(c(text1 = "a b c", text2 = "d e f")))
    mt2 <- dfm(tokens(c(text3 = "a b c", text4 = "e f g")))

    nb <- textmodel_nb(mt1, factor(1:2))

    expect_silent(predict(nb, newdata = mt1, force = TRUE))
    expect_warning(predict(nb, newdata = mt2, force = TRUE),
                   "1 feature in newdata not used in prediction\\.")
    expect_error(predict(nb, newdata = mt2),
                 "newdata's feature set is not conformant to model terms\\.")
})

test_that("Works with features with zero probability", {
    mt <- as.dfm(matrix(c(0, 0, 3, 1, 4, 2), nrow = 2))
    nb <- textmodel_nb(mt, factor(1:2), smooth = 0)
    expect_silent(predict(nb))
})

test_that("types works (#1322)", {
    pr <- predict(nb_multi_smooth)
    expect_identical(names(pr), docnames(nb_multi_smooth))
    expect_is(pr, "factor")

    pr_prob <- predict(nb_multi_smooth, type = "probability")
    expect_identical(colnames(pr_prob), c("N", "Y"))
    expect_identical(rownames(pr_prob), c("d1", "d2", "d3", "d4", "d5"))
    expect_equal(pr_prob[1, ], c(N = .065, Y = .935), tol = .001)
    expect_is(pr_prob, "matrix")

    pr_lp <- predict(nb_multi_smooth, type = "logposterior")
    expect_identical(colnames(pr_lp), c("N", "Y"))
    expect_identical(rownames(pr_lp), c("d1", "d2", "d3", "d4", "d5"))
    expect_equal(pr_lp[1, ], c(N = -6.59, Y = -3.93), tol = .01)
    expect_is(pr_lp, "matrix")
})

test_that("textmodel_nb print methods work", {
    nb <- textmodel_nb(data_dfm_lbgexample, c(seq(-1.5, 1.5, .75), NA))
    expect_output(
        print(nb),
        "^\\nCall:\\ntextmodel_nb.dfm\\(x = data_dfm_lbgexample, y = c\\(seq\\(-1\\.5,"
    )

    nbs <- summary(nb)
    expect_output(
        print(nbs),
        "^\\nCall:\\ntextmodel_nb.dfm\\(x = data_dfm_lbgexample, y = c\\(seq\\(-1\\.5,"
    )
})

test_that("coef methods work", {
    cef <- coef(nb_multi_smooth)
    expect_is(cef, "matrix")
    expect_equal(dim(cef), c(6, 2))
    expect_equal(
        dimnames(cef),
        list(c("Chinese", "Beijing", "Shanghai", "Macao", "Tokyo", "Japan"), c("N", "Y"))
    )
    cef <- coef(nb_bern_smooth)
    expect_is(cef, "matrix")
    expect_equal(dim(cef), c(6, 2))
    expect_equal(
        dimnames(cef),
        list(c("Chinese", "Beijing", "Shanghai", "Macao", "Tokyo", "Japan"), c("N", "Y"))
    )
})

test_that("raise warning of unused dots", {
    expect_warning(predict(nb_multi_smooth, something = TRUE),
                   "something argument is not used")
    expect_warning(predict(nb_multi_smooth, something = TRUE, whatever = TRUE),
                   "something, whatever arguments are not used")
})

test_that("raises error when dfm is empty (#1419)",  {
    mx <- dfm_trim(data_dfm_lbgexample, 1000)
    expect_error(textmodel_nb(mx, factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)),
                 quanteda.textmodels:::message_error("dfm_empty"))
})

test_that("constant predictor raises exception", {
    txt <- c(d1 = "Chinese Beijing Chinese",
             d2 = "Chinese Chinese Shanghai",
             d3 = "Chinese Macao",
             d4 = "Tokyo Japan Chinese",
             d5 = "Chinese Chinese Chinese Tokyo Japan")
    x <- dfm(tokens(txt), tolower = FALSE)

    expect_error(
        textmodel_nb(x, y = c("Y", "Y", "Y", "Y", NA)),
        "y cannot be constant"
    )
    expect_error(
        textmodel_nb(x, y = factor(c("Y", "Y", "Y", "Y", NA))),
        "y cannot be constant"
    )
    expect_error(
        textmodel_nb(x, y = factor(c("Y", "Y", "Y", "Y", NA), levels = c("Y", "N"))),
        "y cannot be constant"
    )
})

test_that("textmodel_nb() works with weighted dfm", {
    dfmat <- dfm_tfidf(data_dfm_lbgexample)
    expect_silent(
        tmod <- textmodel_nb(dfmat, y = c("N", "N", NA, "Y", "Y", NA))
    )
    expect_silent(
        predict(tmod)
    )
})

test_that("multinomial output matches fastNaiveBayes and naivebayes packages", {
    skip_if_not_installed("fastNaiveBayes")
    skip_if_not_installed("naivebayes")
    library("fastNaiveBayes")
    library("naivebayes")

    x <- nb_dfm
    y <- nb_class

    tmod_fnb <- fnb.multinomial(x[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                                laplace = 1, sparse = TRUE)
    tmod_nb <- textmodel_nb(x, y, prior = "docfreq", distribution = "multinomial")
    tmod_bnb <- multinomial_naive_bayes(as(x[1:4, ], "dgCMatrix"), y[1:4], laplace = 1)

    expect_equivalent(
        as.numeric(predict(tmod_fnb, x[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    # expect_equivalent(
    #     predict(tmod_fnb, x[5, ], sparse = TRUE, type = "class"),
    #     predict(tmod_nb, x[5, ], type = "class")
    # )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, x[5, ], type = "prob")
    )

    tmod_fnb <- fnb.multinomial(x[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                                laplace = 0.5, sparse = TRUE)
    tmod_nb <- textmodel_nb(x, y, prior = "docfreq", smooth = 0.5, distribution = "multinomial")
    tmod_bnb <- multinomial_naive_bayes(as(x[1:4, ], "dgCMatrix"), y[1:4], laplace = 0.5)

    expect_equivalent(
        as.numeric(predict(tmod_fnb, x[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    # expect_equivalent(
    #     predict(tmod_fnb, x[5, ], sparse = TRUE, type = "class"),
    #     predict(tmod_nb, x[5, ], type = "class")
    # )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, x[5, ], type = "prob")
    )
})

test_that("Bernoulli output matches fastNaiveBayes and naivebayes packages", {
    skip_if_not_installed("fastNaiveBayes")
    skip_if_not_installed("naivebayes")
    library("fastNaiveBayes")
    library("naivebayes")

    xb <- dfm_weight(nb_dfm, scheme = "boolean")
    y <- nb_class

    tmod_fnb <- fnb.bernoulli(xb[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                              laplace = 1, sparse = TRUE)
    tmod_nb <- textmodel_nb(xb[1:4, ], y[1:4], prior = "docfreq", distribution = "Bernoulli")
    tmod_bnb <- bernoulli_naive_bayes(as(xb[1:4, ], "dgCMatrix"), y[1:4], laplace = 1)

    expect_equivalent(
        as.numeric(predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "class"),
        predict(tmod_nb, xb[5, ], type = "class")
    )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, xb[5, ], type = "prob")
    )

    tmod_fnb <- fnb.bernoulli(xb[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                              laplace = 0, sparse = TRUE)
    tmod_nb <- textmodel_nb(xb, y, prior = "docfreq", smooth = 0, distribution = "Bernoulli")
    tmod_bnb <- suppressWarnings(bernoulli_naive_bayes(as(xb[1:4, ], "dgCMatrix"), y[1:4], laplace = 0))

    expect_equivalent(
        as.numeric(predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "class"),
        predict(tmod_nb, xb[5, ], type = "class")
    )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )

    tmod_fnb <- fnb.bernoulli(xb[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                              laplace = 0.5, sparse = TRUE)
    tmod_nb <- textmodel_nb(xb, y, prior = "docfreq", smooth = 0.5, distribution = "Bernoulli")
    tmod_bnb <- bernoulli_naive_bayes(as(xb[1:4, ], "dgCMatrix"), y[1:4], laplace = 0.5)

    expect_equivalent(
        as.numeric(predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "class"),
        predict(tmod_nb, xb[5, ], type = "class")
    )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )
})

Try the quanteda.textmodels package in your browser

Any scripts or data that you put into this service are public.

quanteda.textmodels documentation built on Sept. 11, 2024, 8:19 p.m.