# tests/testthat/test-textmodel_nb.R

library("quanteda")

## Example from 13.1 of _An Introduction to Information Retrieval_
## (Manning, Raghavan & Schuetze).  Documents d1-d4 are labelled; d5 has
## class NA (unlabelled) and is used as the prediction target in the
## tests below.
txt <- c(d1 = "Chinese Beijing Chinese",
         d2 = "Chinese Chinese Shanghai",
         d3 = "Chinese Macao",
         d4 = "Tokyo Japan Chinese",
         d5 = "Chinese Chinese Chinese Tokyo Japan")
# tolower = FALSE keeps the capitalized place names as distinct features
nb_dfm <- dfm(tokens(txt), tolower = FALSE)
# ordered factor; levels sort alphabetically, so N < Y; d5 is NA
nb_class <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)

# Shared fitted models: multinomial with and without Laplace smoothing,
# and smoothed Bernoulli, all using document-frequency class priors.
nb_multi_smooth <-
    textmodel_nb(nb_dfm, nb_class, prior = "docfreq",
                 distribution = "multinomial", smooth = 1)
nb_multi_nosmooth <-
    textmodel_nb(nb_dfm, nb_class, prior = "docfreq",
                 distribution = "multinomial", smooth = 0)
nb_bern_smooth <-
    textmodel_nb(nb_dfm, nb_class, prior = "docfreq",
                 distribution = "Bernoulli", smooth = 1)

# test_that("class priors are preserved in correct order", {
#     expect_equal(textmodel_nb(nb_dfm, nb_class, prior = "uniform")$Pc,
#                  c(N = 0.5, Y = 0.5))
#     expect_equal(textmodel_nb(nb_dfm, nb_class, prior = "docfreq")$Pc,
#                  c(N = 0.25, Y = 0.75))
#     expect_equal(round(textmodel_nb(nb_dfm, nb_class, prior = "termfreq")$Pc, 2),
#                  c(N = 0.27, Y = 0.73))
# })

test_that("bernoulli diff from multinomial model (#776)", {
    # The two distributions estimate feature likelihoods differently, so
    # their parameter rows must not coincide (#776 regression check).
    # expect_false() reads better and reports better than
    # expect_true(!...) on failure.
    expect_false(
        identical(nb_multi_smooth$param[1, ], nb_bern_smooth$param[1, ])
    )
})

test_that("multinomial likelihoods and class posteriors are correct", {
    # reference values: p. 261,
    # https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    # Laplace-smoothed (smooth = 1) conditional probabilities
    prm <- nb_multi_smooth$param
    expect_identical(prm["Y", "Chinese"], 3/7)
    expect_identical(prm["Y", "Tokyo"], 1/14)
    expect_identical(prm["Y", "Japan"], 1/14)
    expect_identical(prm["N", "Chinese"], 2/9)
    expect_identical(prm["N", "Tokyo"], 2/9)
    expect_identical(prm["N", "Japan"], 2/9)

    # maximum-likelihood (smooth = 0) conditional probabilities
    prm0 <- nb_multi_nosmooth$param
    expect_identical(prm0["Y", "Chinese"], 5/8)
    expect_identical(prm0["Y", "Tokyo"], 0/8)
    expect_identical(prm0["Y", "Japan"], 0/8)
    expect_identical(prm0["N", "Chinese"], 1/3)
    expect_identical(prm0["N", "Tokyo"], 1/3)
    expect_identical(prm0["N", "Japan"], 1/3)
})

test_that("Bernoulli likelihoods and class posteriors are correct", {
    # reference values: p. 261,
    # https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    prm <- nb_bern_smooth$param

    # smoothed Bernoulli document-level probabilities, class Y
    expected_y <- c(Chinese = 4/5, Japan = 1/5, Tokyo = 1/5,
                    Beijing = 2/5, Macao = 2/5, Shanghai = 2/5)
    for (feat in names(expected_y)) {
        expect_identical(prm["Y", feat], unname(expected_y[feat]))
    }

    # smoothed Bernoulli document-level probabilities, class N
    expected_n <- c(Chinese = 2/3, Japan = 2/3, Tokyo = 2/3,
                    Beijing = 1/3, Macao = 1/3, Shanghai = 1/3)
    for (feat in names(expected_n)) {
        expect_identical(prm["N", feat], unname(expected_n[feat]))
    }
})

test_that("Bernoulli nb predicted values are correct", {
    # hand-computed Bernoulli likelihoods for d5 from the IR-book example:
    # prior * P(present features) * P(absent features)
    lik_y <- 3/4 * 4/5 * 1/5 * 1/5 * (1 - 2/5) * (1 - 2/5) * (1 - 2/5)  # 0.005184
    lik_n <- 1/4 * 2/3 * 2/3 * 2/3 * (1 - 1/3) * (1 - 1/3) * (1 - 1/3)  # 0.02194787
    pred <- predict(nb_bern_smooth, nb_dfm, type = "prob")
    # posterior class probabilities are the normalized likelihoods
    expect_equal(pred["d5", "Y"], lik_y / (lik_y + lik_n))
    expect_equal(pred["d5", "N"], lik_n / (lik_y + lik_n))
})

test_that("Works with newdata with different features from the model (#1329 and #1322)", {
    dfmat_train <- dfm(tokens(c(text1 = "a b c", text2 = "d e f")))
    dfmat_new <- dfm(tokens(c(text3 = "a b c", text4 = "e f g")))

    tmod <- textmodel_nb(dfmat_train, factor(1:2))

    # same feature set: silent even with force = TRUE
    expect_silent(predict(tmod, newdata = dfmat_train, force = TRUE))
    # unseen feature "g": force = TRUE proceeds with a warning
    expect_warning(predict(tmod, newdata = dfmat_new, force = TRUE),
                   "1 feature in newdata not used in prediction\\.")
    # without force, a feature mismatch is an error
    expect_error(predict(tmod, newdata = dfmat_new),
                 "newdata's feature set is not conformant to model terms\\.")
})

test_that("Works with features with zero probability", {
    # the first feature has zero counts in both classes; with smooth = 0
    # its likelihood stays zero, and prediction must still succeed
    dfmat <- as.dfm(matrix(c(0, 0, 3, 1, 4, 2), nrow = 2))
    tmod <- textmodel_nb(dfmat, factor(1:2), smooth = 0)
    expect_silent(predict(tmod))
})

test_that("types works (#1322)", {
    # default prediction: a named factor of predicted classes
    pr <- predict(nb_multi_smooth)
    expect_identical(names(pr), docnames(nb_multi_smooth))
    # expect_s3_class() replaces the deprecated expect_is()
    expect_s3_class(pr, "factor")

    # type = "probability": documents x classes probability matrix
    pr_prob <- predict(nb_multi_smooth, type = "probability")
    expect_identical(colnames(pr_prob), c("N", "Y"))
    expect_identical(rownames(pr_prob), c("d1", "d2", "d3", "d4", "d5"))
    # spell out tolerance (no partial argument matching)
    expect_equal(pr_prob[1, ], c(N = .065, Y = .935), tolerance = .001)
    # a bare matrix is not an S3 object, so test with is.matrix()
    expect_true(is.matrix(pr_prob))

    # type = "logposterior": documents x classes log-posterior matrix
    pr_lp <- predict(nb_multi_smooth, type = "logposterior")
    expect_identical(colnames(pr_lp), c("N", "Y"))
    expect_identical(rownames(pr_lp), c("d1", "d2", "d3", "d4", "d5"))
    expect_equal(pr_lp[1, ], c(N = -6.59, Y = -3.93), tolerance = .01)
    expect_true(is.matrix(pr_lp))
})

test_that("textmodel_nb print methods work", {
    tmod <- textmodel_nb(data_dfm_lbgexample, c(seq(-1.5, 1.5, .75), NA))

    # the print method echoes the original fitting call
    expect_output(
        print(tmod),
        "^\\nCall:\\ntextmodel_nb.dfm\\(x = data_dfm_lbgexample, y = c\\(seq\\(-1\\.5,"
    )

    # the summary print method shows the same call header
    expect_output(
        print(summary(tmod)),
        "^\\nCall:\\ntextmodel_nb.dfm\\(x = data_dfm_lbgexample, y = c\\(seq\\(-1\\.5,"
    )
})

test_that("coef methods work", {
    # coef() should return the same features x classes layout for both
    # distributions, so verify with one shared local helper instead of
    # duplicated expectation blocks.
    expect_coef_matrix <- function(model) {
        cef <- coef(model)
        # a bare matrix is not an S3 object; is.matrix() replaces the
        # deprecated expect_is(cef, "matrix")
        expect_true(is.matrix(cef))
        expect_equal(dim(cef), c(6, 2))
        expect_equal(
            dimnames(cef),
            list(c("Chinese", "Beijing", "Shanghai", "Macao", "Tokyo", "Japan"),
                 c("N", "Y"))
        )
    }
    expect_coef_matrix(nb_multi_smooth)
    expect_coef_matrix(nb_bern_smooth)
})

test_that("raise warning of unused dots", {
    # a single unknown argument is named in the warning
    expect_warning(
        predict(nb_multi_smooth, something = TRUE),
        "something argument is not used"
    )
    # multiple unknown arguments are listed together
    expect_warning(
        predict(nb_multi_smooth, something = TRUE, whatever = TRUE),
        "something, whatever arguments are not used"
    )
})

test_that("raises error when dfm is empty (#1419)", {
    # trimming at min_termfreq = 1000 leaves no features at all
    dfmat_empty <- dfm_trim(data_dfm_lbgexample, 1000)
    y <- factor(c("Y", "Y", "Y", "N", NA), ordered = TRUE)
    expect_error(textmodel_nb(dfmat_empty, y),
                 quanteda.textmodels:::message_error("dfm_empty"))
})

test_that("constant predictor raises exception", {
    txt <- c(d1 = "Chinese Beijing Chinese",
             d2 = "Chinese Chinese Shanghai",
             d3 = "Chinese Macao",
             d4 = "Tokyo Japan Chinese",
             d5 = "Chinese Chinese Chinese Tokyo Japan")
    dfmat <- dfm(tokens(txt), tolower = FALSE)

    # a y with only one observed class must error, whether supplied as a
    # character vector, a factor, or a factor carrying an unused level
    constant_ys <- list(
        c("Y", "Y", "Y", "Y", NA),
        factor(c("Y", "Y", "Y", "Y", NA)),
        factor(c("Y", "Y", "Y", "Y", NA), levels = c("Y", "N"))
    )
    for (y in constant_ys) {
        expect_error(textmodel_nb(dfmat, y = y), "y cannot be constant")
    }
})

test_that("textmodel_nb() works with weighted dfm", {
    # tf-idf weights give non-integer cell values; fitting and
    # prediction should both accept them silently
    dfmat_weighted <- dfm_tfidf(data_dfm_lbgexample)
    expect_silent(
        tmod <- textmodel_nb(dfmat_weighted, y = c("N", "N", NA, "Y", "Y", NA))
    )
    expect_silent(
        predict(tmod)
    )
})

test_that("multinomial output matches fastNaiveBayes and naivebayes packages", {
    # Cross-validate the multinomial NB implementation against two
    # independent CRAN packages; skipped when either is unavailable.
    skip_if_not_installed("fastNaiveBayes")
    skip_if_not_installed("naivebayes")
    library("fastNaiveBayes")
    library("naivebayes")

    x <- nb_dfm
    y <- nb_class

    # fit with Laplace smoothing = 1: the external packages get the four
    # labelled rows explicitly; textmodel_nb() gets all five (d5 is NA)
    tmod_fnb <- fnb.multinomial(x[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                                laplace = 1, sparse = TRUE)
    tmod_nb <- textmodel_nb(x, y, prior = "docfreq", distribution = "multinomial")
    tmod_bnb <- multinomial_naive_bayes(as(x[1:4, ], "dgCMatrix"), y[1:4], laplace = 1)

    # class probabilities for the held-out d5 must agree across packages
    expect_equivalent(
        as.numeric(predict(tmod_fnb, x[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    # NOTE(review): class-label comparison left disabled upstream —
    # reason not recorded here; confirm before re-enabling
    # expect_equivalent(
    #     predict(tmod_fnb, x[5, ], sparse = TRUE, type = "class"),
    #     predict(tmod_nb, x[5, ], type = "class")
    # )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, x[5, ], type = "prob")
    )

    # repeat the comparison with fractional smoothing (laplace = 0.5)
    tmod_fnb <- fnb.multinomial(x[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                                laplace = 0.5, sparse = TRUE)
    tmod_nb <- textmodel_nb(x, y, prior = "docfreq", smooth = 0.5, distribution = "multinomial")
    tmod_bnb <- multinomial_naive_bayes(as(x[1:4, ], "dgCMatrix"), y[1:4], laplace = 0.5)

    expect_equivalent(
        as.numeric(predict(tmod_fnb, x[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    # NOTE(review): disabled upstream, as above
    # expect_equivalent(
    #     predict(tmod_fnb, x[5, ], sparse = TRUE, type = "class"),
    #     predict(tmod_nb, x[5, ], type = "class")
    # )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, x[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(x)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, x[5, ], type = "prob")
    )
})

test_that("Bernoulli output matches fastNaiveBayes and naivebayes packages", {
    # Cross-validate the Bernoulli NB implementation against two
    # independent CRAN packages; skipped when either is unavailable.
    skip_if_not_installed("fastNaiveBayes")
    skip_if_not_installed("naivebayes")
    library("fastNaiveBayes")
    library("naivebayes")

    # binarize counts, since the Bernoulli model uses presence/absence
    xb <- dfm_weight(nb_dfm, scheme = "boolean")
    y <- nb_class

    # smoothing = 1: all three fitted on the four labelled documents
    tmod_fnb <- fnb.bernoulli(xb[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                              laplace = 1, sparse = TRUE)
    tmod_nb <- textmodel_nb(xb[1:4, ], y[1:4], prior = "docfreq", distribution = "Bernoulli")
    tmod_bnb <- bernoulli_naive_bayes(as(xb[1:4, ], "dgCMatrix"), y[1:4], laplace = 1)

    # probabilities and class labels for d5 must agree across packages
    expect_equivalent(
        as.numeric(predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "class"),
        predict(tmod_nb, xb[5, ], type = "class")
    )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, xb[5, ], type = "prob")
    )

    # no smoothing (laplace = 0); textmodel_nb() here receives all five
    # rows (d5's class is NA)
    tmod_fnb <- fnb.bernoulli(xb[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                              laplace = 0, sparse = TRUE)
    tmod_nb <- textmodel_nb(xb, y, prior = "docfreq", smooth = 0, distribution = "Bernoulli")
    # suppressed: bernoulli_naive_bayes warns with laplace = 0 —
    # presumably about zero-probability estimates; confirm if revisited
    tmod_bnb <- suppressWarnings(bernoulli_naive_bayes(as(xb[1:4, ], "dgCMatrix"), y[1:4], laplace = 0))

    expect_equivalent(
        as.numeric(predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "class"),
        predict(tmod_nb, xb[5, ], type = "class")
    )
    # loose tolerance for floating-point differences across packages
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )

    # fractional smoothing (laplace = 0.5)
    tmod_fnb <- fnb.bernoulli(xb[1:4, ], y[1:4], priors = as.numeric(prop.table(table(y))),
                              laplace = 0.5, sparse = TRUE)
    tmod_nb <- textmodel_nb(xb, y, prior = "docfreq", smooth = 0.5, distribution = "Bernoulli")
    tmod_bnb <- bernoulli_naive_bayes(as(xb[1:4, ], "dgCMatrix"), y[1:4], laplace = 0.5)

    expect_equivalent(
        as.numeric(predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "raw")),
        predict(tmod_nb, xb[5, ], type = "prob")
    )
    expect_equivalent(
        predict(tmod_fnb, xb[5, ], sparse = TRUE, type = "class"),
        predict(tmod_nb, xb[5, ], type = "class")
    )
    expect_equivalent(
        as.numeric(predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob")),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )
    expect_equivalent(
        predict(tmod_bnb, newdata = as.matrix(xb)[5, , drop = FALSE], type = "prob"),
        predict(tmod_nb, xb[5, ], type = "prob"),
        tol = .000001
    )
})

# Try the quanteda.textmodels package in your browser
#
# Any scripts or data that you put into this service are public.
#
# quanteda.textmodels documentation built on March 31, 2023, 8:09 p.m.