inst/doc/textmodel_performance.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  error = TRUE,
  collapse = TRUE,
  comment = "##"
)

## ----setup--------------------------------------------------------------------
library("quanteda")
library("quanteda.textmodels")

## ----echo = FALSE-------------------------------------------------------------
# large movie review database of 50,000 movie reviews
data_corpus_LMRD <- NULL
suppressWarnings(
    try(load(url("https://quanteda.org/data/data_corpus_LMRD.rda")))
)
if (is.null(data_corpus_LMRD)) {
    data_corpus_LMRD <- data_corpus_moviereviews
    data_corpus_LMRD$polarity <- data_corpus_LMRD$sentiment
    data_corpus_LMRD$set <- "train"
    data_corpus_LMRD$set[c(sample(1:1000, size = 200),
                           sample(1001:2000, size = 200))] <- "test"
}

dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")

## ----eval = FALSE-------------------------------------------------------------
#  # large movie review database of 50,000 movie reviews
#  load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))
#  
#  dfmat <- tokens(data_corpus_LMRD) %>%
#    dfm()
#  dfmat_train <- dfm_subset(dfmat, set == "train")
#  dfmat_test <- dfm_subset(dfmat, set == "test")

## -----------------------------------------------------------------------------
library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 20
)

## -----------------------------------------------------------------------------
microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 20
)

## -----------------------------------------------------------------------------
library("fastNaiveBayes")
library("naivebayes")

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
      tmod = multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 20
)

## -----------------------------------------------------------------------------
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

microbenchmark(
    textmodel_nb = {
      tmod <-  textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
      tmod = bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 20
)

Try the quanteda.textmodels package in your browser

Any scripts or data that you put into this service are public.

quanteda.textmodels documentation built on May 29, 2024, 3:07 a.m.