# Global knitr chunk options for this document: `error = TRUE` lets knitting
# continue past a failing chunk, `collapse = TRUE` merges source and output
# into a single block, and `comment = "##"` sets the output line prefix.
knitr::opts_chunk$set(
  error = TRUE,
  collapse = TRUE,
  comment = "##"
)
library("quanteda") library("quanteda.textmodels")
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare the performance of the two models, and then compare them to the performance of two other packages for fitting these models.
For these tests, we will choose the dataset of 50,000 movie reviews from Maas et al. (2011). We will use their partition into training and test sets for fitting our models and generating predictions.
# Large movie review database of 50,000 movie reviews.
#
# Try to download the full corpus. If that fails for any reason (for example
# no network access), data_corpus_LMRD stays NULL and we fall back to
# data_corpus_moviereviews (expected to be provided by an attached package)
# so that the remaining chunks can still run.
data_corpus_LMRD <- NULL
lmrd_con <- url("https://quanteda.org/data/data_corpus_LMRD.rda")
tryCatch(
  suppressWarnings(load(lmrd_con)),
  error = function(e) {
    # Report the failure explicitly instead of printing a raw try() error,
    # so the reader knows the fallback corpus is being used.
    message(
      "Could not load data_corpus_LMRD (", conditionMessage(e),
      "); falling back to data_corpus_moviereviews."
    )
  },
  # Always release the connection. Closing can itself fail if the connection
  # was never usable, and that must not prevent the fallback below.
  finally = try(close(lmrd_con), silent = TRUE)
)

if (is.null(data_corpus_LMRD)) {
  data_corpus_LMRD <- data_corpus_moviereviews
  # Mirror the document variables the rest of the document relies on:
  # `polarity` as the class label and `set` as the train/test indicator.
  data_corpus_LMRD$polarity <- data_corpus_LMRD$sentiment
  data_corpus_LMRD$set <- "train"
  # Hold out 200 randomly chosen documents from positions 1-1000 and 200 from
  # positions 1001-2000 as the test set.
  data_corpus_LMRD$set[c(sample(1:1000, size = 200),
                         sample(1001:2000, size = 200))] <- "test"
}

# Tokenize and build the document-feature matrix, then split it using the
# `set` document variable.
dfmat <- dfm(tokens(data_corpus_LMRD))
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
# Large movie review database of 50,000 movie reviews.
# Loading the .rda file creates the objects it contains (here expected to be
# data_corpus_LMRD) in the current environment.
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

# Tokenize the corpus and convert the tokens to a document-feature matrix.
dfmat <- dfm(tokens(data_corpus_LMRD))

# Partition the matrix on the `set` document variable.
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")

# Time model fitting only, once per distribution, on the training matrix.
# The class labels come from the `polarity` document variable.
microbenchmark(
  multi = textmodel_nb(
    x = dfmat_train,
    y = dfmat_train$polarity,
    distribution = "multinomial"
  ),
  bern = textmodel_nb(
    x = dfmat_train,
    y = dfmat_train$polarity,
    distribution = "Bernoulli"
  ),
  times = 20
)
And for prediction:
# Fit each model once, outside the timed expressions, so that the benchmark
# below measures prediction only rather than fitting plus prediction.
tmod_nb_multi <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                              distribution = "multinomial")
tmod_nb_bern <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                             distribution = "Bernoulli")

# Time prediction on the held-out test matrix for both distributions.
microbenchmark(
  multi = predict(tmod_nb_multi, newdata = dfmat_test),
  bern = predict(tmod_nb_bern, newdata = dfmat_test),
  times = 20
)
Now let's see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")

# Compare fit + predict for the multinomial model across three packages.
# Each timed expression fits on the training matrix with add-one smoothing
# and then predicts on the test matrix. For the two other packages the dfm is
# coerced to "dgCMatrix" inside the timed expression, so that coercion is part
# of what is measured.
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                         smooth = 1, distribution = "multinomial")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"),
                            y = dfmat_train$polarity,
                            laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"),
                                    dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 20
)
And Bernoulli. Note here that while we are supplying the Boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance - it's done here just for comparison.
# Pre-compute Boolean (presence/absence) versions of the training and test
# matrices once, outside the timed expressions.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

# Compare fit + predict for the Bernoulli model across three packages.
# Every package is given the same pre-weighted Boolean matrices for both
# fitting and prediction, so that no implementation is charged for
# re-weighting the test counts inside the timed expression.
microbenchmark(
  textmodel_nb = {
    tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity,
                         smooth = 1, distribution = "Bernoulli")
    pred <- predict(tmod, newdata = dfmat_test_bern)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"),
                          y = dfmat_train$polarity,
                          laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"),
                                  dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 20
)
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). "Learning Word Vectors for Sentiment Analysis". The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, https://CRAN.R-project.org/package=naivebayes.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-05-04.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.