Tests using the Large Movie Review Dataset (Maas et al. 2011), a benchmark for binary sentiment classification containing 25,000 highly polar movie reviews for training and 25,000 for testing.

Source: Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. 2011. "Learning Word Vectors for Sentiment Analysis." In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
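
The code below relies on the quanteda, quanteda.textmodels, and quanteda.classifiers packages. A minimal installation sketch, assuming quanteda.classifiers is installed from its GitHub repository:

```{r, eval = FALSE}
# CRAN packages
install.packages(c("quanteda", "quanteda.textmodels"))
# quanteda.classifiers, assumed here to be installed from GitHub
remotes::install_github("quanteda/quanteda.classifiers")
```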

## Setting up {.tabset .tabset-fade}

```{r setup}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
library("quanteda")
library("quanteda.textmodels")  # textmodel_nb(), textmodel_svm(), textmodel_svmlin()
library("quanteda.classifiers") # data_corpus_LMRD, textmodel_mlp(), textmodel_cnnlstmemb()
```

### Test and training data

```{r}
# document-feature matrix, trimmed to features occurring at least 100 times
dfmat <- tokens(data_corpus_LMRD) %>%
    dfm() %>%
    dfm_trim(min_termfreq = 100)
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
```
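
As a quick sanity check (an addition, not part of the original benchmark), we can confirm that each split holds 25,000 reviews, balanced across the two polarity classes; `set` and `polarity` are docvars supplied with `data_corpus_LMRD`:

```{r}
# 25,000 documents per split, 12,500 per polarity class
dim(dfmat_train)
dim(dfmat_test)
table(dfmat_train$polarity)
table(dfmat_test$polarity)
```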

### Some functions for evaluation
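
For reference, the `performance()` function defined below takes a confusion matrix with rows giving predicted and columns giving actual classes, the positive class in cell (1, 1), and computes

$$\text{precision} = \frac{TP}{TP + FP}, \qquad \text{recall} = \frac{TP}{TP + FN}, \qquad \text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN},$$

with balanced accuracy defined as the mean of recall (the true positive rate) and the true negative rate:

$$\text{balanced accuracy} = \frac{1}{2} \left( \frac{TP}{TP + FN} + \frac{TN}{TN + FP} \right).$$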

```{r}
performance <- function(mytable, verbose = TRUE) {
  # assumes rows = predicted classes, columns = actual classes,
  # with the positive class in row 1, column 1
  truePositives <- mytable[1, 1]
  trueNegatives <- sum(diag(mytable)[-1])
  falsePositives <- sum(mytable[1, ]) - truePositives
  falseNegatives <- sum(mytable[, 1]) - truePositives
  precision <- truePositives / (truePositives + falsePositives)
  recall <- truePositives / (truePositives + falseNegatives)
  accuracy <- sum(diag(mytable)) / sum(mytable)
  tnr <- trueNegatives / (trueNegatives + falsePositives)
  # balanced accuracy: mean of recall (sensitivity) and the
  # true negative rate (specificity)
  balanced_accuracy <- mean(c(recall, tnr), na.rm = TRUE)
  if (verbose) {
    print(mytable)
    cat(
      "\n    precision =", round(precision, 2),
      "\n       recall =", round(recall, 2),
      "\n     accuracy =", round(accuracy, 2),
      "\n    bal. acc. =", round(balanced_accuracy, 2),
      "\n"
    )
  }
  invisible(c(precision = precision, recall = recall))
}
```
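
To see the function in action, here is a quick check on a small made-up confusion matrix (the numbers are invented, for illustration only):

```{r}
# rows = predicted class, columns = actual class; "pos" comes first
toy <- matrix(c(40, 5, 10, 45), nrow = 2,
              dimnames = list(predicted = c("pos", "neg"),
                              actual = c("pos", "neg")))
performance(toy)
```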

## Naive Bayes {.tabset .tabset-fade}

### No weights

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfmat_test, type = "class")
    # reorder so that the positive class lands in cell [1, 1]
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```
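
As a side note (not part of the original benchmark), the fitted Naive Bayes model can also return posterior class probabilities instead of hard class predictions, via the `type = "probability"` option of `predict.textmodel_nb()`:

```{r}
# posterior probabilities for each class, one row per test document
prob_nb <- predict(tmod_nb, newdata = dfmat_test, type = "probability")
head(prob_nb)
```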

### tf-idf weighting

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfm_tfidf(dfmat_train),
                            y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfm_tfidf(dfmat_test), type = "class")
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## SVM {.tabset .tabset-fade}

### SVM

```{r}
system.time({
    tmod_svm <- textmodel_svm(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_svm <- predict(tmod_svm, newdata = dfmat_test, type = "class")
    table(pred_svm, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

### SVM w/tf-idf weights

```{r}
# timed with system.time() for consistency with the other models
system.time({
    tmod_svm2 <- textmodel_svm(dfm_tfidf(dfmat_train),
                               y = docvars(dfmat_train, "polarity"))
    pred_svm2 <- predict(tmod_svm2, newdata = dfm_tfidf(dfmat_test), type = "class")
    table(pred_svm2, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

### SVMlin

```{r}
system.time({
    tmod_svmlin <- textmodel_svmlin(dfmat_train,
                                    y = docvars(dfmat_train, "polarity"))
    pred_svmlin <- predict(tmod_svmlin, newdata = dfmat_test, type = "class")
    table(pred_svmlin, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## Multilayer Perceptron Network (MLP)

```{r}
system.time({
    tmod_mlp <- textmodel_mlp(dfmat_train,
                              y = docvars(dfmat_train, "polarity"), epochs = 10)
    pred_mlp <- predict(tmod_mlp, newdata = dfmat_test, type = "class")
    table(pred_mlp, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## CNN-embedding-LSTM model

```{r}
system.time({
    # tokenize the *training* split, dropping punctuation but keeping pads
    toks_train <- data_corpus_LMRD %>%
        corpus_subset(set == "train") %>%
        tokens() %>%
        tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE)
    tmod_cnn <- textmodel_cnnlstmemb(toks_train,
                                     y = docvars(dfmat_train, "polarity"),
                                     epochs = 10)

    # tokenize the *test* split the same way for prediction
    toks_test <- data_corpus_LMRD %>%
        corpus_subset(set == "test") %>%
        tokens() %>%
        tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE)
    pred_cnn <- predict(tmod_cnn, newdata = toks_test, type = "class")
    table(pred_cnn, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

