Tests using the Large Movie Review Dataset (Maas et al. 2011), a benchmark for binary sentiment classification containing 25,000 highly polar movie reviews for training and 25,000 for testing.

Source: Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. 2011. "Learning Word Vectors for Sentiment Analysis." In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
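
The code below relies on the quanteda, quanteda.textmodels, and quanteda.classifiers packages. A minimal installation sketch, assuming quanteda.classifiers is installed from its GitHub repository:

```{r, eval = FALSE}
# CRAN packages
install.packages(c("quanteda", "quanteda.textmodels"))
# quanteda.classifiers, assumed here to be installed from GitHub
remotes::install_github("quanteda/quanteda.classifiers")
```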

## Setting up {.tabset .tabset-fade}

```{r setup}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
library("quanteda")
library("quanteda.textmodels")  # textmodel_nb(), textmodel_svm(), textmodel_svmlin()
library("quanteda.classifiers") # data_corpus_LMRD, textmodel_mlp(), textmodel_cnnlstmemb()
```

### Test and training data

```{r}
# document-feature matrix, trimmed to features occurring at least 100 times
dfmat <- tokens(data_corpus_LMRD) %>%
    dfm() %>%
    dfm_trim(min_termfreq = 100)
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
```
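
As a quick sanity check (an addition, not part of the original benchmark), we can confirm that each split holds 25,000 reviews, balanced across the two polarity classes; `set` and `polarity` are docvars supplied with `data_corpus_LMRD`:

```{r}
# 25,000 documents per split, 12,500 per polarity class
dim(dfmat_train)
dim(dfmat_test)
table(dfmat_train$polarity)
table(dfmat_test$polarity)
```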

### Some functions for evaluation
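
For reference, the `performance()` function defined below takes a confusion matrix with rows giving predicted and columns giving actual classes, the positive class in cell (1, 1), and computes

$$\text{precision} = \frac{TP}{TP + FP}, \qquad \text{recall} = \frac{TP}{TP + FN}, \qquad \text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN},$$

with balanced accuracy defined as the mean of recall (the true positive rate) and the true negative rate:

$$\text{balanced accuracy} = \frac{1}{2} \left( \frac{TP}{TP + FN} + \frac{TN}{TN + FP} \right).$$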

```{r}
performance <- function(mytable, verbose = TRUE) {
  # assumes rows = predicted classes, columns = actual classes,
  # with the positive class in row 1, column 1
  truePositives <- mytable[1, 1]
  trueNegatives <- sum(diag(mytable)[-1])
  falsePositives <- sum(mytable[1, ]) - truePositives
  falseNegatives <- sum(mytable[, 1]) - truePositives
  precision <- truePositives / (truePositives + falsePositives)
  recall <- truePositives / (truePositives + falseNegatives)
  accuracy <- sum(diag(mytable)) / sum(mytable)
  tnr <- trueNegatives / (trueNegatives + falsePositives)
  # balanced accuracy: mean of recall (sensitivity) and the
  # true negative rate (specificity)
  balanced_accuracy <- mean(c(recall, tnr), na.rm = TRUE)
  if (verbose) {
    print(mytable)
    cat(
      "\n    precision =", round(precision, 2),
      "\n       recall =", round(recall, 2),
      "\n     accuracy =", round(accuracy, 2),
      "\n    bal. acc. =", round(balanced_accuracy, 2),
      "\n"
    )
  }
  invisible(c(precision = precision, recall = recall))
}
```
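
To see the function in action, here is a quick check on a small made-up confusion matrix (the numbers are invented, for illustration only):

```{r}
# rows = predicted class, columns = actual class; "pos" comes first
toy <- matrix(c(40, 5, 10, 45), nrow = 2,
              dimnames = list(predicted = c("pos", "neg"),
                              actual = c("pos", "neg")))
performance(toy)
```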

## Naive Bayes {.tabset .tabset-fade}

### No weights

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfmat_test, type = "class")
    # reorder so that the positive class lands in cell [1, 1]
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```
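
As a side note (not part of the original benchmark), the fitted Naive Bayes model can also return posterior class probabilities instead of hard class predictions, via the `type = "probability"` option of `predict.textmodel_nb()`:

```{r}
# posterior probabilities for each class, one row per test document
prob_nb <- predict(tmod_nb, newdata = dfmat_test, type = "probability")
head(prob_nb)
```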

### tf-idf weighting

```{r}
system.time({
    tmod_nb <- textmodel_nb(dfm_tfidf(dfmat_train),
                            y = docvars(dfmat_train, "polarity"))
    pred_nb <- predict(tmod_nb, newdata = dfm_tfidf(dfmat_test), type = "class")
    table(pred_nb, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## SVM {.tabset .tabset-fade}

### SVM

```{r}
system.time({
    tmod_svm <- textmodel_svm(dfmat_train, y = docvars(dfmat_train, "polarity"))
    pred_svm <- predict(tmod_svm, newdata = dfmat_test, type = "class")
    table(pred_svm, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

### SVM w/tf-idf weights

```{r}
# timed with system.time() for consistency with the other models
system.time({
    tmod_svm2 <- textmodel_svm(dfm_tfidf(dfmat_train),
                               y = docvars(dfmat_train, "polarity"))
    pred_svm2 <- predict(tmod_svm2, newdata = dfm_tfidf(dfmat_test), type = "class")
    table(pred_svm2, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

### SVMlin

```{r}
system.time({
    tmod_svmlin <- textmodel_svmlin(dfmat_train,
                                    y = docvars(dfmat_train, "polarity"))
    pred_svmlin <- predict(tmod_svmlin, newdata = dfmat_test, type = "class")
    table(pred_svmlin, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## Multilayer Perceptron Network (MLP)

```{r}
system.time({
    tmod_mlp <- textmodel_mlp(dfmat_train,
                              y = docvars(dfmat_train, "polarity"), epochs = 10)
    pred_mlp <- predict(tmod_mlp, newdata = dfmat_test, type = "class")
    table(pred_mlp, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

## CNN-embedding-LSTM model

```{r}
system.time({
    # tokenize the *training* split, dropping punctuation but keeping pads
    toks_train <- data_corpus_LMRD %>%
        corpus_subset(set == "train") %>%
        tokens() %>%
        tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE)
    tmod_cnn <- textmodel_cnnlstmemb(toks_train,
                                     y = docvars(dfmat_train, "polarity"),
                                     epochs = 10)

    # tokenize the *test* split the same way for prediction
    toks_test <- data_corpus_LMRD %>%
        corpus_subset(set == "test") %>%
        tokens() %>%
        tokens_remove("\\p{P}", valuetype = "regex", padding = TRUE)
    pred_cnn <- predict(tmod_cnn, newdata = toks_test, type = "class")
    table(pred_cnn, dfmat_test$polarity)[2:1, 2:1] %>% performance()
})
```

