# Global knitr chunk options for this vignette: merge source and output into
# one block, prefix printed output with "#>", and skip the vignette title check.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  rmarkdown.html_vignette.check_title = FALSE
)
# Packages used throughout this vignette, one attach call per line.
library(autotextclassifier)  # Auto text classifier
library(parallel)            # Parallel processing
library(doParallel)          # Parallel processing
library(here)                # Creating reproducible file paths
library(patchwork)           # Putting ggplots together
library(recipes)             # Preprocessing
library(zeallot)             # Multiple assignments
library(yardstick)           # Metrics
# Restore the objects stored in the bundled .rda file into the workspace
# (presumably `sample_data`, which the following steps use).
load(here("inst/extdata/sample_data.rda"))
Make sure that the outcome variable is a factor.
# Give the four columns the names used in the rest of the vignette.
names(sample_data) <- c("category", "org_name", "ein", "text")

# The outcome variable must be a factor for classification.
sample_data[["category"]] <- as.factor(sample_data[["category"]])
The `rec` object provides the following information. The function that creates it (`apply_basic_recipe`) also checks whether the `text` column has missing values or includes extremely short documents (fewer than five words).
There are two broad basic options for text preprocessing.
Without word embedding
Tokenization for text [trained]
Term frequency-inverse document frequency with text [trained]
With word embedding
Tokenization for text [trained]
# Without word embedding
rec <- apply_basic_recipe(
  sample_data,
  category ~ text,
  text
)

# With word embedding
rec_alt <- apply_basic_recipe(
  sample_data,
  category ~ text,
  text,
  add_embedding = TRUE
)
The `build_pipeline` function reduces the number of steps needed to build a classifier pipeline. The pipeline involves splitting the data; creating tuning parameters, search spaces, and workflows; generating 10-fold cross-validation samples; finding the best model for each algorithm; and fitting that best model to the data.
# Using parallel processing to speed up
all_cores <- parallel::detectCores(logical = FALSE)

# Leave one core free, but never request fewer than one worker:
# detectCores() may return NA or 1, and makeCluster() needs a count >= 1.
n_workers <- max(1L, all_cores[1] - 1L, na.rm = TRUE)

cl <- makeCluster(n_workers)
registerDoParallel(cl)

# Call parallel::stopCluster(cl) once model fitting has finished to release
# the worker processes.
# Fix the RNG seed so the run is reproducible.
set.seed(1234)

# build_pipeline() yields three fitted models, unpacked here with zeallot's
# %<-% operator. The first argument is the raw data set; the preprocessing
# recipe is supplied separately as the third argument.
# NOTE(review): the original passed `rec` in both positions — confirm the
# argument order against build_pipeline()'s signature.
c(lasso_fit, rand_fit, xg_fit) %<-% build_pipeline(
  sample_data,
  category,
  rec,
  prop_ratio = 0.8,
  metric_choice = "roc_auc"
)
# Based on the class-based metrics
# (the three panels are combined side by side with patchwork's `+`)
viz_class_fit(lasso_fit, "Lasso", test_x_class, test_y_class, "class") +
  viz_class_fit(rand_fit, "Random forest", test_x_class, test_y_class, "class") +
  viz_class_fit(xg_fit, "XGBoost", test_x_class, test_y_class, "class")

# Based on the probability-based metrics
viz_class_fit(lasso_fit, "Lasso", test_x_class, test_y_class, "probability") +
  viz_class_fit(rand_fit, "Random forest", test_x_class, test_y_class, "probability") +
  viz_class_fit(xg_fit, "XGBoost", test_x_class, test_y_class, "probability")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.