textwhiz: Support tools for Text Analytics

Documented in tune_fasttext

#' @title 'Tune' FastText Models
#' @description Tests several combinations of parameters for FastText Models. Cross validates
#' each set of parameters k times to have robust model evaluations. Can be paralellized to
#' improve speed and allow for large grid search paramters.
#'
#' @param k The number of k-folds for each combination set of parameters to test. Defaults to 5.
#' @param texts The texts given by the user to classify.
#' @param text_ids The text_ids in the text to output nice clean format.
#' @param labels The labels for the texts given by the user train the FastText model.
#' @param parameters A df that contains all the different combinations of paramters for a FastText model.
#' Must include the following:
#' \itemize{
##'  \item{lr = learning rate}
##'  \item{epoch = # of epochs}
##'  \item{dim = dimensions }
##'  \item{ws = window size}
##'  \item{wordNgrams = word n-grams}
##'  \item{minn = min of character n-grams}
##'  \item{maxn = max of character n-grams}
##'  }
#' @param parallel Defaults to TRUE. Determines whether you want to parallize the analysis.
#' @param seed A number for \code{set.seed} when partioning data for have model reproducability.
#' @return A dataframe with the average accuracy and SD for each row of \code{parameters}.
#' @export
#' @examples
#'   fast.text.parameters <- expand.grid(
#'   lr = seq(4, 4.3, 0.5),
#'   epoch = seq(30, 33, 10),
#'   dim = seq(100,120, 25),
#'   ws = seq(4, 6, 2),
#'   wordNgrams = 2,
#'   minn = 2,
#'   maxn = 6
#'   )
#'
#' tune_fasttext(k = 5,
#'               texts = df$mytext,
#'               text_ids = df$text_id,
#'               labels = df$topic,
#'               parameters = fast.text.parameters,
#'               seed = 123,
#'               parallel = T)
#'

tune_fasttext <- function(k = 5, texts, text_ids, labels, parameters, seed = 123, parallel = TRUE){
  # Notes -----------------------
  #   - Creates a temporary folder "tmp_fasttext" that holds your model and text objects
  #   only to be deleted at the end
  #   - fixed the 'loss' function and n_sample_negative = 5.
  #   - can expand to other metrics for model evaluation like precision, recall, etc.
  #

  # Argument Validation ----------------
  Check <- ArgumentCheck::newArgCheck()

  if(length(texts) == 0)
    ArgumentCheck::addError(
      msg = "'texts' is empty",
      argcheck = Check
    )

  if(length(text_ids) == 0)
    ArgumentCheck::addError(
      msg = "'text_ids' is empty",
      argcheck = Check
    )

  if(length(labels) == 0)
    ArgumentCheck::addError(
      msg = "'labels' is empty",
      argcheck = Check
    )

  if(length(parameters) == 0)
    ArgumentCheck::addError(
      msg = "'parameters' has no values",
      argcheck = Check
    )

  not_here = c("lr","epoch","dim","ws","wordNgrams","minn","maxn") %in% colnames(parameters)
  if(FALSE %in% not_here)
    ArgumentCheck::addError(
      msg = "'parameters' is missing at least one of the following columns: lr, epoch, dim, ws, wordNgrams, minn, maxn",
      argcheck = Check
    )

  ArgumentCheck::finishArgCheck(Check)

  # Data Preparation ---------------------------

  shuffleIndex <- seq_along(texts)

  set.seed(seed)
  shuffleIndex <- sample(shuffleIndex) #randomly shuffle data

  texts2 <- texts[shuffleIndex]
  labels2 <- labels[shuffleIndex]

  folds <- cut(seq(1,length(texts2)),breaks = k, labels = F)

  myparameters <- parameters
  myparameters$accuracy <- NA
  myparameters$sd <- NA

  dir.create("tmp_fasttext") # creating temporary folder location

  print("...........running fasttext k times")
  # Parallel ---------------------------
  if(parallel == T){
    print("......running in parallel")
    for (i in 1:nrow(parameters)){

      future::plan(future::multisession) ## => parallelize on your local computer
      k_accuracies = future.apply::future_lapply(
        1:k,
        function(x){
          testIndex <- which(folds==x, arr.ind = T)

          texts_test <- texts2[testIndex]
          texts_train <- texts2[-testIndex]

          labels_test <- labels2[testIndex]
          labels_train <- labels2[-testIndex]

          tmp_file_model <- "tmp_fasttext/fasttext"

          train_labels <- paste0("__label__", labels_train)
          train_to_write <- paste(train_labels, texts_train)
          train_tmp_file_txt <- "tmp_fasttext/traintext"

          writeLines(text = train_to_write, con = train_tmp_file_txt)

          test_labels <- paste0("__label__", labels_test)
          test_labels_without_prefix <- labels_test
          test_texts <- tolower(texts_test)
          test_to_write <- paste(test_labels, test_texts)

          fastrtext::execute(commands = c("supervised",
                                          "-input", train_tmp_file_txt,
                                          "-output", tmp_file_model,
                                          "-dim", parameters$dim[i], #number of dimensions
                                          "-ws", parameters$ws[i], #window size
                                          "-lr",parameters$lr[i], #learning rate
                                          "-epoch", parameters$epoch[i],
                                          "-wordNgrams", parameters$wordNgrams[i],
                                          "-loss", "softmax",
                                          "-minn", parameters$minn[i],
                                          "-maxn", parameters$maxn[i],
                                          "-verbose", 2))

          # load model
          model <- fastrtext::load_model(paste0(tmp_file_model,".bin"))

          #prediction are returned as a list with words and probabilities
          predictions <- predict(model, sentences = test_to_write)

          # Compute accuracy
          predicted <- names(unlist(predictions))
          actual <- labels_test

          # clean up
          unlink(train_tmp_file_txt)
          unlink(tmp_file_model)
          rm(model)
          gc()

          #computing the accuracy
          accuracy1 <- mean(predicted == actual)
          accuracy1
        })

      print(unlist(k_accuracies))
      myparameters$accuracy[i] <- mean(unlist(k_accuracies))
      myparameters$sd[i] <- sd(unlist(k_accuracies))

      print(paste0("accuracy: ",myparameters$accuracy[i],
                   ", sd:",myparameters$sd[i],
                   ", lr:",myparameters$lr[i],
                   ", ws:",myparameters$ws[i],
                   ", dim:",myparameters$dim[i],
                   ", epoch:",myparameters$epoch[i],
                   ", wordNGrams:",myparameters$wordNgrams[i],
                   ", minn:",myparameters$minn[i],
                   ", maxn:",myparameters$maxn[i]
                   ))
    }
  }
  # NOT Parallel ---------------------------
  else{
    print("......NOT running in parallel")
    for (i in 1:nrow(parameters)){
      k_accuracies = lapply(1:k,
                            function(x){
                             testIndex <- which(folds==x, arr.ind = T)

                             texts_test <- texts2[testIndex]
                             texts_train <- texts2[-testIndex]

                             labels_test <- labels2[testIndex]
                             labels_train <- labels2[-testIndex]

                             tmp_file_model <- "tmp_fasttext/fasttext"

                             train_labels <- paste0("__label__", labels_train)
                             train_to_write <- paste(train_labels, texts_train)
                             train_tmp_file_txt <- "tmp_fasttext/traintext"

                             writeLines(text = train_to_write, con = train_tmp_file_txt)

                             test_labels <- paste0("__label__", labels_test)
                             test_labels_without_prefix <- labels_test
                             test_texts <- tolower(texts_test)
                             test_to_write <- paste(test_labels, test_texts)

                             fastrtext::execute(commands = c("supervised",
                                                             "-input", train_tmp_file_txt,
                                                             "-output", tmp_file_model,
                                                             "-dim", parameters$dim[i], #number of dimensions
                                                             "-ws", parameters$ws[i], #window size
                                                             "-lr",parameters$lr[i], #learning rate
                                                             "-epoch", parameters$epoch[i],
                                                             "-wordNgrams", parameters$wordNgrams[i],
                                                             "-loss", "softmax",
                                                             "-minn", parameters$minn[i],
                                                             "-maxn", parameters$maxn[i],
                                                             "-verbose", 0))

                             # load model
                             model <- fastrtext::load_model(paste0(tmp_file_model,".bin"))

                             #prediction are returned as a list with words and probabilities
                             predictions <- predict(model, sentences = test_to_write)

                             # Compute accuracy
                             predicted <- names(unlist(predictions))
                             actual <- labels_test

                             # clean up
                             unlink(train_tmp_file_txt)
                             unlink(tmp_file_model)
                             rm(model)
                             gc()

                             #computing the accuracy
                             accuracy1 <- mean(predicted == actual)
                             accuracy1
                                         })

      print(unlist(k_accuracies))
      myparameters$accuracy[i] <- mean(unlist(k_accuracies))
      myparameters$sd[i] <- sd(unlist(k_accuracies))

      print(paste0("accuracy: ",myparameters$accuracy[i],
                   ", sd:",myparameters$sd[i],
                   ", lr:",myparameters$lr[i],
                   ", ws:",myparameters$ws[i],
                   ", dim:",myparameters$dim[i],
                   ", epoch:",myparameters$epoch[i],
                   ", wordNGrams:",myparameters$wordNgrams[i],
                   ", minn:",myparameters$minn[i],
                   ", maxn:",myparameters$maxn[i]
      ))}
    }
  # Output ---------------------------

  # Clean up
  unlink("tmp_fasttext", recursive = T) # deleting 'tmp_fasttext' folder

  print("All Done.")
  return(myparameters)
}