textwhiz: Support tools for Text Analytics

Documented in forensic_fasttext

#' @title  Does 'forensics' on single FastText model
#' @description Use to investigate where your choosen set of parameters for a fasttext model are lacking.
#' Provides:
#' \itemize{
##'  \item{investigation to the magnitude of the predicted score for misclassfied texts.}
##'  \item{comparison for the predicted label vs the actual label.}
##'  \item{exposure for mislabelled training data.}
##'  }
#'
#' @param k The number of k-folds for each combination set of parameters to test. Defaults to 5.
#' @param texts The texts given by the user to classify later.
#' @param texts_ids The text_ids in the text to output nice clean format.
#' @param labels The labels for the texts given by the user train the FastText model.
#' @param parameters A df that contains all the different combinations of paramters for a FastText model.
#' Must include the following:
#' \itemize{
##'  \item{lr = learning rate}
##'  \item{epoch = # of epochs}
##'  \item{dim = dimensions }
##'  \item{ws = window size}
##'  \item{wordNgrams = word n-grams}
##'  \item{minn = min of character n-grams}
##'  \item{maxn = max of character n-grams}
##'  }
#' @param seed A number for \code{set.seed} when partioning data for have model reproducability.
#' @return A dataframe with the average and SD accuracy for each row of \code{parameters}.
#' @export
#' @examples
#' fast.text.parameters <- expand.grid(
#'   lr = seq(4, 4.3, 0.5),
#'   epoch = seq(30, 33, 10),
#'   dim = seq(100,120, 25),
#'   ws = seq(4, 6, 2),
#'   wordNgrams = 2,
#'   minn = 2,
#'   maxn = 6
#'   )
#'
#' tune_fasttext(k = 5,
#'               texts = df$mytext,
#'               text_ids = df$text_id,
#'               labels = df$topic,
#'               parameters = fast.text.parameters,
#'               seed = 123)
#'

forensic_fasttext = function(k, texts, text_ids, labels, parameters, seed){
  # Notes -----------------------
  #   - Is not parallellized b/c it's only meant to run 1 single combination of parameters
  #       - need to create tests make sure that only 1 set of parameters are given by the users
  #   - Creates a temporary folder "tmp_fasttext" that holds your model and text objects
  #   only to be deleted at the end
  #   - fixed the 'loss' function and n_sample_negative = 5.
  #   - Can expand to include plotting loss rate, ROC curves, learning rates

  # Argument Validation ----------------
  Check <- ArgumentCheck::newArgCheck()

  if(length(texts) == 0)
    ArgumentCheck::addError(
      msg = "'texts' is empty",
      argcheck = Check
    )

  if(length(text_ids) == 0)
    ArgumentCheck::addError(
      msg = "'text_ids' is empty",
      argcheck = Check
    )

  if(length(labels) == 0)
    ArgumentCheck::addError(
      msg = "'labels' is empty",
      argcheck = Check
    )

  if(length(parameters) != 1)
    ArgumentCheck::addError(
      msg = "'parameters' has either no values or more than 1 value",
      argcheck = Check
    )

  not_here = c("lr","epoch","dim","ws","wordNgrams","minn","maxn") %in% colnames(parameters)
  if(FALSE %in% not_here)
    ArgumentCheck::addError(
      msg = "'parameters' is missing at least one of the following columns: lr, epoch, dim, ws, wordNgrams, minn, maxn",
      argcheck = Check
    )

  ArgumentCheck::finishArgCheck(Check)

  # Data Preparation ------------------
  shuffleIndex <- seq_along(texts)

  set.seed(seed)
  shuffleIndex <- sample(shuffleIndex) #randomly shuffle data

  texts2 = texts[shuffleIndex]
  text_ids2 = text_ids[shuffleIndex]
  labels2 = labels[shuffleIndex]

  folds <- cut(seq(1,length(texts2)),breaks = k, labels = F)

  myparameters = parameters
  myparameters$accuracy = NA
  myparameters$sd = NA

  dir.create("tmp_fasttext") # creating temporary folder location

  for (i in 1:nrow(parameters)){

    repeated.accuracy = numeric(k)
    repeated.incorrect = list()
    repeated.byClass = list()

    for (j in 1:k){
      testIndex <- which(folds==j, arr.ind = T)

      texts_test=texts2[testIndex]
      texts_train=texts2[-testIndex]

      labels_test=labels2[testIndex]
      labels_train=labels2[-testIndex]

      text_ids_Test = text_ids2[testIndex]

      tmp_file_model <- "tmp_fasttext/fasttext"

      train_labels <- paste0("__label__", labels_train)
      train_to_write <- paste(train_labels, texts_train)
      train_tmp_file_txt <- "tmp_fasttext/traintext"

      writeLines(text = train_to_write, con = train_tmp_file_txt)

      test_labels <- paste0("__label__", labels_test)
      test_labels_without_prefix <- labels_test
      test_texts <- tolower(texts_test)
      test_to_write <- paste(test_labels, test_texts)

      fastrtext::execute(commands = c("supervised",
                           "-input", train_tmp_file_txt,
                           "-output", tmp_file_model,
                           "-dim", parameters$dim[i], #number of dimensions
                           "-ws", parameters$ws[i], #window size
                           "-lr",parameters$lr[i], #learning rate
                           "-epoch", parameters$epoch[i],
                           "-wordNgrams", parameters$wordNgrams[i],
                           "-loss", "softmax",
                           "-minn", parameters$minn[i],
                           "-maxn", parameters$maxn[i],
                           "-verbose", 2))

      model <- fastrtext::load_model(paste0(tmp_file_model,".bin")) # load model

      #prediction are returned as a list with words and probabilities
      predictions <- predict(model, sentences = test_to_write)

      predicted = as.factor(names(unlist(predictions))) # Compute accuracy
      actual = as.factor(labels_test)

      repeated.accuracy[j] = mean(predicted == actual) #storing the overall accuracy in list

      incorrect = data.frame(text = texts_test, label = labels_test, text_ids = text_ids_Test)

      incorrect$predicted = predicted
      incorrect$score = unlist(predictions)

      incorrect = subset(incorrect,incorrect$label != incorrect$predicted)

      repeated.incorrect[[j]] = incorrect

      #computing the k-fold precision and recall for each topic
      repeated.byClass[[j]] = caret::confusionMatrix(data = predicted, actual)$byClass
      # clean up
      unlink(train_tmp_file_txt)
      unlink(tmp_file_model)
      rm(model)
      gc()
    }

    myparameters$accuracy[i] = mean(repeated.accuracy)
    myparameters$sd[i] = sd(repeated.accuracy)

    print(repeated.accuracy)
    print(paste0("accuracy: ",myparameters$accuracy[i],
                 ", sd:",myparameters$sd[i],
                 ", lr:",myparameters$lr[i],
                 ", ws:",myparameters$ws[i],
                 ", dim:",myparameters$dim[i],
                 ", epoch:",myparameters$epoch[i],
                 ", wordNGrams:",myparameters$wordNgrams[i],
                 ", minn:",myparameters$minn[i],
                 ", maxn:",myparameters$maxn[i]
    ))
  }

  # Clean up
  unlink("tmp_fasttext", recursive = T) # deleting 'tmp_fasttext' folder

  big_data = do.call(rbind,repeated.incorrect)

  if (length(table(labels)) == 2){
    #for binary classification model
    label.metrics = abind::abind(repeated.byClass, along = 2)
    label.metrics2 = as.data.frame(rowMeans(as.matrix(label.metrics)))

    colnames(label.metrics2) = c("K-fold Average")
  }else{
    #for 3 or more classification model
    label.metrics = abind::abind(repeated.byClass, along = 3)
    label.metrics2 = as.data.frame(apply(label.metrics, 1:2, mean))
  }

  print("All Done.")
  output = list()

  big_data = dplyr::select(big_data,text_ids,text,label,predicted,score) #reorder columns

  output$incorrect = big_data
  output$topic.metrics = label.metrics2

  return(output)
}