#' @title Does 'forensics' on single FastText model
#' @description Use to investigate where your choosen set of parameters for a fasttext model are lacking.
#' Provides:
#' \itemize{
##' \item{investigation to the magnitude of the predicted score for misclassfied texts.}
##' \item{comparison for the predicted label vs the actual label.}
##' \item{exposure for mislabelled training data.}
##' }
#'
#' @param k The number of k-folds for each combination set of parameters to test. Defaults to 5.
#' @param texts The texts given by the user to classify later.
#' @param texts_ids The text_ids in the text to output nice clean format.
#' @param labels The labels for the texts given by the user train the FastText model.
#' @param parameters A df that contains all the different combinations of paramters for a FastText model.
#' Must include the following:
#' \itemize{
##' \item{lr = learning rate}
##' \item{epoch = # of epochs}
##' \item{dim = dimensions }
##' \item{ws = window size}
##' \item{wordNgrams = word n-grams}
##' \item{minn = min of character n-grams}
##' \item{maxn = max of character n-grams}
##' }
#' @param seed A number for \code{set.seed} when partioning data for have model reproducability.
#' @return A dataframe with the average and SD accuracy for each row of \code{parameters}.
#' @export
#' @examples
#' fast.text.parameters <- expand.grid(
#' lr = seq(4, 4.3, 0.5),
#' epoch = seq(30, 33, 10),
#' dim = seq(100,120, 25),
#' ws = seq(4, 6, 2),
#' wordNgrams = 2,
#' minn = 2,
#' maxn = 6
#' )
#'
#' tune_fasttext(k = 5,
#' texts = df$mytext,
#' text_ids = df$text_id,
#' labels = df$topic,
#' parameters = fast.text.parameters,
#' seed = 123)
#'
forensic_fasttext = function(k, texts, text_ids, labels, parameters, seed){
# Notes -----------------------
# - Is not parallellized b/c it's only meant to run 1 single combination of parameters
# - need to create tests make sure that only 1 set of parameters are given by the users
# - Creates a temporary folder "tmp_fasttext" that holds your model and text objects
# only to be deleted at the end
# - fixed the 'loss' function and n_sample_negative = 5.
# - Can expand to include plotting loss rate, ROC curves, learning rates
# Argument Validation ----------------
Check <- ArgumentCheck::newArgCheck()
if(length(texts) == 0)
ArgumentCheck::addError(
msg = "'texts' is empty",
argcheck = Check
)
if(length(text_ids) == 0)
ArgumentCheck::addError(
msg = "'text_ids' is empty",
argcheck = Check
)
if(length(labels) == 0)
ArgumentCheck::addError(
msg = "'labels' is empty",
argcheck = Check
)
if(length(parameters) != 1)
ArgumentCheck::addError(
msg = "'parameters' has either no values or more than 1 value",
argcheck = Check
)
not_here = c("lr","epoch","dim","ws","wordNgrams","minn","maxn") %in% colnames(parameters)
if(FALSE %in% not_here)
ArgumentCheck::addError(
msg = "'parameters' is missing at least one of the following columns: lr, epoch, dim, ws, wordNgrams, minn, maxn",
argcheck = Check
)
ArgumentCheck::finishArgCheck(Check)
# Data Preparation ------------------
shuffleIndex <- seq_along(texts)
set.seed(seed)
shuffleIndex <- sample(shuffleIndex) #randomly shuffle data
texts2 = texts[shuffleIndex]
text_ids2 = text_ids[shuffleIndex]
labels2 = labels[shuffleIndex]
folds <- cut(seq(1,length(texts2)),breaks = k, labels = F)
myparameters = parameters
myparameters$accuracy = NA
myparameters$sd = NA
dir.create("tmp_fasttext") # creating temporary folder location
for (i in 1:nrow(parameters)){
repeated.accuracy = numeric(k)
repeated.incorrect = list()
repeated.byClass = list()
for (j in 1:k){
testIndex <- which(folds==j, arr.ind = T)
texts_test=texts2[testIndex]
texts_train=texts2[-testIndex]
labels_test=labels2[testIndex]
labels_train=labels2[-testIndex]
text_ids_Test = text_ids2[testIndex]
tmp_file_model <- "tmp_fasttext/fasttext"
train_labels <- paste0("__label__", labels_train)
train_to_write <- paste(train_labels, texts_train)
train_tmp_file_txt <- "tmp_fasttext/traintext"
writeLines(text = train_to_write, con = train_tmp_file_txt)
test_labels <- paste0("__label__", labels_test)
test_labels_without_prefix <- labels_test
test_texts <- tolower(texts_test)
test_to_write <- paste(test_labels, test_texts)
fastrtext::execute(commands = c("supervised",
"-input", train_tmp_file_txt,
"-output", tmp_file_model,
"-dim", parameters$dim[i], #number of dimensions
"-ws", parameters$ws[i], #window size
"-lr",parameters$lr[i], #learning rate
"-epoch", parameters$epoch[i],
"-wordNgrams", parameters$wordNgrams[i],
"-loss", "softmax",
"-minn", parameters$minn[i],
"-maxn", parameters$maxn[i],
"-verbose", 2))
model <- fastrtext::load_model(paste0(tmp_file_model,".bin")) # load model
#prediction are returned as a list with words and probabilities
predictions <- predict(model, sentences = test_to_write)
predicted = as.factor(names(unlist(predictions))) # Compute accuracy
actual = as.factor(labels_test)
repeated.accuracy[j] = mean(predicted == actual) #storing the overall accuracy in list
incorrect = data.frame(text = texts_test, label = labels_test, text_ids = text_ids_Test)
incorrect$predicted = predicted
incorrect$score = unlist(predictions)
incorrect = subset(incorrect,incorrect$label != incorrect$predicted)
repeated.incorrect[[j]] = incorrect
#computing the k-fold precision and recall for each topic
repeated.byClass[[j]] = caret::confusionMatrix(data = predicted, actual)$byClass
# clean up
unlink(train_tmp_file_txt)
unlink(tmp_file_model)
rm(model)
gc()
}
myparameters$accuracy[i] = mean(repeated.accuracy)
myparameters$sd[i] = sd(repeated.accuracy)
print(repeated.accuracy)
print(paste0("accuracy: ",myparameters$accuracy[i],
", sd:",myparameters$sd[i],
", lr:",myparameters$lr[i],
", ws:",myparameters$ws[i],
", dim:",myparameters$dim[i],
", epoch:",myparameters$epoch[i],
", wordNGrams:",myparameters$wordNgrams[i],
", minn:",myparameters$minn[i],
", maxn:",myparameters$maxn[i]
))
}
# Clean up
unlink("tmp_fasttext", recursive = T) # deleting 'tmp_fasttext' folder
big_data = do.call(rbind,repeated.incorrect)
if (length(table(labels)) == 2){
#for binary classification model
label.metrics = abind::abind(repeated.byClass, along = 2)
label.metrics2 = as.data.frame(rowMeans(as.matrix(label.metrics)))
colnames(label.metrics2) = c("K-fold Average")
}else{
#for 3 or more classification model
label.metrics = abind::abind(repeated.byClass, along = 3)
label.metrics2 = as.data.frame(apply(label.metrics, 1:2, mean))
}
print("All Done.")
output = list()
big_data = dplyr::select(big_data,text_ids,text,label,predicted,score) #reorder columns
output$incorrect = big_data
output$topic.metrics = label.metrics2
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.