#' @title 'Tune' FastText Models
#' @description Tests several combinations of parameters for FastText Models. Cross validates
#' each set of parameters k times to have robust model evaluations. Can be paralellized to
#' improve speed and allow for large grid search paramters.
#'
#' @param k The number of k-folds for each combination set of parameters to test. Defaults to 5.
#' @param texts The texts given by the user to classify.
#' @param text_ids The text_ids in the text to output nice clean format.
#' @param labels The labels for the texts given by the user train the FastText model.
#' @param parameters A df that contains all the different combinations of paramters for a FastText model.
#' Must include the following:
#' \itemize{
##' \item{lr = learning rate}
##' \item{epoch = # of epochs}
##' \item{dim = dimensions }
##' \item{ws = window size}
##' \item{wordNgrams = word n-grams}
##' \item{minn = min of character n-grams}
##' \item{maxn = max of character n-grams}
##' }
#' @param parallel Defaults to TRUE. Determines whether you want to parallize the analysis.
#' @param seed A number for \code{set.seed} when partioning data for have model reproducability.
#' @return A dataframe with the average accuracy and SD for each row of \code{parameters}.
#' @export
#' @examples
#' fast.text.parameters <- expand.grid(
#' lr = seq(4, 4.3, 0.5),
#' epoch = seq(30, 33, 10),
#' dim = seq(100,120, 25),
#' ws = seq(4, 6, 2),
#' wordNgrams = 2,
#' minn = 2,
#' maxn = 6
#' )
#'
#' tune_fasttext(k = 5,
#' texts = df$mytext,
#' text_ids = df$text_id,
#' labels = df$topic,
#' parameters = fast.text.parameters,
#' seed = 123,
#' parallel = T)
#'
tune_fasttext <- function(k = 5, texts, text_ids, labels, parameters, seed = 123, parallel = TRUE){
# Notes -----------------------
# - Creates a temporary folder "tmp_fasttext" that holds your model and text objects
# only to be deleted at the end
# - fixed the 'loss' function and n_sample_negative = 5.
# - can expand to other metrics for model evaluation like precision, recall, etc.
#
# Argument Validation ----------------
Check <- ArgumentCheck::newArgCheck()
if(length(texts) == 0)
ArgumentCheck::addError(
msg = "'texts' is empty",
argcheck = Check
)
if(length(text_ids) == 0)
ArgumentCheck::addError(
msg = "'text_ids' is empty",
argcheck = Check
)
if(length(labels) == 0)
ArgumentCheck::addError(
msg = "'labels' is empty",
argcheck = Check
)
if(length(parameters) == 0)
ArgumentCheck::addError(
msg = "'parameters' has no values",
argcheck = Check
)
not_here = c("lr","epoch","dim","ws","wordNgrams","minn","maxn") %in% colnames(parameters)
if(FALSE %in% not_here)
ArgumentCheck::addError(
msg = "'parameters' is missing at least one of the following columns: lr, epoch, dim, ws, wordNgrams, minn, maxn",
argcheck = Check
)
ArgumentCheck::finishArgCheck(Check)
# Data Preparation ---------------------------
shuffleIndex <- seq_along(texts)
set.seed(seed)
shuffleIndex <- sample(shuffleIndex) #randomly shuffle data
texts2 <- texts[shuffleIndex]
labels2 <- labels[shuffleIndex]
folds <- cut(seq(1,length(texts2)),breaks = k, labels = F)
myparameters <- parameters
myparameters$accuracy <- NA
myparameters$sd <- NA
dir.create("tmp_fasttext") # creating temporary folder location
print("...........running fasttext k times")
# Parallel ---------------------------
if(parallel == T){
print("......running in parallel")
for (i in 1:nrow(parameters)){
future::plan(future::multisession) ## => parallelize on your local computer
k_accuracies = future.apply::future_lapply(
1:k,
function(x){
testIndex <- which(folds==x, arr.ind = T)
texts_test <- texts2[testIndex]
texts_train <- texts2[-testIndex]
labels_test <- labels2[testIndex]
labels_train <- labels2[-testIndex]
tmp_file_model <- "tmp_fasttext/fasttext"
train_labels <- paste0("__label__", labels_train)
train_to_write <- paste(train_labels, texts_train)
train_tmp_file_txt <- "tmp_fasttext/traintext"
writeLines(text = train_to_write, con = train_tmp_file_txt)
test_labels <- paste0("__label__", labels_test)
test_labels_without_prefix <- labels_test
test_texts <- tolower(texts_test)
test_to_write <- paste(test_labels, test_texts)
fastrtext::execute(commands = c("supervised",
"-input", train_tmp_file_txt,
"-output", tmp_file_model,
"-dim", parameters$dim[i], #number of dimensions
"-ws", parameters$ws[i], #window size
"-lr",parameters$lr[i], #learning rate
"-epoch", parameters$epoch[i],
"-wordNgrams", parameters$wordNgrams[i],
"-loss", "softmax",
"-minn", parameters$minn[i],
"-maxn", parameters$maxn[i],
"-verbose", 2))
# load model
model <- fastrtext::load_model(paste0(tmp_file_model,".bin"))
#prediction are returned as a list with words and probabilities
predictions <- predict(model, sentences = test_to_write)
# Compute accuracy
predicted <- names(unlist(predictions))
actual <- labels_test
# clean up
unlink(train_tmp_file_txt)
unlink(tmp_file_model)
rm(model)
gc()
#computing the accuracy
accuracy1 <- mean(predicted == actual)
accuracy1
})
print(unlist(k_accuracies))
myparameters$accuracy[i] <- mean(unlist(k_accuracies))
myparameters$sd[i] <- sd(unlist(k_accuracies))
print(paste0("accuracy: ",myparameters$accuracy[i],
", sd:",myparameters$sd[i],
", lr:",myparameters$lr[i],
", ws:",myparameters$ws[i],
", dim:",myparameters$dim[i],
", epoch:",myparameters$epoch[i],
", wordNGrams:",myparameters$wordNgrams[i],
", minn:",myparameters$minn[i],
", maxn:",myparameters$maxn[i]
))
}
}
# NOT Parallel ---------------------------
else{
print("......NOT running in parallel")
for (i in 1:nrow(parameters)){
k_accuracies = lapply(1:k,
function(x){
testIndex <- which(folds==x, arr.ind = T)
texts_test <- texts2[testIndex]
texts_train <- texts2[-testIndex]
labels_test <- labels2[testIndex]
labels_train <- labels2[-testIndex]
tmp_file_model <- "tmp_fasttext/fasttext"
train_labels <- paste0("__label__", labels_train)
train_to_write <- paste(train_labels, texts_train)
train_tmp_file_txt <- "tmp_fasttext/traintext"
writeLines(text = train_to_write, con = train_tmp_file_txt)
test_labels <- paste0("__label__", labels_test)
test_labels_without_prefix <- labels_test
test_texts <- tolower(texts_test)
test_to_write <- paste(test_labels, test_texts)
fastrtext::execute(commands = c("supervised",
"-input", train_tmp_file_txt,
"-output", tmp_file_model,
"-dim", parameters$dim[i], #number of dimensions
"-ws", parameters$ws[i], #window size
"-lr",parameters$lr[i], #learning rate
"-epoch", parameters$epoch[i],
"-wordNgrams", parameters$wordNgrams[i],
"-loss", "softmax",
"-minn", parameters$minn[i],
"-maxn", parameters$maxn[i],
"-verbose", 0))
# load model
model <- fastrtext::load_model(paste0(tmp_file_model,".bin"))
#prediction are returned as a list with words and probabilities
predictions <- predict(model, sentences = test_to_write)
# Compute accuracy
predicted <- names(unlist(predictions))
actual <- labels_test
# clean up
unlink(train_tmp_file_txt)
unlink(tmp_file_model)
rm(model)
gc()
#computing the accuracy
accuracy1 <- mean(predicted == actual)
accuracy1
})
print(unlist(k_accuracies))
myparameters$accuracy[i] <- mean(unlist(k_accuracies))
myparameters$sd[i] <- sd(unlist(k_accuracies))
print(paste0("accuracy: ",myparameters$accuracy[i],
", sd:",myparameters$sd[i],
", lr:",myparameters$lr[i],
", ws:",myparameters$ws[i],
", dim:",myparameters$dim[i],
", epoch:",myparameters$epoch[i],
", wordNGrams:",myparameters$wordNgrams[i],
", minn:",myparameters$minn[i],
", maxn:",myparameters$maxn[i]
))}
}
# Output ---------------------------
# Clean up
unlink("tmp_fasttext", recursive = T) # deleting 'tmp_fasttext' folder
print("All Done.")
return(myparameters)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.