#' The standard cross-validator (CV)
#'
#' This object runs a decoding analysis where a classifier is repeatedly trained
#' and tested using cross-validation.
#'
#' @details A cross-validator object takes a datasource (DS), a classifier (CL),
#' feature preprocessors (FP) and result metric (RM) objects, and runs
#' multiple cross-validation cycles where:
#'
#' 1. A datasource (DS) generates training and test data splits of the data
#' 2. Feature preprocessors (FPs) do preprocessing of the data
#' 3. A classifier (CL) is trained and predictions are generated on a test set
#' 4. Result metrics (RMs) assess the accuracy of the predictions and compile
#' the results.
#'
#' @param datasource A datasource (DS) object that will generate the training
#' and test data.
#'
#' @param classifier A classifier (CS) object that will learn parameters based
#' on the training data and will generate predictions based on the test data.
#'
#' @param feature_preprocessors A list of feature preprocessor (FP) objects that
#' learn preprocessing parameters from the training data and apply
#' preprocessing of both the training and test data based on these parameters.
#'
#' @param result_metrics A list of result metric (RM) objects that are used to
#' evaluate the classification performance. If this is set to null then the
#' rm_main_results(), rm_confusion_matrix() results metrics will be used.
#'
#' @param num_resample_runs The number of times the cross-validation should be
#' run (i.e., "resample runs"), where on each run, new training and test sets
#' are generated. If pseudo-populations are used (e.g., with the ds_basic),
#' then new pseudo-populations will be generated on each resample run as well.
#'
#' @param test_only_at_training_time A boolean indicated whether the classifier
#' should only be trained and tested at the same time point (i.e., if TRUE a
#' temporal cross-decoding analysis will NOT be run). Setting this to true can
#' potentially speed up the analysis and save memory at the cost of not
#' calculated the temporal cross decoding results.
#'
#' @param run_parallel A boolean to indicate whether the code should be run in
#' parallel. It is useful to set this to FALSE for debugging purposes or
#' if you are running a job that takes a lot of memory and runtime is not
#' of much concern.
#'
#' @examples
#' data_file <- system.file("extdata/ZD_150bins_50sampled.Rda",
#' package = "NDTr"
#' )
#'
#' ds <- ds_basic(data_file, "stimulus_ID", 18)
#' fps <- list(fp_zscore())
#' cl <- cl_max_correlation()
#'
#' cv <- cv_standard(ds, cl, fps)
#'
#'
#' @family cross-validator
#'
#'
#'
# the constructor
#' @export
cv_standard <- function(datasource,
classifier,
feature_preprocessors,
result_metrics = NULL,
num_resample_runs = 50,
test_only_at_training_time = FALSE,
run_parallel = TRUE) {
if (is.null(result_metrics)) {
result_metrics <- list(
rm_main_results(),
rm_confusion_matrix())
}
analysis_ID <- generate_analysis_ID()
the_cv <- list(
analysis_ID = analysis_ID,
datasource = datasource,
classifier = classifier,
feature_preprocessors = feature_preprocessors,
num_resample_runs = num_resample_runs,
result_metrics = result_metrics,
test_only_at_training_time = test_only_at_training_time,
run_parallel = run_parallel)
attr(the_cv, "class") <- "cv_standard"
the_cv
}
#' @export
run_decoding.cv_standard <- function(cv_obj) {
analysis_start_time <- Sys.time()
# copy over the main objects
datasource <- cv_obj$datasource
classifier <- cv_obj$classifier
feature_preprocessors <- cv_obj$feature_preprocessors
num_resample_runs <- cv_obj$num_resample_runs
result_metrics <- cv_obj$result_metrics
test_only_at_training_time <- cv_obj$test_only_at_training_time
run_parallel <- cv_obj$run_parallel
if (run_parallel) {
# register parallel resources
cores <- parallel::detectCores()
the_cluster <- parallel::makeCluster(cores, type = "SOCK")
doSNOW::registerDoSNOW(the_cluster)
"%do_type%" <- get("%dopar%")
} else {
"%do_type%" <- get("%do%")
}
# Do a parallel loop over resample runs
all_resample_run_decoding_results <- foreach(iResample = 1:num_resample_runs) %do_type% {
# get the data from the current cross-validation run
cv_data <- get_data(datasource)
unique_times <- unique(cv_data$time_bin)
num_time_bins <- length(unique_times)
all_cv_train_test_inds <- select(cv_data, starts_with("CV"))
num_cv <- ncol(all_cv_train_test_inds)
# resample_run_decoding_results is the name of the decoding results inside
# the dopar loop outside the loop, when all the results have really been
# combined into a list, this is called all_resample_run_decoding_results
resample_run_decoding_results <- NULL
all_cv_results <- NULL
for (iCV in 1:num_cv) {
all_time_results <- NULL
# when the code is not run in parallel, the CV number will be printed
tictoc::tic()
message(paste0("CV: ", iCV))
for (iTrain in 1:num_time_bins) {
training_set <- dplyr::filter(
cv_data, .data$time_bin == unique_times[iTrain],
all_cv_train_test_inds[iCV] == "train") %>%
dplyr::select(starts_with("site"), .data$train_labels)
test_set <- dplyr::filter(cv_data, all_cv_train_test_inds[iCV] == "test") %>%
dplyr::select(starts_with("site"), .data$test_labels, .data$time_bin)
if (test_only_at_training_time) {
test_set <- dplyr::filter(test_set, .data$time_bin == unique_times[iTrain])
}
# if feature-processors have been specified, do feature processing...
if (length(feature_preprocessors) >= 1) {
for (iFP in seq_along(feature_preprocessors)) {
processed_data <- preprocess_data(feature_preprocessors[[iFP]], training_set, test_set)
training_set <- processed_data$training_set
test_set <- processed_data$test_set
}
}
# get predictions from the classifier (along with the correct labels)
curr_cv_prediction_results <- get_predictions(classifier, training_set, test_set)
# add the current CV run number, train time to the results data frame
curr_cv_prediction_results <- curr_cv_prediction_results %>%
dplyr::mutate(CV = iCV, train_time = unique_times[iTrain]) %>%
select(.data$CV, .data$train_time, everything())
# all_cv_results <- rbind(all_cv_results, curr_cv_prediction_results)
all_time_results[[iTrain]] <- curr_cv_prediction_results # should be faster b/c don't need to reallocate memory
} # end the for loop over time bins
tictoc::toc()
# Aggregate results over all CV split runs
all_cv_results[[iCV]] <- dplyr::bind_rows(all_time_results)
} # end the for loop over CV splits
# convert the results from each CV split from a list into a data frame
all_cv_results <- dplyr::bind_rows(all_cv_results)
# go through each Result Metric and aggregate the results from all CV splits using each metric
for (iMetric in seq_along(result_metrics)) {
curr_metric_results <- aggregate_CV_split_results(result_metrics[[iMetric]], all_cv_results)
resample_run_decoding_results[[iMetric]] <- curr_metric_results ### DECODING_RESULTS
}
return(resample_run_decoding_results)
} # end loop over resample runs
# aggregate results over all resample runs ---------------------------------
# close parallel resources
if (run_parallel) {
parallel::stopCluster(the_cluster)
}
# go through each Result Metric and aggregate the final results from all resample runs using each metric
DECODING_RESULTS <- NULL
result_metric_names <- NULL
grouped_results <- purrr::transpose(all_resample_run_decoding_results)
for (iMetric in seq_along(result_metrics)) {
# bind the list of all the resample result RM objects together and preserve the RM's options attribute
curr_options <- attributes(grouped_results[[iMetric]][[1]])$options
curr_resample_run_results <- dplyr::bind_rows(grouped_results[[iMetric]], .id = "resample_run")
attr(curr_resample_run_results, "options") <- curr_options
DECODING_RESULTS[[iMetric]] <- aggregate_resample_run_results(curr_resample_run_results)
result_metric_names[iMetric] <- class(DECODING_RESULTS[[iMetric]])[1]
}
# add names to the final results list so easy to extract elements
names(DECODING_RESULTS) <- result_metric_names
# save the decoding parameters to make results reproducible -----------------
# set to null to save memory, can recreate the datasource by reloading the
# data in the binned_file_name field
cv_obj$datasource$binned_data <- NULL
cv_obj$parameter_df <- get_parameters(cv_obj)
analysis_end_time <- Sys.time()
# could save these in the cv_obj directly rather than in the cv_obj$parameters_df
cv_obj$parameter_df$analysis_start_time <- analysis_start_time
cv_obj$parameter_df$analysis_end_time <- analysis_end_time
# saves all the CV parameters (DS, CL FPs etc)
DECODING_RESULTS$cross_validation_paramaters <- cv_obj
return(DECODING_RESULTS)
} # end the run_decoding method
# get parameters from all objects and save the in a data frame so that
# which will be useful to tell if an analysis has already been run
get_parameters.cv_standard <- function(ndtr_obj) {
# start by getting the parameters from the datasource
parameter_df <- get_parameters(ndtr_obj$datasource)
# add the parameters from the classifier
parameter_df <- cbind(parameter_df, get_parameters(ndtr_obj$classifier))
# if feature-processors have been specified, add their parameters to the data frame
if (length(ndtr_obj$feature_preprocessors) >= 1) {
for (iFP in seq_along(ndtr_obj$feature_preprocessors)) {
curr_FP_parameters <- get_parameters(ndtr_obj$feature_preprocessors[[iFP]])
parameter_df <- cbind(parameter_df, curr_FP_parameters)
}
}
# go through each result metric and get their parameters
for (iMetric in seq_along(ndtr_obj$result_metrics)) {
curr_metric_parameters <- get_parameters(ndtr_obj$result_metrics[[iMetric]])
parameter_df <- cbind(parameter_df, curr_metric_parameters)
}
# finally add the parameters from this cv_standard object as well
cv_parameters <- data.frame(
analysis_ID = ndtr_obj$analysis_ID,
cv_standard.num_resample_runs = ndtr_obj$num_resample_runs,
cv_standard.test_only_at_training_time = ndtr_obj$test_only_at_training_time
)
parameter_df <- cbind(cv_parameters, parameter_df)
parameter_df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.