#' Generate emulators for specified machine learning techniques with provided
#' data
#'
#' This method generates an emulator model from a training set for a specified
#' technique, and generates performance statistics from the test set. The
#' currently implemented techniques are a neural network (using the neuralnet
#' package), a random forest (from the randomforest package), a support vector
#' machine (from package e1071), a gaussian process model (from package mlegp),
#' and a general linear model. Where a neural network is desired, the
#' hyper-parameters are determined using k-fold cross validation from a set of
#' specified network structures. Where a simulation has multiple outputs, an
#' emulator model is created for each output response. This method provides
#' capacity to save the generated emulator models to file, in Rda format, and
#' plot a comparison of the predicted responses to a set of those of the
#' training and test sets, giving correlation of determination (R-squared) and
#' mean squared error values. The method returns a list of emulators of a
#' specified technique, one for each simulation output, and the performance
#' statistics for each measure, including the time taken to generate these
#' emulators. If the training data has been normalised, minimum and maximum
#' sampling values for each parameter are also returned such that any
#' predictions generated using this emulation can be rescaled correctly.
#' If plots are desired (by setting a flag in emulation_algorithm settings),
#' plots produced are stored as PDF's in the working directory. The same
#' applies to saving the generated emulator, set by the saveEmulation flag
#' in emulation_algorithm_settings. Note that it must be specified as to
#' whether the data being provided in partitioned_data has been normalised
#' or not: this affects the output of the plots (as the data is rescaled back
#' to its original scale if the data was normalised). Similarly to the rest of
#' spartan, this method can create emulations for multiple timepoints.
#'
#' @param model_list Vector of the types of emulation model to create.
#' Accepted abbreviations are: SVM (Support-Vector Machine), GP (Gaussian
#' Process Model), NNET (Neural Network), RF (Random Forest), GLM (General
#' Linear Model)
#' @param partitioned_data Object output from the function partition_dataset,
#' an object containing training, testing, and validation data
#' @param parameters Vector containing the names of the simulation parameters
#' in the dataset on which the emulator is being trained
#' @param measures Vector containing the simulation outputs that the emulators
#' should be able to predict
#' @param algorithm_settings Object output from the function
#' emulation_algorithm_settings, containing the settings of the machine
#' learning algorithms to use in emulation creation. If no setting changes are
#' required, and a neural network is not being generated, this can be left out,
#' and will be generated by generate_requested_emulations (so this defaults to
#' NULL). If you are making any changes to the settings or generating a neural
#' network, you must create this object before calling
#' generate_requested_emulations.
#' @param timepoint If using multiple timepoints, the timepoint for which
#' emulators are being created
#' @param normalised Whether the emulator data has been normalised or not.
#' Affects how training and test output predictions are displayed
#' @param output_formats File formats in which result graphs should be produced
#' @return Emulation objects, bundled into a list, with the required
#' sampling information to rescale the data these emulations produce if
#' required
#'
#' @export
#'
generate_requested_emulations <- function(model_list, partitioned_data,
parameters, measures,
algorithm_settings = NULL,
timepoint = NULL,
normalised = FALSE,
output_formats=c("pdf")) {
# First see whether any of the default settings have been overridden
# if not provided generate the defaults
if (is.null(algorithm_settings))
algorithm_settings <- emulation_algorithm_settings()
# Quick check that parameters and measures are equal to those in the partitioned
# dataset, where features may have been removed if all values were equal
# Ideally we should find a better method of doing this
if(length(parameters) != length(partitioned_data$parameters))
{
parameters = partitioned_data$parameters
message("Parameters List Updated to that Used in Partitioning, where features may have been removed")
}
if(length(measures) != length(partitioned_data$measures))
{
measures = partitioned_data$measures
message("Measures List Updated to that Used in Partitioning, where features may have been removed")
}
# For a neural nework check that the structure list has been specified
if ("NNET" %in% model_list & is.null(
algorithm_settings$network_structures)) {
message("No Neural Network layer structures provided. Algorithm terminating.
Please run emulation_algorithm_settings, providing a list as an
argument for networkStructures, and run this again")
# -1 used to confirm error when generating ensemble with model generation
return(-1)
} else {
all_model_predictions <- NULL
all_model_accuracy_stats <- NULL
# Structure to store the emulators created
ensemble_emulators <- vector("list", length(model_list))
# Iterate through all the models that the user wants to generate
for (model_index in 1:length(model_list)) {
print(model_list[model_index])
# Check an acceptable model has been specified
if (check_acceptable_model_type(model_list[model_index]) == TRUE) {
# Sse the same code to generate all the currently acceptable model types
model_fit <- generate_emulator_model(model_list[model_index],
parameters, measures,
partitioned_data,
algorithm_settings,
timepoint = timepoint, normalised,
output_formats)
# Add the sampled mins and maxes to help rescaling
model_fit$pre_normed_mins <- partitioned_data$pre_normed_mins
model_fit$pre_normed_maxes <- partitioned_data$pre_normed_maxes
# Get predicted results from test set parameters. We can create this
# using the same method to construct the ensemble training set
# We don't pass in emulation_predictions to be bound onto -
# we just want the predictions for this emulator
emulation_predictions <- generate_ensemble_training_set(
model_fit, parameters, measures, partitioned_data$testing, NULL)
# Print the accuracy of the test set, if requested
if (algorithm_settings$plot_test_accuracy == TRUE) {
if (normalised) {
# Scale the predictions such that these are plotted correctly
unscaled_predictions <- denormalise_dataset(
emulation_predictions, rbind(model_fit$pre_normed_mins[measures]),
rbind(model_fit$pre_normed_maxes[measures]))
unscaled_simulations <- denormalise_dataset(
partitioned_data$testing, rbind(partitioned_data$pre_normed_mins),
rbind(partitioned_data$pre_normed_maxes))
produce_accuracy_plots_all_measures(
paste(model_list[model_index], "_TestSet", sep = ""), measures,
unscaled_predictions, unscaled_simulations, output_formats,
timepoint)
#produce_accuracy_plots_all_measures(
# paste(model_list[model_index], "_TestSet_Normed", sep = ""),
# measures, emulation_predictions, partitioned_data$testing,
# timepoint)
} else {
produce_accuracy_plots_all_measures(
paste(model_list[model_index], "_TestSet", sep = ""),
measures, emulation_predictions, partitioned_data$testing,
output_formats, timepoint)
}
}
# Stats for the generation of this model:
model_stats <- build_performance_statistics(model_list[model_index],
emulation_predictions,
partitioned_data$testing,
measures,
model_fit$benchmark)
all_model_accuracy_stats <- rbind(all_model_accuracy_stats,
model_stats)
# Store the emulator
ensemble_emulators[[model_index]] <- model_fit
# Bind the results to the global set
all_model_predictions <- cbind(all_model_predictions,
emulation_predictions)
}
}
# Build the object that is to be both saved out (if requested) and returned
# Oct 2018 - as we're now checking parameters and measures are not all the same
# and removing these, we need to return the updated parameters and measures incase
# used in an ensemble
built_emulation <- list(
"emulators" = ensemble_emulators,
"prediction_set" = all_model_predictions,
"statistics" = all_model_accuracy_stats,
"pre_normed_mins" = partitioned_data$pre_normed_mins,
"pre_normed_maxes" = partitioned_data$pre_normed_maxes,
"parameters"=partitioned_data$parameters,
"measures"=partitioned_data$measures)
if (algorithm_settings$save_emulators) {
if (is.null(timepoint))
save(built_emulation, file = paste("Built_Emulation_",
paste(model_list, collapse = "_"),
".Rda", sep = ""))
else
save(built_emulation, file = paste("Built_Emulation_",
paste(model_list, collapse = "_"),
"_", timepoint, ".Rda", sep = ""))
}
return(built_emulation)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.