#' Generate an ensemble from previously created spartan emulation objects
#'
#' Where emulations have already been created, this method combines these to
#' form one ensemble. This takes as input a list of the emulator objects, the
#' simulation parameters and output response labels, and a set of test data
#' from which the performance weights will be evolved. We would recommend
#' providing the testing set of the output from the partition_dataset method.
#' An option is given, by setting these within emulation_algorithm_settings,
#' to save the ensemble object to file, as well as produce plots showing the
#' accuracy of the generated ensemble for the test data set
#'
#' @param existing_emulations Vector of emulator objects created by method
#' \code{generate_requested_emulations}
#' @param parameters Array containing the names of the parameters for which
#' values are input into each emulation
#' @param measures Array containing the names of the output measures predicted
#' by each emulation
#' @param observed_data Dataset to train the new ensemble on. We recommend
#' using the test data in the set generated by partition_data method, and not
#' the training set, as the emulators themselves have been trained on that
#' data and the ensemble could thus overfit.
#' @param algorithm_settings Object output from the function
#' emulation_algorithm_settings, containing the settings of the machine
#' learning algorithms used in emulation creation. Here this is needed to
#' decide whether any accuracy plots should be produced during ensemble
#' creation, whether or not the ensemble should be saved to file, and to
#' specify the number of generations for which the neural network that is
#' generating the algorithm weightings should run.
#' @param normalise Whether the predictions generated when testing the
#' ensemble should be normalised for presenting test results
#' @param timepoint If using multiple timepoints, the timepoint for which
#' this ensemble is being created
#' @param output_formats File formats in which result graphs should be produced
#' @return A list containing the ensemble, the time taken to generate it,
#' and the sampling mins and maxes used in its creation such that
#' unseen data used by and predictions generated by the ensemble can be
#' scaled and rescaled correctly
#'
#' @export
generate_ensemble_from_existing_emulations <-
function(existing_emulations, parameters, measures, observed_data,
algorithm_settings = NULL, normalise = FALSE,
timepoint = NULL, output_formats=c("pdf")) {
start.time <- proc.time()
all_model_predictions <- NULL
# For exisiting emulations, we collect the type of emulation from the list of
# emulators, so we can correctly label the predicted data being fed into the
# ensemble
emulator_types <- NULL
# We're also going to restructure the emulations into one list, as if they
# had been generated at the same time (as the alternative method does),
#such that we can keep the code constant
ensemble_emulators <- vector("list")
# We're also going to collect the times taken in generating each sub model
generation_time <- 0
# As the generate_requested_emulations can return a list of emulators
# (if there is more than one in the model list), we need to be careful
# we have processed all emulations. However each in existing_emulations
# could have been generated separately, so we need to make sure we
# account for both. If generated separately, emulator$emulators will be
# a list of length 1, for all items in existing_emulations. If generated
# together, the length will increase
# Note that if the data has been normalised, we're going to need the
# references to the max and mins in normalisation to scale the data
# back into its original form. The assumption here is that the user has
# generated the existing emulations with the same dataset, thus we can
# take the mins and maxes from those contained in any of the emulators in
# the list. So we do that here with the first emulator
pre_normed_mins <- NULL
pre_normed_maxes <- NULL
## As we may have a compound structure of emulators (depending on the call),
# we need to keep a reference that can combine each into one list
emulator_ref <- 1
# First we are going to examine all items in the existing_emulations list
for (model in 1:length(existing_emulations)) {
# Get the object of emulations that was returned when this was generated
# (from generate_requested_emulation)
emulator <- existing_emulations[[model]]
# This in turn may include more than one emulation (if the user used the
# multiple method instead of generating each separately, then attached
# several of these objects)
# So we need to check for this and cycle through these
if (length(emulator$emulators) > 0) {
for (sub_emulator in 1:length(emulator$emulators)) {
# Get this enclosed emulator
inlist_emulator <- emulator$emulators[[sub_emulator]]
emulator_types <- c(emulator_types,
emulator$emulators[[sub_emulator]]$type)
all_model_predictions <- generate_ensemble_training_set(
inlist_emulator, parameters, measures, observed_data,
all_model_predictions)
ensemble_emulators[[emulator_ref]] <- inlist_emulator
# # Get the generation time and add this to the generation time
generation_time <- generation_time +
emulator$emulators[[sub_emulator]]$benchmark
# If the pre_normed_mins and maxes are still NULL, see if this emulator
# has set them. If it has, we make the assumption this is the case for
# them all (as they should have all been generated on the same scale
if (is.null(pre_normed_maxes) & is.null(pre_normed_mins)) {
pre_normed_maxes <-
emulator$emulators[[sub_emulator]]$pre_normed_maxes
pre_normed_mins <-
emulator$emulators[[sub_emulator]]$pre_normed_mins
}
emulator_ref <- emulator_ref + 1
}
} else {
# Only one emulator in this object
emulator_types <- c(emulator_types, emulator$type)
all_model_predictions <- generate_ensemble_training_set(
emulator, parameters, measures, observed_data, all_model_predictions)
ensemble_emulators[[emulator_ref]] <- emulator
# # Get the generation time and add this to the generation time
generation_time <- generation_time + emulator$benchmark
# If the pre_normed_mins and maxes are still NULL, see if this
# emulator has set them. If it has, we make the assumption this
# is the case for them all (as they should have all been
# generated on the same scale
if (is.null(pre_normed_maxes) & is.null(pre_normed_mins)) {
pre_normed_maxes <- emulator$pre_normed_maxes
pre_normed_mins <- emulator$pre_normed_mins
}
emulator_ref <- emulator_ref + 1
}
}
generated_ensemble <- create_ensemble(ensemble_emulators,
all_model_predictions,
observed_data, measures,
emulator_types,
pre_normed_mins, pre_normed_maxes,
algorithm_settings,
normalise, timepoint, output_formats)
# Calculate time taken, only want the elapsed value (3rd one)
time.taken <- as.numeric(proc.time() - start.time)[3]
# Add on the total from all emulation creation
time.taken <- time.taken + generation_time
# Build the ensemble package for saving and return
# Add in the mins and maxes from the emulator creation, as we'll need
# these to denormalise the predictions if the original data was normalised
# KA: AS TAKEN FROM DATAFRAME
built_ensemble <- list("ensemble" = generated_ensemble,
"timetaken" = time.taken,
"pre_normed_maxes" = t(data.frame(pre_normed_maxes)),
"pre_normed_mins" = t(data.frame(pre_normed_mins)))
if (algorithm_settings$save_ensemble) {
# Save out the ensemble
if (is.null(timepoint))
save(built_ensemble, file = "built_ensemble.Rda")
else
save(built_ensemble, file = paste("built_ensemble_", timepoint, ".Rda",
sep = ""))
}
return(built_ensemble)
}
#' Generate a set of emulators and combine into an ensemble
#'
#' This method generates all requested emulators then combines these into one
#' ensemble. This takes as input a list of the emulation objects to create
#' (could be random forest, support vector machine, neural network, general
#' linear model, and gaussian process model), the simulation parameters and
#' output response labels, an object created by the partitioned_dataset
#' method (training, testing, and validation datasets), and an object
#' created by method emulation_algorithm_settings. The latter sets key
#' arguments used in emulation creation, as detailed in the description
#' accompanying that method.
#'
#' @param model_list Vector of the types of emulation model to create.
#' Accepted abbreviations are: SVM (Support-Vector Machine), GP (Gaussian
#' Process Model), NNET (Neural Network), RF (Random Forest), GLM (General
#' Linear Model)
#' @param parameters Vector containing the names of the simulation parameters
#' in the dataset on which the emulator is being trained
#' @param measures Vector containing the simulation outputs that the emulators
#' should be able to predict
#' @param partitioned_data Object output from the function partition_dataset,
#' an object containing training, testing, and validation data
#' @param algorithm_settings Object output from the function
#' emulation_algorithm_settings, containing the settings of the machine
#' learning algorithms to use in emulation creation. If no setting changes are
#' required, and a neural network is not being generated, this can be left out,
#' and will be generated by generate_requested_emulations (so this defaults to
#' NULL). If you are making any changes to the settings or generating a neural
#' network, you must create this object before calling
#' generate_requested_emulations.
#' @param timepoint If using multiple timepoints, the timepoint for which
#' emulators are being created
#' @param normalised Whether the emulator data has been normalised or not.
#' Affects how training and test output predictions are displayed
#' @param output_formats File formats in which result graphs should be produced
#' @return A list containing the ensemble, the time taken to generate it,
#' and the sampling mins and maxes used in its creation such that
#' unseen data used by and predictions generated by the ensemble can be
#' scaled and rescaled correctly
#' @examples
#' sampleMaxes <- cbind(100,0.9,0.5,0.08,1,5)
#' sampleMins <-cbind(0,0.1,0.1,0.015,0.1,0.25)
#' modelList <- c("RF","GLM")
#' measures<-c("Velocity")
#' parameters<-c("stableBindProbability","chemokineExpressionThreshold",
#' "initialChemokineExpressionValue","maxChemokineExpressionValue",
#' "maxProbabilityOfAdhesion","adhesionFactorExpressionSlope")
#' data("sim_data_for_emulation")
#' partitionedData <- partition_dataset(sim_data_for_emulation[,1:7], parameters,
#' measures, percent_train=75, percent_test=15, percent_validation=10, normalise=TRUE,
#' sample_mins = sampleMins, sample_maxes = sampleMaxes)
#' generated_ensemble<-generate_emulators_and_ensemble(modelList, parameters,
#' measures, partitionedData, normalised=TRUE)
#'
#'
#'
#' @export
generate_emulators_and_ensemble <- function(model_list, parameters, measures,
partitioned_data,
algorithm_settings = NULL,
timepoint = NULL,
normalised = FALSE,
output_formats=c("pdf")) {
if (length(model_list) > 1) {
start.time <- proc.time()
# Generate_requested_emulations checks for algorithm_settings
emulators_with_test_preds <- generate_requested_emulations(
model_list, partitioned_data, parameters, measures, algorithm_settings,
timepoint, normalised, output_formats)
# Here we need to double check whether any parameters or measures were
# excluded from data partitioning (if they were all the same)
if(length(parameters) > length(partitioned_data$parameters))
parameters<-partitioned_data$parameters
if(length(measures) > length(partitioned_data$measures))
measures<-partitioned_data$measures
# Above will return -1 on error with Neural network settings
# (if algorithm_settings has not been provided with a structure list)
if (!(typeof(emulators_with_test_preds) == "list")) {
message("No Ensemble Generated. Please fix the issues with
algorithm_settings")
return(NULL)
} else {
all_model_predictions <- emulators_with_test_preds$prediction_set
emulators <- emulators_with_test_preds$emulators
generated_ensemble <- create_ensemble(
emulators, all_model_predictions, partitioned_data$testing, measures,
model_list, emulators_with_test_preds$pre_normed_mins,
emulators_with_test_preds$pre_normed_maxes, algorithm_settings,
normalise = normalised, timepoint, output_formats)
# Added October 2018: Would be nice to see the stats for the ensemble performance
#performance_stats<-c("Ensemble")
#for (m in 1:length(measures))
#{
# performance_stats<-cbind(performance_stats,meanSquaredError(all_model_predictions[, measures[m]], partitioned_data$testing[, measures[m]]))
#}
# Calculate time taken, only want the elapsed value (3rd one)
time.taken <- as.numeric(proc.time() - start.time)[3]
# Build the ensemble package for saving and return
# Add in the mins and maxes from the emulator creation,
# as we'll need these to denormalise the predictions if the original
# data was normalised
built_ensemble <- list(
"ensemble" = generated_ensemble, "timetaken" = time.taken,
"pre_normed_maxes" = t(data.frame(
emulators_with_test_preds$pre_normed_maxes)),
"pre_normed_mins" = t(data.frame(
emulators_with_test_preds$pre_normed_mins)),
"statistics" = emulators_with_test_preds$statistics)
# Save out the ensemble
if (is.null(timepoint))
save(built_ensemble, file = "built_ensemble.Rda")
else
save(built_ensemble, file = paste("built_ensemble_", timepoint, ".Rda",
sep = ""))
return(built_ensemble)
}
} else {
message("To generate an ensemble you need at least two types of emulator")
return(NULL)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.