R/ensemble_generation.R

#' Generate an ensemble from previously created spartan emulation objects
#'
#' Where emulations have already been created, this method combines these to
#' form one ensemble. This takes as input a list of the emulator objects, the
#' simulation parameters and output response labels, and a set of test data
#' from which the performance weights will be evolved. We would recommend
#' providing the testing set of the output from the partition_dataset method.
#' An option is given, by setting these within emulation_algorithm_settings,
#' to save the ensemble object to file, as well as produce plots showing the
#' accuracy of the generated ensemble for the test data set
#'
#' @param existing_emulations Vector of emulator objects created by method
#' \code{generate_requested_emulations}
#' @param parameters Array containing the names of the parameters for which
#' values are input into each emulation
#' @param measures Array containing the names of the output measures predicted
#' by each emulation
#' @param observed_data Dataset to train the new ensemble on. We recommend
#' using the test data in the set generated by partition_data method, and not
#' the training set, as the emulators themselves have been trained on that
#' data and the ensemble could thus overfit.
#' @param algorithm_settings Object output from the function
#' emulation_algorithm_settings, containing the settings of the machine
#' learning algorithms used in emulation creation. Here this is needed to
#' decide whether any accuracy plots should be produced during ensemble
#' creation, whether or not the ensemble should be saved to file, and to
#' specify the number of generations for which the neural network that is
#' generating the algorithm weightings should run.
#' @param normalise Whether the predictions generated when testing the
#' ensemble should be normalised for presenting test results
#' @param timepoint If using multiple timepoints, the timepoint for which
#' this ensemble is being created
#' @param output_formats File formats in which result graphs should be produced
#' @return A list containing the ensemble, the time taken to generate it,
#' and the sampling mins and maxes used in its creation such that
#' unseen data used by and predictions generated by the ensemble can be
#' scaled and rescaled correctly
#'
#' @export
generate_ensemble_from_existing_emulations <-
  function(existing_emulations, parameters, measures, observed_data,
           algorithm_settings = NULL, normalise = FALSE,
           timepoint = NULL, output_formats=c("pdf")) {

  start.time <- proc.time()

  all_model_predictions <- NULL
  # For exisiting emulations, we collect the type of emulation from the list of
  # emulators, so we can correctly label the predicted data being fed into the
  # ensemble
  emulator_types <- NULL

  # We're also going to restructure the emulations into one list, as if they
  # had been generated at the same time (as the alternative method does),
  #such that we can keep the code constant
  ensemble_emulators <- vector("list")

  # We're also going to collect the times taken in generating each sub model
  generation_time <- 0

  # As the generate_requested_emulations can return a list of emulators
  # (if there is more than one in the model list), we need to be careful
  # we have processed all emulations. However each in existing_emulations
  # could have been generated separately, so we need to make sure we
  # account for both. If generated separately, emulator$emulators will be
  # a list of length 1, for all items in existing_emulations. If generated
  # together, the length will increase

  # Note that if the data has been normalised, we're going to need the
  # references to the max and mins in normalisation to scale the data
  # back into its original form. The assumption here is that the user has
  # generated the existing emulations with the same dataset, thus we can
  # take the mins and maxes from those contained in any of the emulators in
  # the list. So we do that here with the first emulator
  pre_normed_mins <- NULL
  pre_normed_maxes <- NULL

  ## As we may have a compound structure of emulators (depending on the call),
  # we need to keep a reference that can combine each into one list
  emulator_ref <- 1

  # First we are going to examine all items in the existing_emulations list
  for (model in 1:length(existing_emulations)) {

    # Get the object of emulations that was returned when this was generated
    # (from generate_requested_emulation)
    emulator <- existing_emulations[[model]]

    # This in turn may include more than one emulation (if the user used the
    # multiple method instead of generating each separately, then attached
    # several of these objects)
    # So we need to check for this and cycle through these
    if (length(emulator$emulators) > 0) {
      for (sub_emulator in 1:length(emulator$emulators)) {
        # Get this enclosed emulator
        inlist_emulator <- emulator$emulators[[sub_emulator]]
        emulator_types <- c(emulator_types,
                            emulator$emulators[[sub_emulator]]$type)

        all_model_predictions <- generate_ensemble_training_set(
          inlist_emulator, parameters, measures, observed_data,
          all_model_predictions)
        ensemble_emulators[[emulator_ref]] <- inlist_emulator

        #  # Get the generation time and add this to the generation time
        generation_time <- generation_time +
          emulator$emulators[[sub_emulator]]$benchmark

        # If the pre_normed_mins and maxes are still NULL, see if this emulator
        # has set them. If it has, we make the assumption this is the case for
        # them all (as they should have all been generated on the same scale
        if (is.null(pre_normed_maxes) & is.null(pre_normed_mins)) {
          pre_normed_maxes <-
            emulator$emulators[[sub_emulator]]$pre_normed_maxes
          pre_normed_mins <-
            emulator$emulators[[sub_emulator]]$pre_normed_mins
        }
        emulator_ref <- emulator_ref + 1
      }
    } else  {
      # Only one emulator in this object
      emulator_types <- c(emulator_types, emulator$type)

      all_model_predictions <- generate_ensemble_training_set(
        emulator, parameters, measures, observed_data, all_model_predictions)
      ensemble_emulators[[emulator_ref]] <- emulator

      #  # Get the generation time and add this to the generation time
      generation_time <- generation_time + emulator$benchmark

      # If the pre_normed_mins and maxes are still NULL, see if this
      # emulator has set them. If it has, we make the assumption this
      # is the case for them all (as they should have all been
      # generated on the same scale
      if (is.null(pre_normed_maxes) & is.null(pre_normed_mins)) {
        pre_normed_maxes <- emulator$pre_normed_maxes
        pre_normed_mins <- emulator$pre_normed_mins
      }

      emulator_ref <- emulator_ref + 1
    }
  }

  generated_ensemble <- create_ensemble(ensemble_emulators,
                                        all_model_predictions,
                                        observed_data, measures,
                                        emulator_types,
                                        pre_normed_mins, pre_normed_maxes,
                                        algorithm_settings,
                                        normalise, timepoint, output_formats)

  # Calculate time taken, only want the elapsed value (3rd one)
  time.taken <- as.numeric(proc.time() - start.time)[3]

  # Add on the total from all emulation creation
  time.taken <- time.taken + generation_time


  # Build the ensemble package for saving and return
  # Add in the mins and maxes from the emulator creation, as we'll need
  # these to denormalise the predictions if the original data was normalised
  # KA: AS TAKEN FROM DATAFRAME
  built_ensemble <- list("ensemble" = generated_ensemble,
                         "timetaken" = time.taken,
                         "pre_normed_maxes" = t(data.frame(pre_normed_maxes)),
                         "pre_normed_mins" = t(data.frame(pre_normed_mins)))

  if (algorithm_settings$save_ensemble) {
    # Save out the ensemble
    if (is.null(timepoint))
      save(built_ensemble, file = "built_ensemble.Rda")
    else
      save(built_ensemble, file = paste("built_ensemble_", timepoint, ".Rda",
                                        sep = ""))
  }

  return(built_ensemble)

}


#' Generate a set of emulators and combine into an ensemble
#'
#' This method generates all requested emulators then combines these into one
#' ensemble. This takes as input a list of the emulation objects to create
#' (could be random forest, support vector machine, neural network, general
#' linear model, and gaussian process model), the simulation parameters and
#' output response labels, an object created by the partitioned_dataset
#' method (training, testing, and validation datasets), and an object
#' created by method emulation_algorithm_settings. The latter sets key
#' arguments used in emulation creation, as detailed in the description
#' accompanying that method.
#'
#' @param model_list Vector of the types of emulation model to create.
#' Accepted abbreviations are: SVM (Support-Vector Machine), GP (Gaussian
#' Process Model), NNET (Neural Network), RF (Random Forest), GLM (General
#' Linear Model)
#' @param parameters Vector containing the names of the simulation parameters
#' in the dataset on which the emulator is being trained
#' @param measures Vector containing the simulation outputs that the emulators
#' should be able to predict
#' @param partitioned_data Object output from the function partition_dataset,
#' an object containing training, testing, and validation data
#' @param algorithm_settings Object output from the function
#' emulation_algorithm_settings, containing the settings of the machine
#' learning algorithms to use in emulation creation. If no setting changes are
#' required, and a neural network is not being generated, this can be left out,
#' and will be generated by generate_requested_emulations (so this defaults to
#' NULL). If you are making any changes to the settings or generating a neural
#' network, you must create this object before calling
#' generate_requested_emulations.
#' @param timepoint If using multiple timepoints, the timepoint for which
#' emulators are being created
#' @param normalised Whether the emulator data has been normalised or not.
#' Affects how training and test output predictions are displayed
#' @param output_formats File formats in which result graphs should be produced
#' @return A list containing the ensemble, the time taken to generate it,
#' and the sampling mins and maxes used in its creation such that
#' unseen data used by and predictions generated by the ensemble can be
#' scaled and rescaled correctly
#' @examples
#' sampleMaxes <- cbind(100,0.9,0.5,0.08,1,5)
#' sampleMins <-cbind(0,0.1,0.1,0.015,0.1,0.25)
#' modelList <- c("RF","GLM")
#' measures<-c("Velocity")
#' parameters<-c("stableBindProbability","chemokineExpressionThreshold",
#' "initialChemokineExpressionValue","maxChemokineExpressionValue",
#' "maxProbabilityOfAdhesion","adhesionFactorExpressionSlope")
#' data("sim_data_for_emulation")
#' partitionedData <- partition_dataset(sim_data_for_emulation[,1:7], parameters,
#' measures, percent_train=75, percent_test=15, percent_validation=10, normalise=TRUE,
#' sample_mins = sampleMins, sample_maxes = sampleMaxes)
#' generated_ensemble<-generate_emulators_and_ensemble(modelList, parameters,
#' measures, partitionedData, normalised=TRUE)
#'
#'
#'
#' @export
generate_emulators_and_ensemble <- function(model_list, parameters, measures,
                                            partitioned_data,
                                            algorithm_settings = NULL,
                                            timepoint = NULL,
                                            normalised = FALSE,
                                            output_formats=c("pdf")) {
  if (length(model_list) > 1) {
    start.time <- proc.time()

    # Generate_requested_emulations checks for algorithm_settings
    emulators_with_test_preds <- generate_requested_emulations(
      model_list, partitioned_data, parameters, measures, algorithm_settings,
      timepoint, normalised, output_formats)

    # Here we need to double check whether any parameters or measures were
    # excluded from data partitioning (if they were all the same)
    if(length(parameters) > length(partitioned_data$parameters))
      parameters<-partitioned_data$parameters
    if(length(measures) > length(partitioned_data$measures))
      measures<-partitioned_data$measures

    # Above will return -1 on error with Neural network settings
    # (if algorithm_settings has not been provided with a structure list)
    if (!(typeof(emulators_with_test_preds) == "list")) {
      message("No Ensemble Generated. Please fix the issues with
            algorithm_settings")
      return(NULL)
    } else {
      all_model_predictions <- emulators_with_test_preds$prediction_set
      emulators <- emulators_with_test_preds$emulators

      generated_ensemble <- create_ensemble(
        emulators, all_model_predictions, partitioned_data$testing, measures,
        model_list, emulators_with_test_preds$pre_normed_mins,
        emulators_with_test_preds$pre_normed_maxes, algorithm_settings,
        normalise = normalised, timepoint, output_formats)

      # Added October 2018: Would be nice to see the stats for the ensemble performance
      #performance_stats<-c("Ensemble")
      #for (m in 1:length(measures))
      #{
      #  performance_stats<-cbind(performance_stats,meanSquaredError(all_model_predictions[, measures[m]], partitioned_data$testing[, measures[m]]))
      #}

      # Calculate time taken, only want the elapsed value (3rd one)
      time.taken <- as.numeric(proc.time() - start.time)[3]

      # Build the ensemble package for saving and return
      # Add in the mins and maxes from the emulator creation,
      # as we'll need these to denormalise the predictions if the original
      # data was normalised
      built_ensemble <- list(
        "ensemble" = generated_ensemble, "timetaken" = time.taken,
        "pre_normed_maxes" = t(data.frame(
          emulators_with_test_preds$pre_normed_maxes)),
        "pre_normed_mins" = t(data.frame(
          emulators_with_test_preds$pre_normed_mins)),
        "statistics" = emulators_with_test_preds$statistics)

      # Save out the ensemble
      if (is.null(timepoint))
        save(built_ensemble, file = "built_ensemble.Rda")
      else
        save(built_ensemble, file = paste("built_ensemble_", timepoint, ".Rda",
                                          sep = ""))
      return(built_ensemble)
    }
  } else {
    message("To generate an ensemble you need at least two types of emulator")
    return(NULL)
  }
}
kalden/spartan documentation built on May 31, 2019, 11:52 p.m.