R/emulation_generation.R

#' Generate emulators for specified machine learning techniques with provided
#' data
#'
#' This method generates an emulator model from a training set for a specified
#' technique, and generates performance statistics from the test set. The
#' currently implemented techniques are a neural network (using the neuralnet
#' package), a random forest (from the randomforest package), a support vector
#' machine (from package e1071), a gaussian process model (from package mlegp),
#' and a general linear model. Where a neural network is desired, the
#' hyper-parameters are determined using k-fold cross validation from a set of
#' specified network structures. Where a simulation has multiple outputs, an
#' emulator model is created for each output response. This method provides
#' capacity to save the generated emulator models to file, in Rda format, and
#' plot a comparison of the predicted responses to a set of those of the
#' training and test sets, giving correlation of determination (R-squared) and
#' mean squared error values. The method returns a list of emulators of a
#' specified technique, one for each simulation output, and the performance
#' statistics for each measure, including the time taken to generate these
#' emulators. If the training data has been normalised, minimum and maximum
#' sampling values for each parameter are also returned such that any
#' predictions generated using this emulation can be rescaled correctly.
#' If plots are desired (by setting a flag in emulation_algorithm settings),
#' plots produced are stored as PDF's in the working directory. The same
#' applies to saving the generated emulator, set by the saveEmulation flag
#' in emulation_algorithm_settings. Note that it must be specified as to
#' whether the data being provided in partitioned_data has been normalised
#' or not: this affects the output of the plots (as the data is rescaled back
#' to its original scale if the data was normalised). Similarly to the rest of
#' spartan, this method can create emulations for multiple timepoints.
#'
#' @param model_list Vector of the types of emulation model to create.
#' Accepted abbreviations are: SVM (Support-Vector Machine), GP (Gaussian
#' Process Model), NNET (Neural Network), RF (Random Forest), GLM (General
#' Linear Model)
#' @param partitioned_data Object output from the function partition_dataset,
#' an object containing training, testing, and validation data
#' @param parameters Vector containing the names of the simulation parameters
#' in the dataset on which the emulator is being trained
#' @param measures Vector containing the simulation outputs that the emulators
#' should be able to predict
#' @param algorithm_settings Object output from the function
#' emulation_algorithm_settings, containing the settings of the machine
#' learning algorithms to use in emulation creation. If no setting changes are
#' required, and a neural network is not being generated, this can be left out,
#' and will be generated by generate_requested_emulations (so this defaults to
#' NULL). If you are making any changes to the settings or generating a neural
#' network, you must create this object before calling
#' generate_requested_emulations.
#' @param timepoint If using multiple timepoints, the timepoint for which
#' emulators are being created
#' @param normalised Whether the emulator data has been normalised or not.
#' Affects how training and test output predictions are displayed
#' @param output_formats File formats in which result graphs should be produced
#' @return Emulation objects, bundled into a list, with the required
#' sampling information to rescale the data these emulations produce if
#' required
#'
#' @export
#'
generate_requested_emulations <- function(model_list, partitioned_data,
                                          parameters, measures,
                                          algorithm_settings = NULL,
                                          timepoint = NULL,
                                          normalised = FALSE,
                                          output_formats=c("pdf")) {

  # First see whether any of the default settings have been overridden
  # if not provided generate the defaults
  if (is.null(algorithm_settings))
    algorithm_settings <- emulation_algorithm_settings()

  # Quick check that parameters and measures are equal to those in the partitioned
  # dataset, where features may have been removed if all values were equal
  # Ideally we should find a better method of doing this
  if(length(parameters) != length(partitioned_data$parameters))
  {
    parameters = partitioned_data$parameters
    message("Parameters List Updated to that Used in Partitioning, where features may have been removed")
  }
  if(length(measures) != length(partitioned_data$measures))
  {
    measures = partitioned_data$measures
    message("Measures List Updated to that Used in Partitioning, where features may have been removed")
  }

  # For a neural nework check that the structure list has been specified
  if ("NNET" %in% model_list & is.null(
    algorithm_settings$network_structures)) {

    message("No Neural Network layer structures provided. Algorithm terminating.
          Please run emulation_algorithm_settings, providing a list as an
          argument for networkStructures, and run this again")
    # -1 used to confirm error when generating ensemble with model generation
    return(-1)
  } else {
    all_model_predictions <- NULL
    all_model_accuracy_stats <- NULL

    # Structure to store the emulators created
    ensemble_emulators <- vector("list", length(model_list))

    # Iterate through all the models that the user wants to generate
    for (model_index in 1:length(model_list)) {
      print(model_list[model_index])
      # Check an acceptable model has been specified
      if (check_acceptable_model_type(model_list[model_index]) == TRUE) {
        # Sse the same code to generate all the currently acceptable model types
        model_fit <- generate_emulator_model(model_list[model_index],
                                             parameters, measures,
                                             partitioned_data,
                                             algorithm_settings,
                                             timepoint = timepoint, normalised,
                                             output_formats)

        # Add the sampled mins and maxes to help rescaling
        model_fit$pre_normed_mins <- partitioned_data$pre_normed_mins
        model_fit$pre_normed_maxes <- partitioned_data$pre_normed_maxes

        # Get predicted results from test set parameters. We can create this
        # using the same method to construct the ensemble training set
        # We don't pass in emulation_predictions to be bound onto -
        # we just want the predictions for this emulator
        emulation_predictions <- generate_ensemble_training_set(
          model_fit, parameters, measures, partitioned_data$testing, NULL)

        # Print the accuracy of the test set, if requested
        if (algorithm_settings$plot_test_accuracy == TRUE) {
          if (normalised) {
            # Scale the predictions such that these are plotted correctly
            unscaled_predictions <- denormalise_dataset(
              emulation_predictions, rbind(model_fit$pre_normed_mins[measures]),
              rbind(model_fit$pre_normed_maxes[measures]))
            unscaled_simulations <- denormalise_dataset(
              partitioned_data$testing, rbind(partitioned_data$pre_normed_mins),
              rbind(partitioned_data$pre_normed_maxes))
            produce_accuracy_plots_all_measures(
              paste(model_list[model_index], "_TestSet", sep = ""), measures,
              unscaled_predictions, unscaled_simulations, output_formats,
              timepoint)

            #produce_accuracy_plots_all_measures(
            #  paste(model_list[model_index], "_TestSet_Normed", sep = ""),
             # measures, emulation_predictions, partitioned_data$testing,
            #  timepoint)

          } else {
            produce_accuracy_plots_all_measures(
              paste(model_list[model_index], "_TestSet", sep = ""),
              measures, emulation_predictions, partitioned_data$testing,
              output_formats, timepoint)
          }
        }

        # Stats for the generation of this model:
        model_stats <- build_performance_statistics(model_list[model_index],
                                                    emulation_predictions,
                                                    partitioned_data$testing,
                                                    measures,
                                                    model_fit$benchmark)
        all_model_accuracy_stats <- rbind(all_model_accuracy_stats,
                                          model_stats)

        # Store the emulator
        ensemble_emulators[[model_index]] <- model_fit

        # Bind the results to the global set
        all_model_predictions <- cbind(all_model_predictions,
                                     emulation_predictions)
      }
    }


    # Build the object that is to be both saved out (if requested) and returned
    # Oct 2018 - as we're now checking parameters and measures are not all the same
    # and removing these, we need to return the updated parameters and measures incase
    # used in an ensemble
    built_emulation <- list(
      "emulators" = ensemble_emulators,
      "prediction_set" = all_model_predictions,
      "statistics" = all_model_accuracy_stats,
      "pre_normed_mins" = partitioned_data$pre_normed_mins,
      "pre_normed_maxes" = partitioned_data$pre_normed_maxes,
      "parameters"=partitioned_data$parameters,
      "measures"=partitioned_data$measures)

    if (algorithm_settings$save_emulators) {
      if (is.null(timepoint))
        save(built_emulation, file = paste("Built_Emulation_",
                                           paste(model_list, collapse = "_"),
                                           ".Rda", sep = ""))
      else
        save(built_emulation, file = paste("Built_Emulation_",
                                           paste(model_list, collapse = "_"),
                                           "_", timepoint, ".Rda", sep = ""))
    }

    return(built_emulation)
  }
}
kalden/spartan documentation built on May 31, 2019, 11:52 p.m.