R/crcmodel_set_posterior.R

Defines functions calculate_weighted_averages sample_from_posterior crcmodel_set_posterior

#------------------------------------------------------------------------------#
#
# CRCRDM: Robust Decision Making tools for Colorectal Cancer Screening Models
#
# Author: Pedro Nascimento de Lima
# See LICENSE.txt and README.txt for information on usage and licensing
#------------------------------------------------------------------------------#

#------------------------------------------------------------------------------#
# Set Posterior Method
# Purpose: Samples from multiple posterior files and defines parameters
# from the posterior distribution.
# Creation Date: July 2021
#------------------------------------------------------------------------------#

# See documentation in the crcmodel class.
crcmodel_set_posterior = function(self, posteriors_list, posterior_weights, cols_to_ignore, use_average, n_posterior, seed, resample) {

  # Setting a seed for reproducibility because this function will create a sample:
  if(!missing(seed)){
    set.seed(seed = seed)
  }

  # Checking Inputs:

  # Posterior is a list:
  assertthat::assert_that(is.list(posteriors_list),msg = "posteriors_list object should be a list.")

  # it only contains data.frames:
  assertthat::assert_that(all(sapply(posteriors_list, is.data.frame)),msg = "posteriors_list object should be a list.")

  # every data.frame contains exactly the same columns:
  names_list = lapply(posteriors_list, names)

  # all names of the data.frames are exactly the same:
  assertthat::assert_that(all(sapply(names_list, identical, names_list[[1]])), msg = "All dataframes in the posterior_lists should have the same parameters. each parameter data.frame must have the same names.")
  assertthat::assert_that(all(sapply(names_list, base::setequal, names_list[[1]])), msg = "All dataframes in the posterior_lists should have the same parameters. each parameter data.frame must have the same names.")

  # posterior_weights must be provided:
  assertthat::assert_that(!missing(posterior_weights), msg = "posterior_weights parameter must be provided.")

  # posterior_weights needs to be a character:
  assertthat::assert_that(is.character(posterior_weights), msg = "posterior_weights should be a character defining the collumn name of the weights to use to sample from the posterior.")

  # It should be a single character
  assertthat::assert_that(length(posterior_weights) == 1, msg = "posterior_weights should be a single character")

  # it also needs to exist in the posterior file - we already checked all of them are the same, so we can just check the first one.
  assertthat::assert_that(posterior_weights %in% names_list[[1]])

  # Defining the names of the posteriors and unselecting undesired columns:
  for(posterior_id in 1:length(names(posteriors_list)) ) {
    posteriors_list[[posterior_id]]$posterior.df.id = posterior_id
    posteriors_list[[posterior_id]]$posterior.df.name = names(posteriors_list)[posterior_id]
    # Ignoring unwanted variables:
    posteriors_list[[posterior_id]] = posteriors_list[[posterior_id]] %>%
      dplyr::select(-dplyr::any_of(cols_to_ignore))

  }

  # If we want to use the average, we can do so:
  if(use_average) {

  # calculate weighted averages for every posterior in the posteriors_list:
    posterior_params <- purrr::map_dfr(.x = posteriors_list, .f = calculate_weighted_averages, posterior_weights = posterior_weights) %>%
      dplyr::mutate(param.id = row_number())

  # If not using weighted averages, sample from the posterior using its weights:
  } else {

    # n_posterior is smaller or equal to the size of the dataframe.
    assertthat::assert_that(all(sapply(posteriors_list, nrow) >= n_posterior), msg = "Number of rows in each posterior data.frame must be greater or equal than n_posterior.")

    # n_posterior is not missing
    assertthat::assert_that(!missing(n_posterior), msg = "n_posterior parameter must be provided.")

    # Use the purrr::map_dfr function to sample from the posterior distribution for each posterior file provided.
    posterior_params <- purrr::map_dfr(.x = posteriors_list, .f = sample_from_posterior, n_posterior = n_posterior, posterior_weights = posterior_weights, resample = resample) %>%
      dplyr::mutate(param.id = row_number())

    row.names(posterior_params) = NULL
  }

  # The main result of this function is the posterior_params data.frame, which we assign to self:
  self$posterior_params = posterior_params

  if (resample) {
    self$posterior_params <- self$posterior_params %>%
      dplyr::select(-any_of(posterior_weights))
  }

  # Return the model object:
  invisible(self)

}



# Auxiliary Functions -----------------------------------------------------

# Sample from Posterior ------------------------------------------

# This internal function samples from the posterior distribution. It is defined here because it is only used by the set_posterior function.
sample_from_posterior = function(posterior_data_frame, n_posterior, posterior_weights, resample) {

  # Assign an id to the original posterior table:
  posterior_data_frame = posterior_data_frame %>%
    mutate(posterior.orig.row.id = dplyr::row_number())

  # Next, sample from the posterior with replacement:
  if(resample){
    ids_to_select = sample(x = posterior_data_frame$posterior.orig.row.id, size = n_posterior, replace = T, prob = posterior_data_frame[,posterior_weights])
  # Or don't resample at all and return the posterior directly
    } else {
    ids_to_select = posterior_data_frame$posterior.orig.row.id
  }

  # Selects these rows from the data.frame:
  posterior_sample = posterior_data_frame[ids_to_select,] %>%
    dplyr::mutate(posterior.df.row.id = dplyr::row_number())

  return(posterior_sample)

}


# Calculate weighted averages ------------------------------------

# This private function is used to calculate weighted averages if the user wants them.
calculate_weighted_averages = function(df, posterior_weights) {
  # Calculate a normalized_weights variable to ensure that the weighted average will be correct
  # And the weights add up to one.

  # ensure this is a vanilla data.frame
  df = as.data.frame(df)

  df$normalized_weights = df[,posterior_weights] / sum(df[,posterior_weights])

  # Calculate the weighted average for every numeric variable:
  df %>%
    # Multiply value by the normalized weights:
    mutate(across(where(is.numeric) & !c(.data$posterior.df.id, .data$posterior.df.name), ~ .x * df$normalized_weights)) %>%
    select(-.data$normalized_weights) %>%
    # Add them up:
    group_by(.data$posterior.df.id, .data$posterior.df.name) %>%
    summarise(across(where(is.numeric),sum), .groups = "drop") %>%
    as.data.frame()
}
c-rutter/crcrdm documentation built on July 1, 2023, 7:35 a.m.