
# TODO make print statements optional
# TODO reuse/abstract profiling code from lik_profile()

#' Explore parameter space
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' The function is aimed at getting an idea of how the parameter space
#' of a model behaves, so that parameter identifiability problems and correlations
#' between parameters can be explored. To this end, the function samples a large
#' number of parameter sets by randomly drawing from each parameter's 95%
#' confidence interval (generated by [lik_profile()]). It then
#' checks how many of the parameter sets are within acceptable limits by comparing
#' the likelihood ratio of each parameter set vs. the original parameter set against
#' a chi-square distribution, with degrees of freedom (df) equal to the total number
#' of profiled parameters (outer rim) or to one (inner rim). If needed, the function
#' resamples until at least `nr_accept` parameter sets are within the inner rim.
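#'
#' Concretely, a sampled parameter set is accepted if `-2 * (LL_set - LL_orig) <
#' qchisq(0.95, df)`, where `df` equals the number of profiled parameters for
#' the outer rim and one for the inner rim.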
#'
#' @param x a list of [caliset] objects
#' @param par best-fit parameters from joint calibration
#' @param res output of [lik_profile()] function
#' @param output character vector, name of output column of [simulate()] that
#'  is used in calibration
#' @param data only needed if `x` is a [scenario]
#' @param sample_size number of samples to draw from each parameter interval
#' @param max_iter maximum number of iterations for re-drawing samples (within a smaller space) and repeating the process
#' @param nr_accept threshold for the number of points sampled within the inner rim
#' @param sample_factor multiplication factor for sampling (95% interval * sample factor)
#' @param individual if `FALSE` (default), the log likelihood is calculated across
#' the whole dataset. Alternatively, if `TRUE`, log likelihoods are calculated for
#' each (group of) *set*(s) individually.
#' @param log_scale if `TRUE`, the log likelihood is calculated on a log scale
#' (i.e., observations and predictions are log-transformed during calculation); defaults to `FALSE`
#' @param data_type Character argument, `"continuous"` (default) or `"count"`, to specify the data type
#' for the log likelihood calculations.
#' @param max_runs *deprecated* alias for `max_iter` parameter
#' @param ... additional parameters passed through to [simulate()]
#'
#' @return a `data.frame` of class `param_space` containing the accepted parameter
#' sets and their log likelihood statistics; see [plot.param_space] for visualization
#'
#' @seealso [plot.param_space]
#' @export
#' @autoglobal
#' @importFrom lifecycle is_present deprecated
#' @examples
#' \donttest{
#' library(dplyr)
#' # Example with Lemna model - physiological params
#' # Before applying the function, a model needs to be calibrated and its parameters profiled
#' # Inputs for likelihood profiling
#'
#' # observations - control run
#' obs <- schmitt2013 %>%
#'   filter(trial == "T0")
#'
#' # parameters after calibration
#' params <- c(
#'   k_phot_max = 5.663571,
#'   k_resp = 1.938689,
#'   Topt = 26.7
#' )
#'
#' # set parameter boundaries (if different from defaults)
#' bounds <- list(
#'   k_resp = list(0, 10),
#'   k_phot_max = list(0, 30),
#'   Topt = list(20, 30)
#' )
#'
#' # update metsulfuron
#' myscenario <- metsulfuron %>%
#'   set_init(c(BM = 1.2, E = 1, M_int = 0)) %>%
#'   set_param(list(
#'     k_0 = 5E-5,
#'     a_k = 0.25,
#'     BM50 = 17600,
#'     mass_per_frond = 0.1
#'   )) %>%
#'   set_noexposure() %>%
#'   set_param(params) %>%
#'   set_bounds(bounds)
#'
#' # Likelihood profiling
#' res <- lik_profile(
#'   x = myscenario,
#'   data = obs,
#'   output = "FrondNo",
#'   par = params,
#'   refit = FALSE,
#'   type = "fine",
#'   method = "Brent"
#' )
#' # plot
#' plot(res)
#'
#' # parameter space explorer
#' set.seed(1) # for reproducibility
#' res_space <- explore_space(
#'   x = myscenario,
#'   data = obs,
#'   par = params,
#'   res = res,
#'   output = "FrondNo",
#'   sample_size = 1000,
#'   max_iter = 20,
#'   nr_accept = 100
#' )
#'
#' plot(res_space)
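#'
#' # the result is a data.frame of accepted parameter sets
#' head(res_space)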
#'
#' }
explore_space <- function(x,
                          par,
                          res,
                          output,
                          data = NULL,
                          sample_size = 1000,
                          max_iter = 30,
                          nr_accept = 100,
                          sample_factor = 1.2,
                          individual = FALSE,
                          log_scale = FALSE,
                          data_type = c("continuous", "count"),
                          max_runs = deprecated(),
                          ...) {
  if(is_present(max_runs)) {
    lifecycle::deprecate_warn("1.5.0", "cvasi::explore_space(max_runs)", "cvasi::explore_space(max_iter)")
    max_iter <- max_runs
  }

  # some checks
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # convert to list of calibration sets if needed
  if (length(x) == 1) {
    if (is_scenario(x)) {
      if(is.null(data)) {
        stop("Scenario provided, but argument 'data' is missing")
      } else {
        #message("Scenario converted to calibration set")
        if(is.data.frame(data))
          data <- tox_data(data)
        x <- td2cs(x, data, output_var=output)
      }
    }
  }
  # check sensible use of function
  if(length(output) > 1){
    stop("Parameter `output` must have length of one")
  }
  if (length(names(res)) <= 1) {
    stop("parameter space exploration only makes sense for 2 or more parameters")
  }
  # check if the same output is used throughout
  #if(!(output %in% x[[1]]@scenario@endpoints)){
  #  stop("Specified output not present in scenario object (x)")
  #}
  #if(!(output %in% colnames(x[[1]]@data))){
  #  stop("Specified output not present in data of scenario object (x)")
  #}
  # check if explored parameters are part of the model
  if (!all(names(res) %in% names(x[[1]]@scenario@param))) {
    stop("Profiled parameters (in res) are not part of the scenario object (x)")
  }
  # check if boundaries for explored parameters are available
  if (!all(names(res) %in% names(x[[1]]@scenario@param.bounds))) {
    stop("No parameter boundaries available for explored parameters, please set bounds in scenario object (x)")
  }
  if(!is.numeric(sample_factor))
    stop("Argument 'sample_factor' must be numeric")
  if(length(sample_factor) != 1)
    stop("Argument 'sample_factor' must be of length one")
  if (is.na(sample_factor) || is.nan(sample_factor))
    stop("Argument 'sample_factor' has invalid value")
  if(sample_factor <= 1)
    stop("Argument 'sample_factor' out of range, must be greater than 1.0")


  # check that a correct option for data_type is entered
  data_type <- match.arg(data_type)


  # calculate log likelihood with original model, for the profiled pars
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  # get all data, and weights
  all_data <- lapply(x, slot, name = "data")
  data_tag <- unlist(lapply(x, slot, name = "tag"))
  names(all_data) <- data_tag
  data_weights <- unlist(lapply(x, slot, name = "weight"))

  # Make predictions with the original model
  rs <- eval_cs(set_param(x, par), output=output, verbose=FALSE, .ignore_method=TRUE, ...)

  # Calculate log likelihood with the original model
  #   Option 1: calculation of loglik across the whole dataset
  if (individual == FALSE) {
    # calc log lik
    ll_orig <- log_lik(
      npars = length(res),
      obs = rs$obs,
      pred = rs$pred,
      log_scale = log_scale,
      data_type = data_type
    )
  } else {
    #  Option 2: calculation for individual sub-datasets, which are then combined
    ll_list <- list()
    subsets <- data.frame(obs=rs$obs,
                          pred=rs$pred,
                          wgts=rs$wgts,
                          tags=unlist(rs$tags)) %>%
      dplyr::group_by(tags) %>%
      dplyr::group_split()

    for (i in seq_along(subsets)) {
      ll_list[[i]] <- log_lik(
        npars = length(res),
        obs = subsets[[i]]$obs,
        pred = subsets[[i]]$pred,
        log_scale = log_scale,
        data_type = data_type
      ) * unique(subsets[[i]]$wgts)
    }
    ll_orig <- sum(unlist(ll_list))
  }

  # obtain a (random uniform) sample from each parameter's profile region,
  # and include the original param values
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
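  # e.g., a profile region of [2, 5] with sample_factor = 1.2 yields uniform
  # draws from [2/1.2, 5*1.2] = [1.67, 6]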
  par_sample_list <- list()
  for (i in seq_along(res)) {
    par_sample_list[[i]] <- c(
      res[[i]]$orig_par_value, # original par value is put first
      stats::runif(sample_size,
        min = res[[i]]$prof_region[1] / sample_factor,
        max = res[[i]]$prof_region[2] * sample_factor
      )
    )
    names(par_sample_list)[[i]] <- names(res[i])
  }
  par_sample <- data.frame(par_sample_list)

  # calculate LL, LLR for each parameter set
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  LL <- NULL
  ## TODO parallelize sample evaluation
  for (i in seq_len(nrow(par_sample))) {

    # make predictions
    rs <- eval_cs(set_param(x, par_sample[i, ]), output=output, verbose=FALSE, .ignore_method=TRUE, ...)

    # calculate log likelihood
    #   Option 1: calculation of loglik across the whole dataset
    if (individual == FALSE) {
      # calc log lik
      ll_new <- log_lik( # list with only 1 entry
        npars = length(res),
        obs = rs$obs,
        pred = rs$pred,
        log_scale = log_scale,
        data_type = data_type
      )
    } else {
      #  Option 2: calculation for individual sub-datasets, which are then combined
      ll_list <- list()
      subsets <- data.frame(obs=rs$obs,
                            pred=rs$pred,
                            wgts=rs$wgts,
                            tags=unlist(rs$tags)) %>%
        dplyr::group_by(tags) %>%
        dplyr::group_split()

      # use 'j' here to avoid clobbering the outer loop variable 'i'
      for (j in seq_along(subsets)) {
        ll_list[[j]] <- log_lik(
          npars = length(res),
          obs = subsets[[j]]$obs,
          pred = subsets[[j]]$pred,
          log_scale = log_scale,
          data_type = data_type
        ) * unique(subsets[[j]]$wgts)
      }
      ll_new <- sum(unlist(ll_list))
    }

    LL <- c(LL, sum(unlist(ll_new)))
  } # end of loop per sample i

  # add likelihood to the dataframe
  par_sample$LL <- LL

  # calc likelihood ratio: -2*ln(likelihood ratio)
  par_sample$LLR <- -2 * (par_sample$LL - ll_orig) # matlab BYOM L. 704
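  # under the null hypothesis that a sampled parameter set fits the data as
  # well as the original set, this statistic approximately follows a chi-square
  # distribution (Wilks' theorem), which the cutoffs below rely on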

  # X2 difference test to compare nested models
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # cutoffs for the Chi-square test, for alpha = 0.05 significance level,
  # these values are used in the chi-square test to determine if a fit is significantly
  # different from a previous one (i.e. larger than the cutoff given here)
  chi_crit_j <- stats::qchisq(p=0.95, df=length(res)) # cut-off for the 95% profile region
  chi_crit_s <- stats::qchisq(p=0.95, df=1) # cut-off for single par. CI
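  # e.g., for two profiled parameters: chi_crit_j = qchisq(0.95, df = 2) ~ 5.99,
  # while chi_crit_s = qchisq(0.95, df = 1) ~ 3.84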

  # clean dataset based on criteria
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  par_sample$LLR_accept <- ifelse(par_sample$LLR < chi_crit_j, "accept", "reject") # accept combinations within the profile region
  par_sample$LLR_quality <- ifelse(par_sample$LLR < chi_crit_s, "Inner", "Outer") # give preference to those with 95% CI inner rim
  # calculate nr of points within inner rim
  inner_ind <- which(par_sample$LLR_quality == "Inner") # remember which ones are in the inner rim (used/updated later in the while loop)
  inner_size <- length(inner_ind)

  # Additional sampling (repeating steps above with slight modifications)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  sample_iter <- 1
  while (inner_size < nr_accept) { # number of accepted points should be >= nr_accept

    message("resampling parameter space, iteration: ", sample_iter)

    # obtain a (random normal) sample based on the previously "inner" rim samples
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ## FIXME TODO if 1st sampling resulted in a very small sample size of 'accepted'
    ##    then re-sampling might fail or will be unstable, 1st sampling must be repeated
    ##    until sample size n >= 10?
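    # e.g., if the inner-rim draws of a parameter have mean 3.1 and sd 0.4, new
    # candidates are drawn from Normal(mean = 3.1, sd = 0.4 * 1.5 = 0.6)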
    par_sample_list <- list()
    for (i in seq_along(res)) {
      par_sample_list[[i]] <- stats::rnorm(sample_size,
        mean = mean(par_sample[inner_ind, i]),
        sd = stats::sd(par_sample[inner_ind, i]) * 1.5
      )
      names(par_sample_list)[[i]] <- names(res[i])
    }
    par_sample_new <- data.frame(par_sample_list)

    # ensure that values are all within param bounds
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for (i in seq_along(res)) { # for each parameter
      nm <- names(res)[i]
      too_small <- which(par_sample_new[, i] < x[[1]]@scenario@param.bounds[[nm]][1])
      too_large <- which(par_sample_new[, i] > x[[1]]@scenario@param.bounds[[nm]][2])
      if(length(c(too_small, too_large)) > 0){ # remove samples outside boundaries (if present)
        par_sample_new <- par_sample_new[-c(too_small, too_large), ]
        if (nrow(par_sample_new) == 0) {
          stop("all sampled parameter values out of bounds")
        }
      }
    }

    # calculate LL, LLR for each parameter set
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    LL <- NULL
    ## TODO parallelize enumeration, combine code with initial LL/LLR loop
    for (i in seq_len(nrow(par_sample_new))) {

      rs <- eval_cs(set_param(x, par_sample_new[i, ]), output=output, verbose=FALSE, .ignore_method=TRUE, ...)

      # calculate log likelihood
      #   Option 1: calculation of loglik across the whole dataset
      if (individual == FALSE) {
        # calc log lik
        ll_new <- log_lik( # list with only 1 entry
          npars = length(res),
          obs = rs$obs,
          pred = rs$pred,
          log_scale = log_scale,
          data_type = data_type
        )
      } else {
        #  Option 2: calculation for individual sub-datasets, which are then combined
        ll_list <- list()
        subsets <- data.frame(obs=rs$obs,
                              pred=rs$pred,
                              wgts=rs$wgts,
                              tags=unlist(rs$tags)) %>%
          dplyr::group_by(tags) %>%
          dplyr::group_split()

        # use 'j' here to avoid clobbering the outer loop variable 'i'
        for (j in seq_along(subsets)) {
          ll_list[[j]] <- log_lik(
            npars = length(res),
            obs = subsets[[j]]$obs,
            pred = subsets[[j]]$pred,
            log_scale = log_scale,
            data_type = data_type
          ) * unique(subsets[[j]]$wgts)
        }
        ll_new <- sum(unlist(ll_list))
      }

      LL <- c(LL, sum(unlist(ll_new)))
    } # end of loop per sample i

    # add likelihood to the dataframe
    par_sample_new$LL <- LL

    # calc likelihood ratio: -2*ln(likelihood ratio)
    par_sample_new$LLR <- -2 * (par_sample_new$LL - ll_orig) # matlab BYOM L. 704


    # clean dataset based on criteria
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    par_sample_new$LLR_accept <- ifelse(par_sample_new$LLR < chi_crit_j, "accept", "reject") # accept combinations within the profile region
    par_sample_new$LLR_quality <- ifelse(par_sample_new$LLR < chi_crit_s, "Inner", "Outer") # give preference to those with 95% CI inner rim

    # join with previous df
    par_sample <- rbind(par_sample, par_sample_new)

    # calculate nr of points within inner rim (and update inner_ind)
    inner_ind <- which(par_sample$LLR_quality == "Inner") # remember which ones are in the inner rim (used/updated later in the while loop)
    inner_size <- length(inner_ind)

    sample_iter <- sample_iter + 1
    if (sample_iter > max_iter) {
      message("Maximum number of iterations exceeded, increase 'max_iter' for improved results")
      break
    }
  } # end while loop

  # find best LLR value, and compare with original LLR (which is by definition 0)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if (min(par_sample$LLR, na.rm=TRUE) < 0) {
    message("better optimum found in space explorer")
    best_fit_ind <- which.min(par_sample$LLR)
    par_sample$LLR_quality[best_fit_ind] <- "Best fit"
  }
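  # row 1 always holds the original parameter set, because it was put first
  # during sampling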
  par_sample[1, "LLR_quality"] <- "Original fit"


  # clean and return
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  out <- par_sample %>%
    dplyr::filter(LLR_accept == "accept") %>%
    dplyr::select(!c(LLR_accept))

  class(out) <- c("param_space", class(out))
  return(out)
}
