# TODO make print statements optional
# TODO reuse/abstract profiling code from lik_profile()
#' Explore parameter space
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' The function is aimed at getting an idea of how the parameter space
#' of a model behaves, so that parameter identifiability problems and correlations
#' between parameters can be explored. To this end, the function samples a large
#' number of parameter sets by randomly drawing from each parameter's 95%
#' confidence interval (generated by [lik_profile()]). It then
#' checks how many of the parameter sets are within acceptable limits by comparing
#' the likelihood ratio of each sampled parameter set vs. the original parameter set
#' against a chi-square distribution, with the total number of profiled parameters
#' as degrees of freedom (df) for the outer rim, or one df for the inner rim. If
#' needed, the function resamples until at least `nr_accept` parameter sets are
#' within the inner rim.
#'
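#' @details
#' Acceptance of a sampled parameter set is based on the likelihood ratio
#' statistic `-2 * (LL_sample - LL_original)`, which is compared against
#' chi-square cutoffs at the 0.05 significance level. As a minimal illustration,
#' assuming a model with three profiled parameters, the two cutoffs can be
#' computed with `stats::qchisq()`:
#'
#' ```
#' stats::qchisq(p = 0.95, df = 3)  # outer rim cutoff, approx. 7.81
#' stats::qchisq(p = 0.95, df = 1)  # inner rim cutoff, approx. 3.84
#' ```
#'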
#' @param x a list of [caliset] objects
#' @param par best-fit parameters from a joint calibration
#' @param res output of [lik_profile()] function
#' @param output character vector, name of output column of [simulate()] that
#' is used in calibration
#' @param data only needed if `x` is a [scenario]
#' @param sample_size number of samples to draw from each parameter interval
#' @param max_iter maximum number of iterations for redrawing samples (within a
#'   narrowed space) and repeating the process
#' @param nr_accept threshold for the number of sampled points within the inner rim
#' @param sample_factor multiplication factor for sampling (95% interval * sample factor)
#' @param individual if `FALSE` (default), the log likelihood is calculated across
#' the whole dataset. Alternatively, if `TRUE`, log likelihoods are calculated for
#' each (group of) *set*(s) individually.
#' @param log_scale if `TRUE`, the log likelihood is calculated on a log scale
#'   (i.e., observations and predictions are log-transformed during calculation);
#'   defaults to `FALSE`
#' @param data_type Character argument, `"continuous"` (default) or `"count"`, to specify the data type
#' for the log likelihood calculations.
#' @param max_runs *deprecated* alias for `max_iter` parameter
#' @param ... additional parameters passed through to [simulate()]
#'
#' @return a `data.frame` of class `param_space` containing the accepted parameter
#'   sets and their log-likelihood ratios; it can be passed to [plot.param_space()]
#'   to explore the parameter space graphically
#'
#' @seealso [plot.param_space]
#' @export
#' @autoglobal
#' @importFrom lifecycle is_present deprecated
#' @examples
#' \donttest{
#' library(dplyr)
#' # Example with Lemna model - physiological params
#' # Before applying the function, a model needs to be calibrated and its parameters profiled
#' # Inputs for likelihood profiling
#'
#' # observations - control run
#' obs <- schmitt2013 %>%
#'   filter(trial == "T0")
#'
#' # parameters after calibration
#' params <- c(
#'   k_phot_max = 5.663571,
#'   k_resp = 1.938689,
#'   Topt = 26.7
#' )
#'
#' # set parameter boundaries (if different from defaults)
#' bounds <- list(
#'   k_resp = list(0, 10),
#'   k_phot_max = list(0, 30),
#'   Topt = list(20, 30)
#' )
#'
#' # update metsulfuron
#' myscenario <- metsulfuron %>%
#'   set_init(c(BM = 1.2, E = 1, M_int = 0)) %>%
#'   set_param(list(
#'     k_0 = 5E-5,
#'     a_k = 0.25,
#'     BM50 = 17600,
#'     mass_per_frond = 0.1
#'   )) %>%
#'   set_noexposure() %>%
#'   set_param(params) %>%
#'   set_bounds(bounds)
#'
#' # Likelihood profiling
#' res <- lik_profile(
#'   x = myscenario,
#'   data = obs,
#'   output = "FrondNo",
#'   par = params,
#'   refit = FALSE,
#'   type = "fine",
#'   method = "Brent"
#' )
#' # plot
#' plot(res)
#'
#' # parameter space explorer
#' set.seed(1) # for reproducibility
#' res_space <- explore_space(
#'   x = myscenario,
#'   data = obs,
#'   par = params,
#'   res = res,
#'   output = "FrondNo",
#'   sample_size = 1000,
#'   max_iter = 20,
#'   nr_accept = 100
#' )
#'
#' plot(res_space)
#'
#' }
explore_space <- function(x,
par,
res,
output,
data,
sample_size = 1000,
max_iter = 30,
nr_accept = 100,
sample_factor = 1.2,
individual = FALSE,
log_scale = FALSE,
data_type = c("continuous", "count"),
                          max_runs = deprecated(),
...) {
if(is_present(max_runs)) {
lifecycle::deprecate_warn("1.5.0", "cvasi::explore_space(max_runs)", "cvasi::explore_space(max_iter)")
max_iter <- max_runs
}
# some checks
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# convert to list of calibration sets if needed
if (length(x) == 1) {
if (is_scenario(x)) {
if(is.null(data)) {
stop("Scenario provided, but argument 'data' is missing")
} else {
#message("Scenario converted to calibration set")
if(is.data.frame(data))
data <- tox_data(data)
x <- td2cs(x, data, output_var=output)
}
}
}
# check sensible use of function
if(length(output) > 1){
stop("Parameter `output` must have length of one")
}
if (length(names(res)) <= 1) {
stop("parameter space exploration only makes sense for 2 or more parameters")
}
# check if the same output is used throughout
#if(!(output %in% x[[1]]@scenario@endpoints)){
# stop("Specified output not present in scenario object (x)")
#}
#if(!(output %in% colnames(x[[1]]@data))){
# stop("Specified output not present in data of scenario object (x)")
#}
# check if explored parameters are part of the model
  if (!all(names(res) %in% names(x[[1]]@scenario@param))) {
    stop("Profiled parameters (in res) are not part of the scenario object (x)")
  }
# check if boundaries for explored parameters are available
  if (!all(names(res) %in% names(x[[1]]@scenario@param.bounds))) {
    stop("No parameter boundaries available for explored parameters, please set bounds in scenario object (x)")
  }
if(!is.numeric(sample_factor))
stop("Argument 'sample_factor' must be numeric")
if(length(sample_factor) != 1)
stop("Argument 'sample_factor' must be of length one")
  if (is.na(sample_factor) || is.nan(sample_factor))
stop("Argument 'sample_factor' has invalid value")
if(sample_factor <= 1)
stop("Argument 'sample_factor' out of range, must be greater than 1.0")
# check that a correct option for data_type is entered
data_type <- match.arg(data_type)
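  # match.arg() falls back to the first option, "continuous", if no value was supplied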
# calculate log likelihood with original model, for the profiled pars
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# get all data, and weights
all_data <- lapply(x, slot, name = "data")
data_tag <- unlist(lapply(x, slot, name = "tag"))
names(all_data) <- data_tag
data_weights <- unlist(lapply(x, slot, name = "weight"))
# Make predictions with the original model
rs <- eval_cs(set_param(x, par), output=output, verbose=FALSE, .ignore_method=TRUE, ...)
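  # rs holds observations, predictions, weights, and dataset tags
  # (rs$obs, rs$pred, rs$wgts, rs$tags) for the likelihood calculations below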
  # Calculate log likelihood with the original model
# Option 1: calculation of loglik across the whole dataset
if (individual == FALSE) {
# calc log lik
ll_orig <- log_lik(
npars = length(res),
obs = rs$obs,
pred = rs$pred,
log_scale = log_scale,
data_type = data_type
)
} else {
# Option 2: calculation for individual sub-datasets, which are then combined
ll_list <- list()
subsets <- data.frame(obs=rs$obs,
pred=rs$pred,
wgts=rs$wgts,
tags=unlist(rs$tags)) %>%
dplyr::group_by(tags) %>%
dplyr::group_split()
for (i in seq_along(subsets)) {
ll_list[[i]] <- log_lik(
npars = length(res),
obs = subsets[[i]]$obs,
pred = subsets[[i]]$pred,
log_scale = log_scale,
data_type = data_type
) * unique(subsets[[i]]$wgts)
}
ll_orig <- sum(unlist(ll_list))
}
# obtain a (random uniform) sample from each parameter's profile region,
# and include the original param values
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
par_sample_list <- list()
  for (i in seq_along(res)) {
par_sample_list[[i]] <- c(
res[[i]]$orig_par_value, # original par value is put first
stats::runif(sample_size,
min = res[[i]]$prof_region[1] / sample_factor,
max = res[[i]]$prof_region[2] * sample_factor
)
)
names(par_sample_list)[[i]] <- names(res[i])
}
par_sample <- data.frame(par_sample_list)
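  # note: the first row of par_sample is the original parameter set; it is
  # labeled as "Original fit" before results are returned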
# calculate LL, LLR for each parameter set
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
LL <- NULL
## TODO parallelize sample evaluation
  for (i in seq_len(nrow(par_sample))) {
# make predictions
rs <- eval_cs(set_param(x, par_sample[i, ]), output=output, verbose=FALSE, .ignore_method=TRUE, ...)
# calculate log likelihood
# Option 1: calculation of loglik across the whole dataset
if (individual == FALSE) {
# calc log lik
ll_new <- log_lik( # list with only 1 entry
npars = length(res),
obs = rs$obs,
pred = rs$pred,
log_scale = log_scale,
data_type = data_type
)
} else {
# Option 2: calculation for individual sub-datasets, which are then combined
ll_list <- list()
subsets <- data.frame(obs=rs$obs,
pred=rs$pred,
wgts=rs$wgts,
tags=unlist(rs$tags)) %>%
dplyr::group_by(tags) %>%
dplyr::group_split()
      for (j in seq_along(subsets)) { # 'j' to avoid shadowing the outer loop index 'i'
        ll_list[[j]] <- log_lik(
          npars = length(res),
          obs = subsets[[j]]$obs,
          pred = subsets[[j]]$pred,
          log_scale = log_scale,
          data_type = data_type
        ) * unique(subsets[[j]]$wgts)
}
ll_new <- sum(unlist(ll_list))
}
LL <- c(LL, sum(unlist(ll_new)))
} # end of loop per sample i
# add likelihood to the dataframe
par_sample$LL <- LL
# calc likelihood ratio: -2*ln(likelihood ratio)
par_sample$LLR <- -2 * (par_sample$LL - ll_orig) # matlab BYOM L. 704
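  # the original fit has LLR = 0 by construction; negative values indicate
  # sampled parameter sets that fit the data better than the original fit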
# X2 difference test to compare nested models
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# cutoffs for the Chi-square test, for alpha = 0.05 significance level,
# these values are used in the chi-square test to determine if a fit is significantly
# different from a previous one (i.e. larger than the cutoff given here)
chi_crit_j <- stats::qchisq(p=0.95, df=length(res)) # cut-off for the 95% profile region
chi_crit_s <- stats::qchisq(p=0.95, df=1) # cut-off for single par. CI
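  # for reference: chi_crit_s is approx. 3.84; chi_crit_j is approx. 7.81 for df = 3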
# clean dataset based on criteria
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
par_sample$LLR_accept <- ifelse(par_sample$LLR < chi_crit_j, "accept", "reject") # accept combinations within the profile region
par_sample$LLR_quality <- ifelse(par_sample$LLR < chi_crit_s, "Inner", "Outer") # give preference to those with 95% CI inner rim
# calculate nr of points within inner rim
inner_ind <- which(par_sample$LLR_quality == "Inner") # remember which ones are in the inner rim (used/updated later in the while loop)
inner_size <- length(inner_ind)
# Additional sampling (repeating steps above with slight modifications)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sample_iter <- 1
while (inner_size < nr_accept) { # number of accepted points should be >= nr_accept
message("resampling parameter space, iteration: ", sample_iter)
# obtain a (random normal) sample based on the previously "inner" rim samples
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## FIXME TODO if 1st sampling resulted in a very small sample size of 'accepted'
## then re-sampling might fail or will be unstable, 1st sampling must be repeated
## until sample size n >= 10?
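    # new values are drawn from a normal distribution centered on the mean of
    # the current inner-rim samples, with the sd inflated by 1.5 to widen the search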
par_sample_list <- list()
    for (i in seq_along(res)) {
par_sample_list[[i]] <- stats::rnorm(sample_size,
mean = mean(par_sample[inner_ind, i]),
sd = stats::sd(par_sample[inner_ind, i]) * 1.5
)
names(par_sample_list)[[i]] <- names(res[i])
}
par_sample_new <- data.frame(par_sample_list)
# ensure that values are all within param bounds
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for (i in seq_along(res)) { # for each parameter
nm <- names(res)[i]
too_small <- which(par_sample_new[, i] < x[[1]]@scenario@param.bounds[[nm]][1])
too_large <- which(par_sample_new[, i] > x[[1]]@scenario@param.bounds[[nm]][2])
if(length(c(too_small, too_large)) > 0){ # remove samples outside boundaries (if present)
par_sample_new <- par_sample_new[-c(too_small, too_large), ]
if (nrow(par_sample_new) == 0) {
stop("all sampled parameter values out of bounds")
}
}
}
# calculate LL, LLR for each parameter set
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
LL <- NULL
    ## TODO parallelize enumeration, combine code with initial LL/LLR loop
    for (i in seq_len(nrow(par_sample_new))) {
rs <- eval_cs(set_param(x, par_sample_new[i, ]), output=output, verbose=FALSE, .ignore_method=TRUE, ...)
# calculate log likelihood
# Option 1: calculation of loglik across the whole dataset
if (individual == FALSE) {
# calc log lik
ll_new <- log_lik( # list with only 1 entry
npars = length(res),
obs = rs$obs,
pred = rs$pred,
log_scale = log_scale,
data_type = data_type
)
} else {
# Option 2: calculation for individual sub-datasets, which are then combined
ll_list <- list()
subsets <- data.frame(obs=rs$obs,
pred=rs$pred,
wgts=rs$wgts,
tags=unlist(rs$tags)) %>%
dplyr::group_by(tags) %>%
dplyr::group_split()
        for (j in seq_along(subsets)) { # 'j' to avoid shadowing the outer loop index 'i'
          ll_list[[j]] <- log_lik(
            npars = length(res),
            obs = subsets[[j]]$obs,
            pred = subsets[[j]]$pred,
            log_scale = log_scale,
            data_type = data_type
          ) * unique(subsets[[j]]$wgts)
}
ll_new <- sum(unlist(ll_list))
}
LL <- c(LL, sum(unlist(ll_new)))
} # end of loop per sample i
# add likelihood to the dataframe
par_sample_new$LL <- LL
# calc likelihood ratio: -2*ln(likelihood ratio)
par_sample_new$LLR <- -2 * (par_sample_new$LL - ll_orig) # matlab BYOM L. 704
# clean dataset based on criteria
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
par_sample_new$LLR_accept <- ifelse(par_sample_new$LLR < chi_crit_j, "accept", "reject") # accept combinations within the profile region
par_sample_new$LLR_quality <- ifelse(par_sample_new$LLR < chi_crit_s, "Inner", "Outer") # give preference to those with 95% CI inner rim
# join with previous df
par_sample <- rbind(par_sample, par_sample_new)
# calculate nr of points within inner rim (and update inner_ind)
inner_ind <- which(par_sample$LLR_quality == "Inner") # remember which ones are in the inner rim (used/updated later in the while loop)
inner_size <- length(inner_ind)
sample_iter <- sample_iter + 1
    if (sample_iter > max_iter) {
      message("Maximum number of iterations exceeded, increase 'max_iter' for improved results")
      break
    }
} # end while loop
  # find best LLR value, and compare with original LLR (which is by definition = 0)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if (min(par_sample$LLR, na.rm=TRUE) < 0) {
message("better optimum found in space explorer")
best_fit_ind <- which.min(par_sample$LLR)
par_sample$LLR_quality[best_fit_ind] <- "Best fit"
}
par_sample[1, "LLR_quality"] <- "Original fit"
# clean and return
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ret <- par_sample %>% # renamed to avoid shadowing the 'output' argument
    dplyr::filter(LLR_accept == "accept") %>%
    dplyr::select(!c(LLR_accept))
  class(ret) <- c("param_space", class(ret))
  return(ret)
}