## Helper functions for uniform sphere sampling
#'
#' @importFrom stats rexp rnorm
runifs <- function(n, d, c = rep(0, d), r = 1) {
init_points <- matrix(rnorm(n*d, 0, 1), nrow = n, byrow = T)
exps <- rexp(n, 0.5)
denoms <- (exps + apply(init_points, 1, function(x) sum(x^2)))^(0.5)
swept <- sweep(init_points, 1, denoms, "/")
return(t(apply(swept, 1, function(x) x*r + c)))
}
punifs <- function(x, c = rep(0, length(x)), r = 1) {
return(ifelse(sum((x-c)^2 / r^2) <= 1, 1, 0))
}
# Helper function to obtain the most 'recent' emulators for each output.
obtain_recent <- function(ems, dup = FALSE, showReduce = TRUE) {
if (!is.null(ems$expectation)) {
em_indices <- duplicated(map_chr(ems$expectation, "output_name"))
if (!dup) em_indices <- !em_indices
ems <- list(
expectation = ems$expectation[em_indices],
variance = ems$variance[em_indices]
)
}
else if (!is.null(ems$mode1)) {
em_indices <- duplicated(map_chr(ems$mode1$expectation, "output_name"))
if (!dup) em_indices <- !em_indices
ems <- list(
mode1 = list(
expectation = ems$mode1$expectation[em_indices],
variance = ems$mode1$variance[em_indices]
),
mode2 = list(
expectation = ems$mode2$expectation[em_indices],
variance = ems$mode2$variance[em_indices]
),
prop = ems$prop[[1]]
)
}
else {
em_indices <- duplicated(map_chr(ems, "output_name"))
if (!dup) em_indices <- !em_indices
ems <- ems[em_indices]
}
if (showReduce) return(list(ems = ems, reduced = any(em_indices == FALSE)))
return(ems)
}
## Function for pca transformation based on a sample of points
pca_transform <- function(x, s_points, forward = TRUE) {
if (is.data.frame(s_points)) s_points <- data.matrix(s_points)
if (is.data.frame(x)) x <- data.matrix(x)
s_trafo <- sweep(
sweep(
s_points, 2,
apply(s_points, 2, mean), "-"
),
2, apply(s_points, 2, sd), "/"
)
s_estruct <- eigen(cov(s_trafo))
s_estruct$values <- map_dbl(s_estruct$values, ~ifelse(. < 1e-10, 1e-10, .))
if (forward) {
x <- sweep(
sweep(
x, 2, apply(s_points, 2, mean), "-"
),
2, apply(s_points, 2, sd), "/"
)
return(x %*% s_estruct$vectors %*% diag(1/sqrt(s_estruct$values)))
}
pre_traf <- x %*% diag(sqrt(s_estruct$values)) %*% t(s_estruct$vectors)
return(sweep(sweep(pre_traf, 2, apply(s_points, 2, sd), "*"), 2, apply(s_points, 2, mean), "+"))
}
## Function to check points are in range
in_range <- function(data, ranges) {
apply(data, 1, function(x)
all(map_lgl(seq_along(ranges),
~x[.] >= ranges[[.]][1] && x[.] <= ranges[[.]][2])))
}
#' Generate Maximin Sample of Points
#'
#' Create a maximin sample from a collection of valid points
#'
#' The point proposal methods in \code{\link{generate_new_design}} can have some
#' undesirable properties; particularly over-representation of the boundary of
#' the non-implausible space. This function attempts to find an 'optimal' space-
#' filling design, using the maximin criteria. A subset of the candidate points
#' are selected and the minimum distance between any pair of points is selected;
#' the subset of points which maximises this measure is returned.
#'
#' @param points The candidate points to select from.
#' @param n The number of points desired in the final selection.
#' @param reps The number of subselections to make before returning a choice
#' @param nms The names of the inputs parameters of the points.
#'
#' @return A data.frame containing the maximin subset.
#'
#' @export
maximin_sample <- function(points, n, reps = 1000, nms) {
c_measure <- op <- NULL
opts <- map(1:reps, function(rep) {
tp <- points[sample(nrow(points), n), , drop = FALSE]
if (!is.data.frame(tp)) tp <- setNames(tp, nms)
measure <- min(dist(tp))
return(list(points = tp, value = measure))
})
return(opts[[which.max(map_dbl(opts, "value"))]]$points)
}
#' Generate Proposal Points
#'
#' Given a set of trained emulators, this finds the next set of points that will be
#' informative for a subsequent wave of emulation or, in the event that the
#' current wave is the last desired, a set of points that optimally span the
#' parameter region of interest. There are a number of different methods that can
#' be utilised, alone or in combination with one another, to generate the points.
#'
#' If the \code{method} argument contains \code{'lhs'}, a Latin hypercube is generated and
#' non-implausible points from this design are retained. If more points are accepted
#' than the next design requires, then points are subselected using a maximin argument.
#'
#' If \code{method} contains \code{'line'}, then line sampling is performed. Given an
#' already established collection of non-implausible points, rays are drawn between
#' pairs of points (selected so as to maximise the distance between them) and more
#' points are sampled along the rays. Points thus sampled are retained if they lie
#' near a boundary of the non-implausible space, or on the boundary of the parameter
#' region of interest.
#'
#' If \code{method} contains \code{'importance'}, importance sampling is performed.
#' Given a collection of non-implausible points, a mixture distribution of either
#' multivariate normal or uniform ellipsoid proposals around the current non-implausible
#' set are constructed. The optimal standard deviation (in the normal case) or radius
#' (in the ellipsoid case) is determined using a burn-in phase, and points are
#' proposed until the desired number of points have been found.
#'
#' If \code{method} contains \code{'slice'}, then slice sampling is performed. Given
#' a single known non-implausible point, a minimum enclosing hyperrectangle (perhaps
#' after transforming the space) is determined and points are sampled for each dimension
#' of the parameter space uniformly, shrinking the minimum enclosing hyperrectangle as
#' appropriate. This method is akin to to a Gibbs sampler.
#'
#' If \code{method} contains \code{'optical'}, then optical depth sampling is used.
#' Given a set of non-implausible points, an approximation of the one-dimensional
#' marginal distributions for each parameter can be determined. From these derived
#' marginals, points are sampled and subject to rejection as in the LHD sampling.
#'
#' For any sampling strategy, the parameters \code{ems}, \code{n_points}, and \code{z}
#' must be provided. All methods rely on a means of assessing point suitability, which
#' we refer to as an implausibility measure. By default, this uses nth-maximum implausibility
#' as provided by \code{\link{nth_implausible}}; a user-defined method can be provided
#' instead by supplying the function call to \code{opts[["accept_measure"]]}. Any
#' such function must take at least five arguments: the emulators, the points, the
#' targets, and a cutoff, as well as a \code{...} argument to ensure compatibility with
#' the default behaviour of the point proposal method. Note that, in accordance with
#' the default functionality of \code{\link{nth_implausible}}, if emulating more than
#' 10 outputs and an explicit \code{opts$nth} argument is not provided, then second-max
#' implausibility is used as the measure.
#'
#' The option \code{opts[["seek"]]} determines how many points should be chosen that
#' have a higher probability of matching targets, as opposed to not missing targets. Due
#' to the danger of such an approach,
#' this value should not be too high and should be used sparingly at early waves;
#' even at later waves, it is inadvisable to seek more than 10\% of the output points
#' using this metric. The default is \code{seek = 0}, and can be provided as either
#' a percentage of points desired (in the range [0,1]) or the fixed number of points.
#'
#' The default behaviour is as follows. A set of initial points are generated from a
#' large LHD; line sampling is performed to find the boundaries of the space; then importance
#' sampling is used to fill out the space. The proposed set of points are thinned and
#' both line and importance sampling are applied again; this resampling behaviour is
#' controlled by \code{opts[["resample"]]}, where \code{resample = n} indicates that
#' the proposal will be thinned and resampled from \code{n} times (resulting in \code{n+1}
#' proposal stages).
#'
#' In regions where the non-implausible space at a given cutoff value is very hard to find,
#' the point proposal will start at a higher cutoff where it can find a space-filling design.
#' Given such a design at a higher cutoff, it can subselect to a lower cutoff by demanding
#' some percentage of the proposed points are retained and repeat. This approach terminates
#' if the 'ladder' of cutoffs reaches the desired cutoff, or if the process asymptotes at
#' a particular higher cutoff. The opts \code{ladder_tolerance} and \code{cutoff_tolerance}
#' determine the minimum improvement required in consecutive cutoffs for the process to not
#' be considered to be asymptoting and the level of closeness to the desired cutoff at which
#' we are prepared to stop, respectively. For instance, setting \code{ladder_tolerance} to
#' 0.1 and \code{cutoff_tolerance} to 0.01, with a cutoff of 3, will terminate the process
#' if two consecutive cutoffs proposed are within 0.1 of each other, or when the points proposed
#' all have implausibility less than the 3.01.
#'
#' These methods may work slowly, or not at all, if the target space is extremely small in
#' comparison with the initial non-yet-ruled-out (NROY) space; it may also fail to give a
#' representative sample if the target space is formed of disconnected regions of different
#' volumes.
#'
#' @section Arguments within \code{opts}:
#' \describe{
#' \item{accept_measure}{A custom implausibility measure to be used.}
#' \item{cluster}{Whether to try to apply emulator clustering.}
#' \item{cutoff_tolerance}{Tolerance for an obtained cutoff to be similar enough to that desired.}
#' \item{ladder_tolerance}{Tolerance with which to determine if the process is asymptoting.}
#' \item{nth}{The level of nth implausibility to apply, if using the default implausibility.}
#' \item{resample}{How many times to perform the resampling step once points are found.}
#' \item{seek}{How many 'good' points should be sought: either as an integer or a ratio.}
#' \item{to_file}{If output is to be written to file periodically, the file location.}
#' \item{points.factor (LHS, Cluster LHS)}{How many more points than desired to sample.}
#' \item{pca_lhs (LHS)}{Whether to apply PCA to the space before proposing.}
#' \item{n_lines (Line)}{How many lines to draw.}
#' \item{ppl (Line)}{The number of points to sample per line.}
#' \item{imp_distro (Importance)}{The distribution to propose around points.}
#' \item{imp_scale (Importance)}{The radius, or standard deviation, of proposed distributions.}
#' \item{pca_slice (Slice)}{Whether to apply PCA to the space before slice sampling.}
#' \item{seek_distro (Seek)}{The distribution to apply when looking for 'good' points.}
#' }
#'
#' @importFrom mvtnorm dmvnorm rmvnorm
#' @importFrom stats setNames runif dist cov
#' @importFrom utils write.csv stack
#' @importFrom purrr imap
#' @importFrom lhs randomLHS
#'
#' @param ems A list of \code{\link{Emulator}} objects, trained
#' on previous design points.
#' @param n_points The desired number of points to propose.
#' @param z The targets to match to.
#' @param method Which methods to use.
#' @param cutoff The value of the cutoff to use to assess suitability.
#' @param plausible_set An optional set of known non-implausible points, to avoid LHD sampling.
#' @param verbose Should progress statements be printed to the console?
#' @param thin Should maximin sampling be applied as part of the proposal stage?
#' @param opts A named list of opts as described.
#' @param ... Any parameters to pass via chaining to individual sampling functions (eg \code{distro}
#' for importance sampling or \code{ordering} for collecting emulators).
#'
#' @return A data.frame containing the set of new points upon which to run the model.
#'
#' @export
#'
#' @examples
#' \donttest{ # Excessive runtime
#' # A simple example that uses number of the native and ... parameter opts.
#' pts <- generate_new_design(SIREmulators$ems, 100, SIREmulators$targets,
#' distro = 'sphere', opts = list(resample = 0))
#' # Non-default methods
#' pts_slice <- generate_new_design(SIREmulators$ems, 100, SIREmulators$targets,
#' method = 'slice')
#' ## Example using custom measure functionality
#' custom_measure <- function(ems, x, z, cutoff, ...) {
#' imps_df <- nth_implausible(ems, x, z, get_raw = TRUE)
#' sorted_imps <- t(apply(imps_df, 1, sort, decreasing = TRUE))
#' imps1 <- sorted_imps[,1] <= cutoff
#' imps2 <- sorted_imps[,2] <= cutoff - 0.5
#' constraint <- apply(x, 1, function(y) y[[1]] <= 0.4)
#' return(imps1 & imps2 & constraint)
#' }
#' pts_custom <- generate_new_design(SIREmulators$ems, 100, SIREmulators$targets,
#' opts = list(accept_measure = custom_measure))
#' }
generate_new_design <- function(ems, n_points, z, method = "default", cutoff = 3,
plausible_set, verbose = interactive(), thin = TRUE,
opts = NULL, ...) {
if (is.null(opts)) opts <- list(...)
else {
collected_opts <- unlist(c(opts, list(...)), recursive = FALSE)
opts <- as.list(collected_opts[!duplicated(names(collected_opts))])
}
if (is.null(opts$accept_measure)) opts$accept_measure <- "default"
if (!is.null(opts$cluster) && opts$cluster == 1) opts$cluster <- TRUE
if (is.null(opts$cluster) || !is.logical(opts$cluster)) opts$cluster <- FALSE
if (is.null(opts$use_collect)) opts$use_collect <- TRUE
else tryCatch(is.logical(opts$use_collect), warning = function(e) {warning("Emulator collection setting not logical; setting to TRUE"); opts$use_collect <- TRUE})
if (is.null(opts$cutoff_tolerance)) opts$cutoff_tolerance <- 0.01
else tryCatch(opts$cutoff_tolerance <- as.numeric(opts$cutoff_tolerance), warning = function(e) {warning("Cutoff tolerance is not numeric; setting to 0.01"); opts$cutoff_tolerance <- 0.01})
if (is.null(opts$ladder_tolerance)) opts$ladder_tolerance <- 0.1
else tryCatch(opts$ladder_tolerance <- as.numeric(opts$ladder_tolerance), warning = function(e) {warning("Ladder tolerance is not numeric; setting to 0.1"); opts$ladder_tolerance <- 0.1})
if (is.null(opts$nth)) opts$nth <- NA
else tryCatch(opts$nth <- as.numeric(opts$nth), warning = function(e) {warning("Nth-implausibility nth is not numeric; setting to 1"); opts$nth <- 1})
if (is.null(opts$resample)) opts$resample <- 1
else tryCatch(opts$resample <- as.numeric(opts$resample), warning = function(e) {warning("Resample number is not numeric; setting to 1"); opts$resample <- 1})
if (is.null(opts$seek)) opts$seek <- 0
else tryCatch(opts$seek <- as.numeric(opts$seek), warning = function(e) {warning("Seek value is not numeric; setting to 0"); opts$seek <- 0})
if (opts$seek > n_points) {
warning("Value of seek is larger than the number of desired points; setting equal to the number of points.")
opts$seek <- n_points
}
if (is.null(opts$to_file)) opts$to_file <- NA
if (!is.na(opts$to_file)) { #nocov start
tryCatch(
write.csv(data.frame(), file = opts$to_file, row.names = FALSE),
error = function(e) {warning("Cannot open directory provided in to_file; output will not be saved to an external file"); opts$to_file <- NA}
)
} #nocov end
if (is.null(list(...)[["cutoff_info"]])) min_cutoff <- cutoff
else min_cutoff <- list(...)[["cutoff_info"]][1]
if (opts$use_collect)
ems <- collect_emulators(ems, z, cutoff, ...)
ranges <- getRanges(ems)
if (is.na(opts$nth)) {
if (!is.null(ems$expectation))
nems <- length(unique(map_chr(ems$expectation, "output_name")))
else if (!is.null(ems$mode1))
nems <- length(unique(map_chr(ems$mode1$expectation, "output_name")))
else
nems <- length(unique(map_chr(ems, "output_name")))
opts$nth <- ifelse(nems > 10, 2, 1)
}
if (is.character(opts$accept_measure) && opts$accept_measure == "default") imp_func <- function(ems, x, z, ...) nth_implausible(ems, x, z, n = opts$nth, ...)
if (length(method) == 1 && method == "default") {
method <- c('lhs', 'line', 'importance')
user_select <- FALSE
}
else user_select <- TRUE
possible_methods <- c('lhs', 'line', 'importance', 'slice', 'optical')
which_methods <- possible_methods[possible_methods %in% method]
n_current <- 0
if (any(!method %in% possible_methods)) {
possible_user_methods <- which(!method %in% possible_methods)
for (i in possible_user_methods) {
tryCatch({get(i); which_methods <- c(which_methods, i)},
error = function(e) warning(paste("Method", i, "not found.")))
}
}
if (missing(plausible_set) || 'lhs' %in% which_methods) {
if (verbose) cat("Proposing from LHS...\n") #nocov
if (!opts$cluster) {
lh_gen <- lhs_gen(ems, ranges, max(n_points, 10*length(ranges)),
z, cutoff, verbose, opts)
points <- lh_gen$points
this_cutoff <- lh_gen$cutoff
}
else {
cluster_ems <- obtain_recent(ems)
cluster_gen <- lhs_gen_cluster(cluster_ems$ems, ranges,
max(n_points, 10*length(ranges)),
z, cutoff, verbose, opts)
if (cluster_ems$reduced) {
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
leftover_imps <- imp_func(
obtain_recent(ems, TRUE, FALSE),
cluster_gen$points, z, ordered = TRUE
)
this_cutoff <- max(cluster_gen$cutoff,
sort(leftover_imps)[min(length(leftover_imps)-1, floor(0.8*leftover_imps), 5*length(ranges))])
points <- cluster_gen$points[leftover_imps <= this_cutoff,]
}
else {
i_bools <- rep(FALSE, nrow(cluster_gen$points))
required_points <- min(nrow(cluster_gen$points)-1, floor(0.8*nrow(cluster_gen$points)), 5*length(ranges))
c_current <- cluster_gen$cutoff
while (sum(i_bools) < required_points) {
false_indices <- which(!i_bools)
imp_bools <- opts$accept_measure(obtain_recent(ems, TRUE, FALSE), cluster_gen$points[false_indices,], z, n = opts$nth, cutoff = c_current)
i_bools[false_indices] <- i_bools[false_indices] | imp_bools
c_current <- c_current + 0.5
}
this_cutoff <- c_current - 0.5
points <- cluster_gen$points[i_bools,]
}
}
else {
points <- cluster_gen$points
this_cutoff <- cluster_gen$cutoff
}
}
meta_verbose <- TRUE
if (this_cutoff == cutoff && nrow(points) >= 0.25 * n_points && !user_select) {
if (verbose) cat("LHS has high yield; no other methods required.\n") #nocov
meta_verbose <- FALSE
all_points <- points
while(nrow(all_points) < n_points) {
if (verbose) cat("Proposing from LHS...\n") #nocov
if (!opts$cluster) {
lh_gen <- lhs_gen(ems, ranges, max(n_points, 10*length(ranges)),
z, cutoff, verbose, opts)
points <- lh_gen$points
this_cutoff <- lh_gen$cutoff
}
else {
cluster_ems <- obtain_recent(ems)
cluster_gen <- lhs_gen_cluster(cluster_ems$ems, ranges,
max(n_points, 10*length(ranges)),
z, cutoff, verbose, opts)
if (cluster_ems$reduced) {
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
leftover_imps <- imp_func(
obtain_recent(ems, TRUE, FALSE),
cluster_gen$points, z, ordered = TRUE
)
this_cutoff <- max(cluster_gen$cutoff,
sort(leftover_imps)[min(length(leftover_imps)-1, floor(0.8*leftover_imps), 5*length(ranges))])
points <- cluster_gen$points[leftover_imps <= this_cutoff,]
}
else {
i_bools <- rep(FALSE, nrow(cluster_gen$points))
required_points <- min(nrow(cluster_gen$points)-1, floor(0.8*nrow(cluster_gen$points)), 5*length(ranges))
c_current <- cluster_gen$cutoff
while (sum(i_bools) < required_points) {
false_indices <- which(!i_bools)
imp_bools <- opts$accept_measure(obtain_recent(ems, TRUE, FALSE), cluster_gen$points[false_indices,], z, n = opts$nth, cutoff = c_current)
i_bools[false_indices] <- i_bools[false_indices] | imp_bools
c_current <- c_current + 0.5
}
this_cutoff <- c_current - 0.5
points <- cluster_gen$points[i_bools,]
}
}
else {
points <- cluster_gen$points
this_cutoff <- cluster_gen$cutoff
}
}
all_points <- rbind(all_points, points)
}
points <- all_points
}
if (is.null(nrow(points))) points <- data.frame(matrix(points, ncol = 1)) |> setNames(names(ems[[1]]$ranges))
if (nrow(points) >= n_points && this_cutoff == cutoff) {
if (verbose && meta_verbose) cat("Enough points generated from LHD - no need to apply other methods.\n") #nocov
if (opts$seek > 0) {
if (verbose) cat("Searching for high probability match points...\n") #nocov
if (opts$seek <= 1) opts$seek <- floor(n_points * opts$seek)
extra_points <- seek_good(ems, z, points, cutoff, verbose, opts)
}
else extra_points <- NULL
if (nrow(points) > n_points - opts$seek && thin) {
if (verbose) cat("Selecting final points using maximin criterion...\n") #nocov
points <- maximin_sample(points, n_points - opts$seek, nms = names(ranges))
}
return(rbind(extra_points, points))
}
}
else {
plausible_set <- plausible_set[,names(ranges)]
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
point_imps <- imp_func(ems, plausible_set, z, max_imp = Inf, ordered = TRUE)
optimal_cut <- sort(point_imps)[min(length(point_imps)-1, floor(0.8*length(point_imps)), 5*length(ranges))]
is_asymptoting <- (optimal_cut > cutoff && (optimal_cut - sort(point_imps)[1] < opts$cutoff_tolerance))
}
else {
if (!is.null(list(...)[["cutoff_info"]])) {
c_details <- list(...)[["cutoff_info"]]
cutoff_sequence <- c(cutoff, seq(mean(c(cutoff, c_details[1])), c_details[2], by = diff(c_details)/20+(c_details[1]-cutoff)/40))
}
else
cutoff_sequence <- seq(cutoff, 20, by = 0.05)
points_accept <- c(0)
optimal_cut <- cutoff
if (!verbose || !requireNamespace("progressr", quietly = TRUE)) {
required_points <- min(max(1, nrow(plausible_set)-1, floor(0.8*nrow(plausible_set))), 5*length(ranges))
which_match <- rep(FALSE, nrow(plausible_set))
for (i in cutoff_sequence) {
c_bools <- opts$accept_measure(ems, plausible_set[!which_match,], z, cutoff = i, n = opts$nth)
which_match[!which_match] <- c_bools
points_accept <- c(points_accept, sum(which_match))
if (sum(which_match) >= required_points) {
optimal_cut <- i
break
}
}
}
else {
progressr::with_progress({
prog <- progressr::progressor(steps = length(cutoff_sequence))
required_points <- min(max(1, nrow(plausible_set)-1, floor(0.8*nrow(plausible_set))), 5*length(ranges))
which_match <- rep(FALSE, nrow(plausible_set))
for (i in seq_along(cutoff_sequence)) {
c_bools <- opts$accept_measure(ems, plausible_set[!which_match,], z, cutoff = cutoff_sequence[i], n = opts$nth)
which_match[!which_match] <- c_bools
points_accept <- c(points_accept, sum(which_match))
if (sum(which_match) >= required_points) {
optimal_cut <- cutoff_sequence[i]
break
}
prog(message = sprintf("Implausibility ladder check step %g", i))
}
})
}
min_cutoff <- cutoff_sequence[which(points_accept != 0)[1]-1]
is_asymptoting <- (optimal_cut > cutoff &&
(points_accept[length(points_accept)-1] == 0) || points_accept[length(points_accept)] >= n_points)
}
if (is_asymptoting && optimal_cut-cutoff > opts$cutoff_tolerance) {
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
if (verbose) cat("Point proposal seems to be asymptoting around cutoff", #nocov
round(optimal_cut, 3), "- terminating.\n") #nocov
recent_ems <- obtain_recent(ems, FALSE, FALSE)
if (!is.null(recent_ems$mode1))
relev_ems <- list(mode1 = recent_ems$mode1$expectation,
mode2 = recent_ems$mode2$expectation)
else if (!is.null(recent_ems$expectation))
relev_ems <- recent_ems$expectation
else
relev_ems <- recent_ems
if (is.null(relev_ems$mode1)) {
recent_imps <- do.call("cbind.data.frame",
map(
relev_ems,
~.$implausibility(plausible_set, z[[.$output_name]])
))
recent_exps <- do.call("cbind.data.frame",
map(
relev_ems, ~.$get_exp(plausible_set)
))
preflight(cbind(plausible_set, recent_exps), z)
}
else {
recent_imps1 <- do.call('cbind.data.frame',
map(
relev_ems$mode1,
~.$implausibility(plausible_set, z[[.$output_name]])
))
recent_imps2 <- do.call('cbind.data.frame',
map(
relev_ems$mode2,
~.$implausibility(plausible_set, z[[.$output_name]])
))
recent_imps <- pmin(recent_imps1, recent_imps2)
}
ind <- values <- NULL
plot_imps <- stack(recent_imps)
plot_imps$ind <- factor(plot_imps$ind, levels = names(relev_ems))
if (verbose) { #nocov start
print(ggplot(data = plot_imps, aes(x = ind, y = values)) +
geom_boxplot() +
labs(title = "Implausibility Boxplot"))
cat("Inspect implausibility boxplot for problematic outputs,",
"and consider transforming them or removing them from",
"this wave.\n")
} #nocov end
}
else {
if (verbose) cat("Point proposal seems to be asymptoting around cutoff", #nocov
round(optimal_cut, 3), "- terminating.\n") #nocov
}
if (!is.na(opts$to_file)) { #nocov start
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
write.csv(plausible_set[point_imps <= optimal_cut,],
file = opts$to_file, row.names = FALSE)
return(list(points = plausible_set[point_imps <= optimal_cut,],
cutoff = optimal_cut))
}
else {
good_points <- plausible_set[opts$accept_measure(ems, plausible_set, z, n = opts$nth, cutoff = optimal_cut),]
write.csv(good_points, file = opts$to_file, row.names = FALSE)
return(list(points = good_points, cutoff = optimal_cut))
}
} #nocov end
if (is.character(opts$accept_measure) && opts$accept_measure == "default")
return(list(points = plausible_set[point_imps <= optimal_cut,],
cutoff = optimal_cut))
good_points <- plausible_set[opts$accept_measure(ems, plausible_set, z, cutoff = optimal_cut),]
return(list(points = good_points, cutoff = optimal_cut))
}
if (optimal_cut < cutoff) this_cutoff <- cutoff
else this_cutoff <- round(optimal_cut, 3)
if (is.character(opts$accept_measure) && opts$accept_measure == "default")
points <- plausible_set[point_imps <= this_cutoff,]
else
points <- plausible_set[opts$accept_measure(ems, plausible_set, z, n = opts$nth, cutoff = this_cutoff),]
}
if (length(ranges) == 1)
points <- setNames(data.frame(temp = points), names(ranges))
n_current <- nrow(points)
if (is.null(n_current) || n_current == 0) {
warning("No non-implausible points found from initial step.")
return(points)
}
if (verbose) cat(n_current, " initial valid points generated for I=", #nocov
round(this_cutoff, 3), "\n", sep = "") #nocov
if (!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE)
# if ("optical" %in% which_methods && nrow(points) < n_points) {
# if (verbose) cat("Performing optical depth sampling...\n") #nocov
# points <- op_depth_gen(ems, ranges, n_points, z, cutoff = this_cutoff,
# plausible_set = points, verbose = verbose, opts = opts)
# if (verbose) cat("Optical depth sampling generated", #nocov
# nrow(points)-n_current, "more points.\n") #nocov
# n_current <- nrow(points)
# }
# if (!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE)
if ("line" %in% which_methods && nrow(points) < n_points) {
if (verbose) cat("Performing line sampling...\n") #nocov
points <- line_sample(ems, ranges, z, points, cutoff = this_cutoff, opts = opts)
if (verbose) cat("Line sampling generated", #nocov
nrow(points) - n_current, "more points.\n") #nocov
n_current <- nrow(points)
}
if ("slice" %in% which_methods) {
if (verbose) cat("Performing slice sampling...\n") #nocov
points <- slice_gen(ems, ranges, n_points, z, points, this_cutoff, opts)
if (verbose) cat("Slice sampling generated", #nocov
nrow(points) - n_current, "more points.\n") #nocov
n_current <- nrow(points)
}
if (!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE) #nocov
if ("importance" %in% which_methods && nrow(points) < n_points) {
if (verbose) cat("Performing importance sampling...\n") #nocov
points <- importance_sample(ems, n_points, z, points, this_cutoff, opts)
if (verbose) cat("Importance sampling generated", #nocov
nrow(points)-n_current, "more points.\n") #nocov
n_current <- nrow(points)
}
if (this_cutoff - cutoff > opts$cutoff_tolerance) {
if(!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE) #nocov
if (length(which_methods) == 1 && which_methods == "lhs") {
warning("Could not generate points to desired target cutoff")
return(points)
}
new_opts <- opts
new_opts$chain_call <- TRUE
new_opts$resample <- 0
points <- generate_new_design(ems, n_points, z, which_methods[!which_methods %in% c('lhs')],
cutoff = cutoff, plausible_set = points, verbose = verbose,
opts = new_opts, cutoff_info = c(min_cutoff, this_cutoff))
}
else if (this_cutoff != cutoff) {
if (verbose) #nocov start
cat("Point implausibilities within tolerance.",
"Proposed points have maximum implausibility",
round(this_cutoff, 4), "\n") #nocov end
}
if (!is.null(points$cutoff)) {
cutoff <- points$cutoff
points <- points$points
}
if (!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE)
if (length(intersect(which_methods, c("importance", "line", "slice"))) > 0 && opts$resample > 0) {
for (nsamp in 1:opts$resample) {
if (verbose) cat("Resample", nsamp, "\n") #nocov
points <- maximin_sample(points, min(nrow(points), ceiling(n_points/2)), nms = names(ranges))
n_current <- nrow(points)
if ("line" %in% which_methods) {
if (verbose) cat("Performing line sampling...\n") #nocov
points <- line_sample(ems, ranges, z, points, cutoff = cutoff, opts = opts)
if (verbose) cat("Line sampling generated", #nocov
nrow(points)-n_current, "more points.\n") #nocov
if (!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE) #nocov
n_current <- nrow(points)
}
if ("slice" %in% which_methods) {
if (verbose) cat("Performing slice sampling...\n") #nocov
points <- slice_gen(ems, ranges, n_points, z, points, cutoff, opts)
if (verbose) cat("Slice sampling generated", #nocov
nrow(points)-n_current, "more points.\n") #nocov
if (!is.na(opts$to_file)) write.csv(points, file = opts$to_file, row.names = FALSE) #nocov
n_current <- nrow(points)
}
if ("importance" %in% which_methods && nrow(points) < n_points) {
if (verbose) cat("Performing importance sampling...\n") #nocov
points <- importance_sample(ems, n_points, z, points, cutoff, opts)
if (verbose) cat("Importance sampling generated", #nocov
nrow(points) - n_current, "more points.\n") #nocov
n_current <- nrow(points)
}
}
}
if (opts$seek > 0) {
if (verbose) cat("Searching for points with high matching probability...\n") #nocov
if (opts$seek <= 1) opts$seek <- floor(n_points*opts$seek)
extra_points <- seek_good(ems, z, points, cutoff, verbose, opts)
}
else extra_points <- NULL
if (nrow(points) > n_points - opts$seek && thin) {
if (verbose) cat("Selecting final points using maximin criterion...\n") #nocov
points <- maximin_sample(points, n_points - opts$seek, nms = names(ranges))
}
if (!is.null(opts$chain_call)) return(list(points = rbind(points, extra_points), cutoff = cutoff))
if (!is.na(opts$to_file)) write.csv(rbind(points, extra_points), #nocov
file = opts$to_file, row.names = FALSE)
return(rbind(points, extra_points))
}
## LHD sampling function
lhs_gen <- function(ems, ranges, n_points, z, cutoff = 3, verbose, opts = NULL) {
if (is.null(opts$points.factor)) opts$points.factor <- 40
tryCatch(opts$points.factor <- as.numeric(opts$points.factor),
warning = function(e) {warning("opts$points.factor is not numeric; setting to 40"); opts$points.factor <- 40})
if (is.null(opts$pca_lhs) || !is.logical(opts$pca_lhs)) opts$pca_lhs <- FALSE
if (opts$pca_lhs) {
if (!is.null(ems$expectation)) {
train_pts <- ems$expectation[[1]]$in_data
init_ranges <- ems$expectation[[1]]$ranges
}
else if (!is.null(ems$mode1)) {
init_ranges <- ems$mode1$expectation[[1]]$ranges
train_pts <- unique(rbind(ems$mode1$expectation[[1]]$in_data, ems$mode2$expectation[[1]]$in_data)[,init_ranges])
}
else {
train_pts <- ems[[1]]$in_data
init_ranges <- ems[[1]]$ranges
}
actual_points <- eval_funcs(scale_input, train_pts, init_ranges, FALSE)
pca_points <- pca_transform(actual_points, actual_points)
pca_ranges <- map(seq_len(ncol(pca_points)), ~range(pca_points[,.])) |> setNames(paste0("X", seq_len(ncol(pca_points))))
pca_ranges <- map(pca_ranges, ~.*c(0.9, 1.1))
temp_pts <- eval_funcs(scale_input,
setNames(
data.frame(2 * randomLHS(n_points * opts$points.factor, length(pca_ranges)) - 0.5),
paste0("X", seq_along(pca_ranges))
), pca_ranges, FALSE)
points <- data.frame(pca_transform(temp_pts, actual_points, FALSE)) |> setNames(names(init_ranges))
points <- points[in_range(points, init_ranges),]
}
else {
points <- eval_funcs(scale_input, setNames(
data.frame(2* (randomLHS(n_points * opts$points.factor, length(ranges)) - 0.5)),
names(ranges)), ranges, FALSE)
}
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
imp_func <- function(ems, x, z, ...) nth_implausible(ems, x, z, n = opts$nth, ...)
point_imps <- imp_func(ems, points, z, max_imp = Inf, ordered = TRUE)
required_points <- min(max(1, length(point_imps)-1, floor(0.8*length(point_imps))), 5*length(ranges))
if (sum(point_imps <= cutoff) < required_points) {
cutoff_current <- sort(point_imps)[required_points]
if (sort(point_imps)[1] >= 20) {
warning(paste("Parameter space has no points below implausibility 20;",
"terminating early. This may not indicate model inadequacy:",
"inspect results and re-run if applicable."))
return(list(points = points[point_imps <= 0,], cutoff = 0))
}
}
else cutoff_current <- cutoff
return(list(points = points[point_imps <= cutoff_current,],
cutoff = cutoff_current))
}
else {
i_bools <- rep(FALSE, nrow(points))
required_points <- min(max(1, nrow(points)-1, floor(0.8*nrow(points))), 5*length(ranges))
c_current <- cutoff
while (sum(i_bools) < required_points) {
if (c_current > 20) {
warning(paste("Parameter space has no points below implausibility 20;",
"terminating early. This may not indicate model inadequacy:",
"inspect results and re-run if applicable."))
return(list(points = points[rep(FALSE, nrow(points)),], cutoff = 0))
}
false_indices <- which(!i_bools)
imp_bools <- opts$accept_measure(ems, points[false_indices,], z, cutoff = c_current, n = opts$nth)
i_bools[false_indices] <- i_bools[false_indices] | imp_bools
c_current <- c_current + 0.5
}
return(list(points = points[i_bools,], cutoff = c_current - 0.5))
}
}
## Cluster LHD sampling function
#'
#' @importFrom cluster daisy fanny
lhs_gen_cluster <- function(ems, ranges, n_points, z, cutoff = 3, verbose = FALSE, opts) {
if (is.null(opts$points.factor)) opts$points.factor <- 40
tryCatch(opts$points.factor <- as.numeric(opts$points.factor),
warning = function(e) {warning("opts$points.factor is not numeric; setting to 40"); opts$points.factor <- 40})
which_active <- setNames(
data.frame(do.call('rbind',
map(ems, ~.$active_vars))),
names(ranges)
)
cluster_id <- NULL
tryCatch({
dist_mat <- suppressWarnings(daisy(which_active, metric = 'gower'))
single_clust <- fanny(dist_mat, k = 1)
double_clust <- fanny(dist_mat, k = 2)
if (single_clust$objective[["objective"]] < double_clust$objective[["objective"]])
cluster_id <- c(single_clust$clustering, use.names = FALSE)
else cluster_id <- c(double_clust$clustering, use.names = FALSE)
},
error = function(e) {
warning("Cannot distinguish two clusters in emulator active variables.")
}
)
# cluster_id <- tryCatch(
# Mclust(which_active, G = 1:2, verbose = FALSE)$classification,
# error = function(e) {
# warning("Cannot distinguish two clusters in emulator active variables.")
# return(NULL)
# }
# )
if (is.null(cluster_id) || length(unique(cluster_id)) == 1)
return(lhs_gen(ems, ranges, n_points, z, cutoff, verbose, opts))
c1 <- ems[cluster_id == 1]
c2 <- ems[cluster_id == 2]
p1 <- unique(do.call(c, map(c1, ~names(ranges)[.$active_vars])))
p2 <- unique(do.call(c, map(c2, ~names(ranges)[.$active_vars])))
pn <- intersect(p1, p2)
if (length(union(p1, p2)) == length(pn) && all(union(p1, p2) %in% pn))
return(lhs_gen(ems, ranges, n_points, z, cutoff, verbose, opts))
if (length(p1) > length(p2)) {
c1 <- ems[cluster_id == 2]
c2 <- ems[cluster_id == 1]
p1 <- unique(do.call(c, map(c2, ~names(ranges)[.$active_vars])))
p2 <- unique(do.call(c, map(c1, ~names(ranges)[.$active_vars])))
}
if (verbose) cat("Clusters determined. Cluster 1 has length", #nocov start
length(c1), "with", length(p1),
"active variables; cluster 2 has length",
length(c2), "with", length(p2),
"active variables;", length(pn), "shared.\n")
if (verbose) cat("Proposing from clusters...\n") #nocov end
cluster_proposal <- function(ems, params, ranges, np, what_cutoff = cutoff) {
req_points <- min(floor(np * opts$points.factor/2 - 1),
floor(0.4 * np * opts$points.factor),
5*length(ranges))
prop_lhs <- setNames(
data.frame(2 * (randomLHS(np * opts$points.factor, length(params)) - 0.5)),
params
)
spare_p <- ranges[!names(ranges) %in% params]
for (i in names(spare_p)) prop_lhs[[i]] <- runif(nrow(prop_lhs), -1, 1)
prop_lhs <- eval_funcs(scale_input, prop_lhs[,names(ranges)], ranges, FALSE)
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
imps <- nth_implausible(ems, prop_lhs, z, n = max(1, opts$nth-1), max_imp = Inf, ordered = TRUE)
if (sum(imps <= what_cutoff) < req_points) {
cutoff_current <- sort(imps)[req_points]
if (sort(imps)[1] >= 20) {
warning(paste("Parameter space has no points below implausibility 20;",
"terminating early. This may not indicate model inadequacy:",
"inspect results and re-run if applicable."))
return(list(points = prop_lhs[imps <= 0,], cutoff = 0))
}
}
else cutoff_current <- what_cutoff
valid <- prop_lhs[imps <= cutoff_current,]
}
else {
i_bools <- rep(FALSE, nrow(prop_lhs))
c_current <- what_cutoff
while (sum(i_bools) < req_points) {
if (c_current > 20) {
warning(paste("Parameter space has no points below implausibility 20;",
"terminating early. This may not indicate model inadequacy:",
"inspect results and re-run if applicable."))
return(list(points = prop_lhs[rep(FALSE, nrow(prop_lhs)),], cutoff = 0))
}
false_indices <- which(!i_bools)
imp_bools <- opts$accept_measure(ems, prop_lhs[false_indices,], z, cutoff = c_current, n = opts$nth)
i_bools[false_indices] <- i_bools[false_indices] | imp_bools
c_current <- c_current + 0.5
}
cutoff_current <- c_current - 0.5
valid <- prop_lhs[i_bools,]
}
return(list(points = valid, cutoff = cutoff_current))
}
clust_1_prop <- cluster_proposal(c1, p1, ranges, n_points)
clust_2_prop <- cluster_proposal(c2, p2, ranges, n_points)
if (nrow(clust_1_prop$points) == 0) return(clust_1_prop)
if (nrow(clust_2_prop$points) == 0) return(clust_2_prop)
if (clust_1_prop$cutoff != clust_2_prop$cutoff) {
if (clust_1_prop$cutoff < clust_2_prop$cutoff)
clust_1_prop <- cluster_proposal(c1, p1, ranges, n_points, clust_2_prop$cutoff)
else
clust_2_prop <- cluster_proposal(c2, p2, ranges, n_points, clust_1_prop$cutoff)
}
cutoff_val <- clust_1_prop$cutoff
if (length(intersect(p1, p2)) == 0) {
if (verbose) cat("No shared variables in clusters.\n")
relev_1 <- clust_1_prop$points[,p1, drop = FALSE] |> setNames(p1)
relev_2 <- clust_2_prop$points[,p2, drop = FALSE] |> setNames(p2)
complete_df <- data.frame(cbind(relev_1[rep(seq_len(nrow(relev_1)),
each = nrow(relev_2)),],
relev_2[rep(seq_len(nrow(relev_2)),
nrow(relev_1)),])) |> setNames(c(p1, p2))
if (length(setdiff(names(ranges), union(p1, p2))) != 0) {
for (i in setdiff(names(ranges), union(p1, p2)))
complete_df[,i] <- runif(nrow(complete_df), ranges[[i]][1], ranges[[i]][2])
}
complete_df <- complete_df[,names(ranges)]
return(list(points = complete_df, cutoff = cutoff_val))
}
relev_1 <- data.frame(clust_1_prop$points[,setdiff(p1, pn), drop = FALSE]) |>
setNames(setdiff(p1, pn))
relev_2 <- data.frame(clust_2_prop$points[,setdiff(p2, pn), drop = FALSE]) |>
setNames(setdiff(p2, pn))
combine_relev <- rbind(clust_1_prop$points[,pn, drop = FALSE],
clust_2_prop$points[,pn, drop = FALSE]) |> setNames(pn)
combine_ranges <- apply(combine_relev, 2, range)
complete_df <- data.frame(cbind(relev_1[rep(seq_len(nrow(relev_1)),
each = nrow(relev_2)),],
relev_2[rep(seq_len(nrow(relev_2)),
nrow(relev_1)),])) |>
setNames(c(names(relev_1), names(relev_2)))
for (i in pn) {
complete_df[,i] <- runif(nrow(complete_df), combine_ranges[1,i], combine_ranges[2,i])
}
complete_df <- complete_df[,names(ranges)]
req_points <- min(floor(n_points * opts$points.factor/2 - 1),
floor(0.4 * n_points * opts$points.factor),
5*length(ranges))
if (is.character(opts$accept_measure) && opts$accept_measure == "default") {
imps <- nth_implausible(ems, complete_df, z, n = opts$nth, max_imp = Inf, ordered = TRUE)
if (sum(imps <= cutoff_val) < req_points) {
cutoff_current <- sort(imps)[req_points]
if (sort(imps)[1] >= 20) {
warning(paste("Parameter space has no points below implausibility 20;",
"terminating early. This may not indicate model inadequacy:",
"inspect results and re-run if applicable."))
return(list(points = complete_df[imps <= 0,], cutoff = 0))
}
}
else cutoff_current <- cutoff_val
valid <- complete_df[imps <= cutoff_current,]
}
else {
i_bools <- rep(FALSE, nrow(complete_df))
c_current <- cutoff_val
while (sum(i_bools) < req_points) {
if (c_current > 20) {
warning(paste("Parameter space has no points below implausibility 20;",
"terminating early. This may not indicate model inadequacy:",
"inspect results and re-run if applicable."))
return(list(points = complete_df[rep(FALSE, nrow(complete_df)),], cutoff = 0))
}
false_indices <- which(!i_bools)
imp_bools <- opts$accept_measure(ems, complete_df[false_indices,], z, cutoff = c_current, n = opts$nth)
i_bools[false_indices] <- i_bools[false_indices] | imp_bools
c_current <- c_current + 0.5
}
cutoff_current <- c_current - 0.5
valid <- complete_df[i_bools,]
}
return(list(points = valid, cutoff = cutoff_current))
}
## Line sampling function
line_sample <- function(ems, ranges, z, s_points, cutoff = 3, opts) {
if (is.null(opts$n_lines)) opts$n_lines <- 20
tryCatch(opts$n_lines <- as.numeric(opts$n_lines), warning = function(e) {warning("opts$n_lines not numeric; setting to 20"); opts$n_lines <- 20})
if (is.null(opts$ppl)) opts$ppl <- 50
tryCatch(opts$ppl <- as.numeric(opts$ppl), warning = function(e) {warning("opts$ppl not numeric; setting to 50"); opts$ppl <- 50})
if (nrow(s_points) < 2) return(s_points)
n_lines <- min(nrow(s_points)*(nrow(s_points)-1)/2, opts$n_lines)
if (opts$ppl %% 4 == 1) opts$ppl <- opts$ppl + 1
s_lines <- lapply(1:(10*opts$n_lines), function(x) {
pts <- s_points[sample(nrow(s_points), 2),]
pt_dist <- dist(pts)
return(list(p1 = pts[1,], p2 = pts[2,], d = pt_dist))
})
s_lines <- s_lines[!duplicated(map_dbl(s_lines, ~.$d))]
best_pts <- s_lines[order(map_dbl(s_lines, ~.$d),
decreasing = TRUE)][1:opts$n_lines]
samp_pts <- lapply(best_pts, function(x) {
tryCatch(
{
do.call('rbind', lapply(seq(-x[[3]], x[[3]], length.out = opts$ppl),
function(y) (x[[1]]+x[[2]])/2 + y * (x[[2]]-x[[1]])))
},
error = function(e) {
warning("Problem with line sampling; investigate.")
return(NULL)
}
)
})
samp_pts <- samp_pts[!map_lgl(samp_pts, is.null)]
samp_pts <- map(samp_pts, ~.[in_range(., ranges),])
samp_pts <- samp_pts[!map_lgl(
samp_pts, ~(is.null(.) || is.null(nrow(.)) || nrow(.) == 0)
)]
if (is.character(opts$accept_measure) && opts$accept_measure == "default")
imps <- map(samp_pts,
~nth_implausible(
ems, ., z,
n = opts$nth, cutoff = cutoff,
ordered = TRUE
))
else imps <- map(samp_pts, ~opts$accept_measure(ems, ., z, cutoff = cutoff, n = opts$nth))
include_pts <- imap(samp_pts, function(x, i) {
pts <- x
imp <- imps[[i]]
included <- map_lgl(seq_along(imp), function(y) {
if (!imp[y]) return(FALSE)
if (y == 1 || y == length(imp)) return(TRUE)
if (!imp[y+1] || !imp[y-1]) return(TRUE)
return(FALSE)
})
return(pts[included,])
})
out_df <- rbind(s_points, do.call('rbind', include_pts))
uniqueness <- row.names(unique(signif(out_df, 7)))
return(out_df[uniqueness,])
}
importance_sample <- function(ems, n_points, z, s_points, cutoff = 3, opts) {
if (is.null(opts$imp_distro) || !opts$imp_distro %in% c('sphere', 'normal')) opts$imp_distro <- 'sphere'
if (is.null(opts$imp_scale)) opts$imp_scale <- 2
tryCatch(opts$imp_scale <- as.numeric(opts$imp_scale),
warning = function(e) {warning("opts$imp_scale is not numeric; setting to 2"); opts$imp_scale <- 2})
if (nrow(s_points) >= n_points)
return(s_points)
m_points <- n_points - nrow(s_points)
ranges <- getRanges(ems, FALSE)
new_points <- s_points
propose_points <- function(sp, sd, how_many = n_points) {
if (is.character(opts$accept_measure) && opts$accept_measure == "default")
imp_func <- function(ems, x, z, ...) nth_implausible(ems, x, z, ...)
else
imp_func <- opts$accept_measure
sp_trafo <- pca_transform(sp, s_points)
wp <- sp_trafo[sample(nrow(sp_trafo), how_many, replace = TRUE), ]
if (opts$imp_distro == "normal")
pp <- t(apply(wp, 1, function(x)
mvtnorm::rmvnorm(1, mean = unlist(x, use.names = FALSE), sigma = sd)))
else
pp <- t(apply(wp, 1, function(x)
runifs(1, length(s_points), unlist(x, use.names = FALSE), r = sd)))
prop_points <- data.frame(pp)
back_traf <- setNames(data.frame(pca_transform(pp, s_points, FALSE)), names(ranges))
valid <- in_range(back_traf, ranges) &
imp_func(ems, back_traf, z, cutoff = cutoff, n = opts$nth, ordered = TRUE)
if (opts$imp_distro == "normal") {
tweights <- apply(prop_points, 1, function(x)
1/nrow(sp_trafo) * sum(
apply(sp_trafo, 1, function(y)
mvtnorm::dmvnorm(x, mean = y, sigma = sd))
))
min_w <- min(apply(sp_trafo, 1, function(x)
1/nrow(sp_trafo) * sum(
apply(sp_trafo, 1, function(y) mvtnorm::dmvnorm(x, mean = y, sigma = sd))
)))
weights <- min_w/tweights
}
else {
weights <- apply(prop_points, 1, function(x)
sum(apply(sp_trafo, 1, function(y)
punifs(x, y, r = sd))))
}
allow <- runif(length(weights)) < 1/weights
accepted <- valid & allow
return(back_traf[accepted,])
}
if (opts$imp_distro == "normal" && !is.matrix(opts$imp_scale)) sd <- diag(2, length(ranges))
else if (length(opts$imp_scale) == 1) sd <- rep(2, length(ranges))
accept_rate <- NULL
upper_accept <- 0.225
lower_accept <- 0.1
while((is.null(accept_rate) || accept_rate > upper_accept ||
accept_rate < lower_accept) && nrow(new_points) < n_points) {
if (!is.null(accept_rate)) {
if (accept_rate > upper_accept) sd <- sd * 1.1
else sd <- sd * 0.9
}
how_many <- max(floor(n_points/4), 1000)
prop_points <- propose_points(s_points, sd, how_many)
new_points <- rbind(new_points, prop_points)
uniqueness <- row.names(unique(signif(new_points, 7)))
new_points <- new_points[uniqueness,]
if (!is.na(opts$to_file)) write.csv(new_points, file = opts$to_file, row.names = FALSE) #nocov
accept_rate <- nrow(prop_points)/how_many
}
while(nrow(new_points) < n_points) {
prop_points <- propose_points(s_points, sd, ceiling(1.5*m_points/accept_rate))
new_points <- rbind(new_points, prop_points)
uniqueness <- row.names(unique(signif(new_points, 7)))
new_points <- new_points[uniqueness,]
if (!is.na(opts$to_file)) write.csv(new_points, file = opts$to_file, row.names = FALSE) #nocov
}
return(new_points)
}
slice_gen <- function(ems, ranges, n_points, z, points, cutoff, opts) {
if (is.null(opts$pca_slice) || !is.logical(opts$pca_slice)) opts$pca_slice <- FALSE
if (is.character(opts$accept_measure) && opts$accept_measure == "default")
imp_func <- function(ems, x, z, ...) nth_implausible(ems, x, z, ...)
else
imp_func <- opts$accept_measure
make_slice <- function(points, ranges, indices) {
new_coords <- vapply(ranges, function(x) runif(1, x[1], x[2]), numeric(1))
old_values <- rep(0, nrow(points))
for (i in seq_along(indices)) {
old_values[i] <- points[i, indices[i]]
points[i, indices[i]] <- new_coords[i]
}
return(list(p = points, o = old_values))
}
complete_points <- pca_base <- points
if (opts$pca_slice) {
points <- pca_transform(points, pca_base)
pca_ranges <- map(seq_along(pca_base), ~c(-5,5))
}
else
pca_ranges <- ranges
index_list <- rep(1, nrow(points))
range_list <- map(seq_along(index_list),
~pca_ranges[[index_list[.]]])
while (nrow(complete_points) < n_points) {
new_slice <- make_slice(points, range_list, index_list)
points <- new_slice$p
old_vals <- new_slice$o
if (opts$pca_slice) {
imps <- imp_func(ems,
setNames(data.frame(matrix(
pca_transform(points, pca_base, FALSE),
nrow = nrow(points)
)), names(ranges)),
z, cutoff = cutoff, n = opts$nth, ordered = TRUE)
in_ranges <- in_range(pca_transform(points, pca_base, FALSE), ranges)
}
else {
imps <- imp_func(ems, points, z, cutoff = cutoff, n = opts$nth, ordered = TRUE)
in_ranges <- in_range(points, ranges)
}
for (i in seq_along(imps)) {
if (imps[i] && in_ranges[i]) {
range_list[[index_list[i]]] <- pca_ranges[[index_list[i]]]
if (index_list[i] == length(pca_ranges)) {
if (opts$pca_slice)
complete_points <- rbind(complete_points,
setNames(data.frame(matrix(
pca_transform(points[i,], pca_base, FALSE),
nrow = 1
)), names(ranges)))
else
complete_points <- rbind(complete_points, points[i,])
index_list[i] <- 1
}
else
index_list[i] <- index_list[i]+1
range_list[[i]] <- pca_ranges[[index_list[i]]]
}
else {
if (points[i, index_list[i]] < old_vals[i])
range_list[[i]][1] <- points[i, index_list[i]]
else range_list[[i]][2] <- points[i, index_list[i]]
points[i, index_list[i]] <- old_vals[i]
}
}
}
return(complete_points)
}
## Good point generation
#'
#' @importFrom stats pnorm
seek_good <- function(ems, z, plausible_set, cutoff = 3, verbose,
opts) {
if (!is.null(ems$expectation)) ems <- ems$expectation
n_points <- opts$seek
if (is.null(opts$seek_distro)) opts$seek_distro <- "norm"
dist_func <- get(paste0("p", opts$seek_distro))
get_prob <- function(ems, points, targets) {
for (i in seq_along(targets)) {
if (!is.atomic(targets[[i]]))
targets[[i]] <- c(targets[[i]]$val - 3*targets[[i]]$sigma,
targets[[i]]$val + 3*targets[[i]]$sigma)
}
em_exps <- do.call('cbind.data.frame',
map(ems, ~.$get_exp(points)))
em_sds <- sqrt(do.call('cbind.data.frame',
map(ems, ~.$get_cov(points))))
em_probs <- do.call('rbind.data.frame',
map(seq_len(nrow(em_exps)), function(x) {
map_dbl(seq_along(em_exps[x,]), function(y) {
pnorm(targets[[ems[[y]]$output_name]][2],
em_exps[x,y], em_sds[x,y]) -
pnorm(targets[[ems[[y]]$output_name]][1],
em_exps[x,y], em_sds[x,y])
})
}))
result <- apply(
em_probs, 1,
function(x) prod(
map_dbl(
unique(names(targets)),
~min(x[map_chr(ems, function(a) a$output_name) == .]))))
return(result)
}
select_minimal <- function(data, first_index, how_many) {
data_dist <- data.matrix(
dist(data, upper = TRUE, diag = TRUE))[-first_index,]
picked_rows <- c(first_index)
while(length(picked_rows) < how_many) {
new_point_dists <- map_dbl(row.names(data_dist), function(x) {
temp_dd <- data_dist[!row.names(data_dist) %in% x,]
dists <- apply(temp_dd[,c(picked_rows, as.numeric(x))], 1, min)
return(max(dists))
})
next_row <- row.names(data_dist)[which.min(new_point_dists)]
data_dist <- data_dist[!row.names(data_dist) %in% next_row,]
picked_rows <- c(picked_rows, as.numeric(next_row))
}
return(data[picked_rows,])
}
if (nrow(plausible_set) > 5*n_points) {
plausible_set <- maximin_sample(plausible_set, 5*n_points, nms = names(plausible_set))
}
point_set <- importance_sample(ems, 20*n_points,
z,
plausible_set,
cutoff = cutoff, opts)
if (verbose) cat("Generated candidate set...\n")
probs <- get_prob(ems, point_set, z)
o_points <- point_set[order(probs, decreasing = TRUE),]
keep_points <- o_points[1:(10*n_points),]
row.names(keep_points) <- seq_len(nrow(keep_points))
if (verbose) cat("Subselected highest probability points. Thinning...\n")
final_set <- select_minimal(keep_points, 1, n_points)
return(final_set)
}
#' Generate Proposal Points
#'
#' This function is deprecated in favour of \code{\link{generate_new_design}}.
#'
#' Given a set of trained emulators, this finds the next set of points that will be
#' informative for a subsequent wave of emulation or, in the event that the
#' current wave is the last desired, a set of points that optimally span the
#' parameter region of interest. There are a number of different methods that can
#' be utilised, alone or in combination with one another, to generate the points.
#'
#' If the \code{method} argument contains \code{'lhs'}, a Latin hypercube is generated and
#' non-implausible points from this design are retained. If more points are accepted
#' than the next design requires, then points are subselected using a maximin argument.
#'
#' If \code{method} contains \code{'line'}, then line sampling is performed. Given an
#' already established collection of non-implausible points, rays are drawn between
#' pairs of points (selected so as to maximise the distance between them) and more
#' points are sampled along the rays. Points thus sampled are retained if they lie
#' near a boundary of the non-implausible space, or on the boundary of the parameter
#' region of interest.
#'
#' If \code{method} contains \code{'importance'}, importance sampling is performed.
#' Given a collection of non-implausible points, a mixture distribution of either
#' multivariate normal or uniform ellipsoid proposals around the current non-implausible
#' set are constructed. The optimal standard deviation (in the normal case) or radius
#' (in the ellipsoid case) is determined using a burn-in phase, and points are
#' proposed until the desired number of points have been found.
#'
#' If \code{method} contains \code{'slice'}, then slice sampling is performed. Given
#' a single known non-implausible point, a minimum enclosing hyperrectangle (perhaps
#' after transforming the space) is determined and points are sampled for each dimension
#' of the parameter space uniformly, shrinking the minimum enclosing hyperrectangle as
#' appropriate. This method is akin to to a Gibbs sampler.
#'
#' If \code{method} contains \code{'optical'}, then optical depth sampling is used.
#' Given a set of non-implausible points, an approximation of the one-dimensional
#' marginal distributions for each parameter can be determined. From these derived
#' marginals, points are sampled and subject to rejection as in the LHD sampling.
#'
#' For any sampling strategy, the parameters \code{ems}, \code{n_points}, and \code{z}
#' must be provided. All methods rely on a means of assessing point suitability, which
#' we refer to as an implausibility measure. By default, this uses nth-maximum implausibility
#' as provided by \code{\link{nth_implausible}}; a user-defined method can be provided
#' instead by supplying the function call to \code{opts[["accept_measure"]]}. Any
#' such function must take at least five arguments: the emulators, the points, the
#' targets, and a cutoff, as well as a \code{...} argument to ensure compatibility with
#' the default behaviour of the point proposal method.
#'
#' The option \code{opts[["seek"]]} determines how many points should be chosen that
#' have a higher probability of matching targets, as opposed to not missing targets. Due
#' to the danger of such an approach if a representative space-filling design over the
#' space, this value should not be too high and should be used sparingly at early waves;
#' even at later waves, it is inadvisable to seek more than 10\% of the output points
#' using this metric. The default is \code{seek = 0}, and can be provided as either
#' a percentage of points desired (in the range [0,1]) or the fixed number of points.
#'
#' The default behaviour is as follows. A set of initial points are generated from a
#' large LHD; line sampling is performed to find the boundaries of the space; then importance
#' sampling is used to fill out the space. The proposed set of points are thinned and
#' both line and importance sampling are applied again; this resampling behaviour is
#' controlled by \code{opts[["resample"]]}, where \code{resample = n} indicates that
#' the proposal will be thinned and resampled from \code{n} times (resulting in \code{n+1}
#' proposal stages).
#'
#' In regions where the non-implausible space at a given cutoff value is very hard to find,
#' the point proposal will start at a higher cutoff where it can find a space-filling design.
#' Given such a design at a higher cutoff, it can subselect to a lower cutoff by demanding
#' some percentage of the proposed points are retained and repeat. This approach terminates
#' if the 'ladder' of cutoffs reaches the desired cutoff, or if the process asymptotes at
#' a particular higher cutoff. The opts \code{ladder_tolerance} and \code{cutoff_tolerance}
#' determine the minimum improvement required in consecutive cutoffs for the process to not
#' be considered to be asymptoting and the level of closeness to the desired cutoff at which
#' we are prepared to stop, respectively. For instance, setting \code{ladder_tolerance} to
#' 0.1 and \code{cutoff_tolerance} to 0.01, with a cutoff of 3, will terminate the process
#' if two consecutive cutoffs proposed are within 0.1 of each other, or when the points proposed
#' all have implausibility less than the 3.01.
#'
#' These methods may work slowly, or not at all, if the target space is extremely small in
#' comparison with the initial non-yet-ruled-out (NROY) space; it may also fail to give a
#' representative sample if the target space is formed of disconnected regions of different
#' volumes.
#'
#' @section Arguments within \code{opts}:
#' \describe{
#' \item{accept_measure}{A custom implausibility measure to be used.}
#' \item{cluster}{Whether to try to apply emulator clustering.}
#' \item{cutoff_tolerance}{Tolerance for an obtained cutoff to be similar enough to that desired.}
#' \item{ladder_tolerance}{Tolerance with which to determine if the process is asymptoting.}
#' \item{nth}{The level of nth implausibility to apply, if using the default implausibility.}
#' \item{resample}{How many times to perform the resampling step once points are found.}
#' \item{seek}{How many 'good' points should be sought: either as an integer or a ratio.}
#' \item{to_file}{If output is to be written to file periodically, the file location.}
#' \item{points.factor (LHS, Cluster LHS)}{How many more points than desired to sample.}
#' \item{pca_lhs (LHS)}{Whether to apply PCA to the space before proposing.}
#' \item{n_lines (Line)}{How many lines to draw.}
#' \item{ppl (Line)}{The number of points to sample per line.}
#' \item{imp_distro (Importance)}{The distribution to propose around points.}
#' \item{imp_scale (Importance)}{The radius, or standard deviation, of proposed distributions.}
#' \item{pca_slice (Slice)}{Whether to apply PCA to the space before slice sampling.}
#' \item{seek_distro (Seek)}{The distribution to apply when looking for 'good' points.}
#' }
#'
#' @param ems A list of \code{\link{Emulator}} objects, trained
#' on previous design points.
#' @param n_points The desired number of points to propose.
#' @param z The targets to match to.
#' @param method Which methods to use.
#' @param cutoff The value of the cutoff to use to assess suitability.
#' @param plausible_set An optional set of known non-implausible points, to avoid LHD sampling.
#' @param verbose Should progress statements be printed to the console?
#' @param opts A named list of opts as described.
#' @param ... Any parameters to pass via chaining to individual sampling functions (eg \code{distro}
#' for importance sampling or \code{ordering} for collecting emulators).
#'
#' @return A data.frame containing the set of new points upon which to run the model.
#'
#' @export
#'
generate_new_runs <- function(ems, n_points, z, method = "default", cutoff = 3,
plausible_set, verbose = interactive(),
opts = NULL, ...) {
.Deprecated("generate_new_design")
generate_new_design(ems, n_points, z, method, cutoff, plausible_set,
verbose, opts, ...)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.