R/modelbuilding.R

Defines functions convertRanges

## Helper function for converting ranges from data.frame or data.matrix to list
convertRanges <- function(object) {
  if (is.null(object) || missing(object)) return(object)
  if (inherits(object, "matrix")) {
    if (ncol(object) == 2 &&
        length(row.names(object)[row.names(object) != ""]) == nrow(object))
      return(setNames(
        map(seq_len(nrow(object)),
                   ~c(min(object[.,]), max(object[.,]), use.names = FALSE)
        ),
        row.names(object)
      ))
    else {
      warning(paste("Data.matrix or ranges is misspecified",
                    "(either row.names incomplete, or dimension of data.frame is not nx2)."))
      return(NULL)
    }
  }
  if (is.list(object) && !is.data.frame(object)) {
    if (length(names(object)) == length(object) &&
        all(map_dbl(object, length) == 2))
      return(map(object, sort))
    else {
      warning(paste("List of ranges is misspecified,",
                    "(either not all named, or not all have maximum and minimum value)."))
      return(NULL)
    }
  }
  if (is.data.frame(object)) {
    if (length(object) == 2 &&
        !any(map_lgl(seq_len(nrow(object)), ~row.names(object)[.]==.)))
      return(map(seq_len(nrow(object)),
                        ~c(min(object[.,]), max(object[.,]))) |>
               setNames(row.names(object)))
    else {
      warning(paste("Data.frame of ranges is misspecified",
                    "(either row.names incomplete, or dimension of data.frame is not n by 2."))
      return(NULL)
    }
  }
}

#' Model Generation
#'
#' Creates a best fit of coefficients for a given data set.
#'
#' There are two ways to generate the model: either start with all possible terms
#' (including cross-terms) up to order \code{n}, and then stepwise-remove; or start
#' with an intercept and stepwise-add terms up to order \code{n}, only retaining
#' a term is the information critereon is improved. Which method is chosen is
#' dependent on the value of \code{add} - if \code{add = FALSE} and there are not
#' enough degrees of freedom to accommodate all possible terms, a warning will
#' be given.
#'
#' @importFrom stats lm step setNames as.formula anova
#'
#' @param data A \code{data.frame} containing the input and output values
#' @param ranges A named list consisting of the input parameter ranges
#' @param output_name A string corresponding to the output name to fit to
#' @param add Should stepwise-add or stepwise-delete be performed?
#' @param order The order to which a polynomial should be fitted
#' @param u_form An 'upper form' for the model fit (used internally)
#' @param verbose Should the name of the output be printed?
#'
#' @keywords internal
#' @noRd
#'
#' @return The fitted \code{lm} model object.
get_coefficient_model <- function(data, ranges, output_name, add = FALSE,
                                  order = 2, u_form = NULL, verbose = FALSE) {
  if (verbose) cat(output_name, "\n") #nocov
  lower_form <- as.formula(paste(output_name, "1", sep = "~"))
  if (is.null(u_form)) {
    if (order == 1)
      upper_form <- as.formula(paste(output_name,
                                     "~",
                                     paste0(c("1", names(ranges)), collapse = "+"),
                                            sep = ""))
    else {
      start_string <- paste(output_name, "~",
                              paste0(c("1", names(ranges)), collapse = "+"), sep = "")
      string_vec <- c(start_string)
      for (i in 2:order) {
        string_vec <-c(string_vec,
                        paste0("I(", names(ranges), "^", i, ")", collapse = "+"))
      }
      upper_form <- as.formula(paste0(string_vec, collapse = "+"))
      start_model <- get_coefficient_model(data = data, ranges = ranges,
                                           output_name = output_name, add = add,
                                           order = order, u_form = upper_form)
      a_vars <- names(start_model$coefficients)[-1]
      a_vars <- names(ranges)[map_lgl(names(ranges), ~any(grepl(., a_vars)))]
      if (length(a_vars) == 0)
        upper_form <- as.formula(paste(output_name, "~ 1"))
      else {
        start_string <- paste(c(output_name, "~",
                                paste0(c("1", a_vars), collapse = "+")), sep = "")
        for (i in 2:order) {
          string_vec <- c(string_vec,
                          paste0("I(", a_vars, "^", i, ")", collapse = "+"),
                          paste("(", paste0(c("1", a_vars), collapse = "+"), ")^", order, sep = "")
                          )
          string_vec <- c(string_vec,
                          paste0(c(outer(a_vars, names(ranges), paste, sep = ":"))))
        }
        upper_form <- as.formula(paste0(string_vec, collapse = "+"))
      }
    }
  }
  else
    upper_form <- u_form
  if (!add && choose(length(ranges) + order, length(ranges)) > 2*nrow(data)/3) {
    if (verbose)
      warning(paste("Maximum number of regression terms is greater than",
                    "the available degrees of freedom. Changing to add = TRUE."))
    add <- TRUE
  }
  if (!is.data.frame(data))
    data <- setNames(data.frame(data), c(names(ranges), output_name))
  model <- tryCatch({
    if (add) {
      return(step(lm(formula = lower_form, data = data),
                    scope = list(lower = lower_form, upper = upper_form),
                    direction = "both", trace = 0, k = log(nrow(data))))
    }
    else {
      return(step(lm(formula = upper_form, data = data),
                    scope = list(lower = lower_form, upper = upper_form),
                    direction = "backward", trace = 0, k = log(nrow(data))))
    }
  },
  error = function(e) {
    warning(paste("Model selection suggests perfect fit for output", output_name, "- possible overfitting issue. Removing this output from emulation."))
    return(NULL)
  })
  if (is.null(model)) return(model)
  if (order != 1) {
    mod_coeffs <- summary(model)$coefficients[-1,]
    mod_anv <- anova(model)
    tot_sos <- sum(mod_anv$`Sum Sq`)
    pow_sos <- mod_anv[!row.names(mod_anv) %in% c(names(ranges), "Residuals"), "Sum Sq"]/tot_sos
    pow_names <- row.names(mod_coeffs)[!row.names(mod_coeffs) %in% names(ranges)]
    pow_remove <- pow_names[pow_sos < 0.01]
    final_terms <- row.names(mod_coeffs)[!row.names(mod_coeffs) %in% pow_remove]
    model <- lm(data = data, formula = as.formula(
      paste(output_name, "~", paste0(c("1", final_terms), collapse = "+"))
    ))
  }
return(model)
}

#' Hyperparameter Estimation
#'
#' Performs hyperparameter fitting using MLE.
#'
#' The maximised regression coefficients \code{beta} and the overall standard deviation
#' \code{sigma} can be found in closed form, given the hyperparameters of the correlatio
#' structure. Those hyperparameters are estimated by first setting up a coarse grid over
#' the main hyperparameters and finding the value which maximises the likelihood. This
#' initial guess is used as a seed for a Nelder-Mead optimiser to finesse the estimate
#' for the hyperparameters and the nugget term. Once done, the final ML estimates of beta
#' and sigma are calculated.
#'
#' @param inputs The input data
#' @param outputs The output values (usually as residuals from a fitted regression)
#' @param model The basis functions of the regression surface
#' @param corr_name The name of the type of correlation function to use
#' @param hp_range The allowed range of the hyperparameter(s)
#' @param beta If provided, the regression coefficients will be treated as known
#' @param delta The value of the nugget term. If \code{delta = NULL}, it will be treated
#' as a hyperparameter
#' @param verbose Should the output name be printed?
#'
#' @importFrom stats optim
#' @importFrom purrr exec
#' @importFrom MASS ginv
#'
#' @keywords internal
#' @noRd
#'
#' @return A list of hyperparameter values
hyperparameter_estimate <- function(inputs, outputs, model, corr_name = "exp_sq",
                                    hp_range, beta = NULL, delta = NULL,
                                    nsteps = 30, verbose = FALSE,
                                    log_likelihood = TRUE, perform_opt = FALSE) {
  if (verbose) cat(names(outputs)[[1]], "\n")
  corr <- Correlator$new(corr_name,
                         hp = setNames(
                           map(names(hp_range),
                                      ~hp_range[[.]][[1]]), names(hp_range))
  )
  if (!is.data.frame(inputs)) inputs <- data.frame(inputs)
  if (!is.null(beta) &&
      ((inherits(model, "lm") && length(coef(model)) != length(beta)) ||
       (!inherits(model, "lm") && length(beta) != length(model))))
    stop("Number of coefficients does not match number of regression functions")
  if (inherits(model, "lm")) H <- model.matrix(model) #%*% diag(coef(model), nrow = length(coef(model)))
  else H <- t(eval_funcs(model, inputs))
  if (dim(H)[1] == 1) H <- t(H)
  av <- map_lgl(seq_along(names(inputs)), function(x) {
    point_vec <- c(rep(0, x-1), 1, rep(0, length(names(inputs))-x))
    if (inherits(model, "lm"))
      func_vals <- model.matrix(model$terms,
                                setNames(data.frame(matrix(c(point_vec, 0), nrow = 1)), c(names(inputs), names(outputs))))
    else
      func_vals <- map_dbl(model, exec, point_vec)
    sum(func_vals) > 1
  })
  if (all(av == FALSE)) av <- c(TRUE)
  corr_mat <- function(points, hp, delta) {
    this_corr <- corr$set_hyper_p(hp, delta)
    this_corr$get_corr(points, actives = av)
  }
  func_to_opt <- function(params, log_lik = log_likelihood, return_stats = FALSE) {
    hp <- params[seq_along(hp_range)]
    delta <- params[length(params)]
    if (is.na(delta)) delta <- 0
    A <- corr_mat(inputs, hp, delta)
    a_det <- det(A)
    if (a_det < 0) a_det <- 1e-20
    A_inv <- tryCatch(chol2inv(chol(A)), error = function(e) ginv(A))
    outputs <- map_dbl(seq_len(nrow(outputs)), ~as.numeric(outputs[.,]))
    if (is.null(beta)) {
      inv_mat <- tryCatch(chol2inv(chol(t(H) %*% A_inv %*% H)), error = function(e) ginv(t(H) %*% A_inv %*% H))
      b_ml <- inv_mat %*% t(H) %*% A_inv %*% outputs
    }
    else b_ml <- beta
    mod_diff <- if (nrow(H) == 1) t(outputs - H * b_ml) else outputs - H %*% b_ml
    sigmasq_ml <- t(mod_diff) %*% A_inv %*% mod_diff/length(outputs)
    if (log_lik)
      lik <- -length(outputs) * log(sigmasq_ml)/2 - log(a_det)/2
    else
      lik <- 1/sqrt(sigmasq_ml^length(outputs)) * 1/sqrt(a_det)
    if (is.infinite(lik)) lik <- -Inf
    if (return_stats) return(list(beta = b_ml, sigma = as.numeric(sqrt(sigmasq_ml))))
    else return(lik)
  }
  if (length(hp_range[['theta']]) == 1) {
    if (is.null(delta)) delta <- 0.05
    best_point <- hp_range
    best_delta <- delta
    best_params <- c(best_point, best_delta)
  }
  func_grad <- function(params, log_lik = log_likelihood) { #nocov start
    hp <- params[seq_along(hp_range)]
    delta <- params[length(params)]
    if (is.na(delta)) delta <- 0
    A <- corr_mat(inputs, hp, delta)
    a_det <- det(A)
    if (a_det < 0) a_det <- 1e-20
    A_inv <- tryCatch(chol2inv(chol(A)), error = function(e) ginv(A))
    outputs <- map_dbl(seq_len(nrow(outputs)), ~as.numeric(outputs[.,]))
    A_diff_theta <- -2*(1-delta) * as.matrix(dist(inputs, diag = TRUE, upper = TRUE))^2/params[1]^3 * A
    A_diff_delta <- -A + diag(1, nrow = length(outputs))
    if (is.null(beta)) {
      inv_mat <- tryCatch(chol2inv(chol(t(H) %*% A_inv %*% H)), error = function(e) ginv(t(H) %*% A_inv %*% H))
      b_ml <- inv_mat %*% t(H) %*% A_inv %*% outputs
    }
    else b_ml <- beta
    mod_diff <- if (nrow(H) == 1) t(outputs - H * b_ml) else outputs - H %*% b_ml
    sigmasq_ml <- t(mod_diff) %*% A_inv %*% mod_diff/length(outputs)
    if (log_lik)
      lik <- map_dbl(list(A_diff_theta, A_diff_delta),
                            ~-(length(outputs) * (t(mod_diff) %*% A_inv %*% . %*% A_inv %*% mod_diff)/(t(mod_diff) %*% A_inv %*% mod_diff) +
                                sum(diag(A_inv %*% .)))/2
                            )
    else
      lik <- map_dbl(list(A_diff_theta, A_diff_delta),
                            ~-(t(mod_diff) %*% A_inv %*% . %*% A_inv %*% mod_diff)/(2*length(outputs)*sigmasq_ml^(length(outputs)/2 + 1) * sqrt(a_det)) -
                              sum(diag(A_inv %*% .))/(2 * sigmasq_ml^(length(outputs)/2) * sqrt(a_det)))
    if (any(is.infinite(lik))) lik <- c(0,0)
    return(lik)
  } #nocov end
  if (length(hp_range[['theta']]) == 1) {
    if (is.null(delta)) delta <- 0.05
    best_point <- hp_range
    best_delta <- delta
    best_params <- c(best_point, best_delta)
  }
  else {
    if (corr_name == "matern") {
      grid_search <- expand.grid(
        theta = seq(hp_range$theta[[1]], hp_range$theta[[2]], length.out = nsteps*4),
        nu = c(0.5, 1.5, 2.5)
      )
    }
    else {
      grid_search <- expand.grid(map(hp_range,
                                            ~seq(.[[1]], .[[2]], length.out = nsteps)))
    }
    grid_liks <- apply(grid_search, 1, function(x) {
      if (is.null(delta)) return(func_to_opt(c(x, 0.01)))
      else return(func_to_opt(c(x, delta)))
    })
    dists <- diag(Inf, nrow(inputs)) +
      as.matrix(dist(inputs, diag = TRUE, upper = TRUE))
    maximin_distance <- max(apply(dists, 1, min))
    best_point <- setNames(
      as.list(c(grid_search[which.max(grid_liks),])),
      names(hp_range)
    )
    if (maximin_distance < hp_range[['theta']][2])
      best_point$theta <- max(
        maximin_distance, grid_search[which.max(grid_liks), 'theta']
      )
    if (sum(av) == length(inputs)) delta <- 0
    if (is.null(delta)) best_delta <- 0.05
    else best_delta <- ifelse(delta == 0, 1e-10, delta)
    initial_params <- unlist(c(best_point, best_delta), use.names = FALSE)
  }
  if (perform_opt) {
    if (corr$corr_name == "exp_sq")
      optimise <- optim(initial_params, fn = func_to_opt,
                        gr = func_grad, method = "L-BFGS-B",
                        lower = c(map_dbl(hp_range, ~.[[1]]), 0),
                        upper = c(map_dbl(hp_range, ~.[[2]]), 0.2),
                        control = list(fnscale = -1))
    else
      optimise <- optim(initial_params, fn = func_to_opt,
                        method = "L-BFGS-B",
                        lower = c(map_dbl(hp_range, ~.[[1]]), 0),
                        upper = c(map_dbl(hp_range, ~.[[2]]), 0.2),
                        control = list(fnscale = -1))
    best_params <- optimise$par
  }
  else {
    best_params <- unlist(c(best_point, best_delta), use.names = FALSE)
  }
  other_pars <- func_to_opt(best_params, return_stats = TRUE)
  return(list(hp = setNames(as.list(best_params[-length(best_params)]), names(hp_range)),
              delta = best_params[length(best_params)],
              sigma = other_pars$sigma, beta = other_pars$beta))
}

#' Generate Emulators from Data
#'
#' Given data from simulator runs, generates a set of \code{\link{Emulator}} objects,
#' one for each output.
#'
#' Many of the parameters that can be passed to this function are optional: the minimal operating
#' example requires \code{input_data}, \code{output_names}, and one of \code{ranges} or
#' \code{input_names}. If \code{ranges} is supplied, the input names are intuited from that list,
#' data.frame, or data.matrix; if only \code{input_names} is supplied, then ranges are
#' assumed to be [-1, 1] for each input.
#'
#' The ranges can be provided in a few different ways: either as a named list of length-2
#' numeric vectors (corresponding to upper and lower bounds for each parameter); as a
#' data.frame with 2 columns and each row corresponding to a parameter; or as a data.matrix
#' defined similarly as the data.frame. In the cases where the ranges are provided as a
#' data.frame or data.matrix, the \code{row.names} of the data object must be provided, and
#' a warning will be given if not.
#'
#' If the set \code{(input_data, output_names, ranges)} is provided and nothing else,
#' then emulators are fitted as follows. The basis functions and associated regression
#' coefficients are generated using linear regression up to quadratic order, allowing for
#' cross-terms. These regression parameters are assumed 'known'.
#'
#' The correlation function c(x, x') is assumed to be \code{\link{exp_sq}} and a corresponding
#' \code{\link{Correlator}} object is created. The hyperparameters of the correlation
#' structure are determined using a constrained maximum likelihood argument. This determines
#' the variance, correlation length, and nugget term.
#'
#' The maximum allowed order of the regression coefficients is controlled by \code{order};
#' the regression coefficients themselves can be deemed uncertain by setting
#' \code{beta.var = TRUE} (in which case their values can change in the hyperparameter
#' estimation); the hyperparameter search can be overridden by specifying ranges for
#' each using \code{hp_range}.
#'
#' In the presence of expert beliefs about the structure of the emulators, information
#' can be supplied directly using the \code{specified_priors} argument. This can contain
#' specific regression coefficient values \code{beta} and regression functions \code{func},
#' correlation structures \code{u}, hyperparameter values \code{hyper_p} and nugget term
#' values \code{delta}.
#'
#' Some rudimentary data handling functionality exists, but is not a substitute for
#' sense-checking input data directly. The \code{na.rm} option will remove rows of
#' training data that contain NA values if true; the \code{check.ranges} option allows
#' a redefinition of the ranges of input parameters for emulator training if true. The
#' latter is a common practice in later waves of emulation in order to maximise the
#' predictive power of the emulators, but should only be used if it is believed that
#' the training set provided is truly representative of and spans the full space of
#' interest.
#'
#' Various different classes of emulator can be created using this function, depending
#' on the nature of the model. The \code{emulator_type} argument accepts a few different
#' options:
#'
#' \describe{
#' \item{"variance"}{Create emulators for the mean and variance surfaces, for each stochastic output}
#' \item{"covariance}{Create emulators for the mean surface, and a covariance matrix for the variance surface}
#' \item{"multistate"}{Create sets of emulators per output for multistate stochastic systems}
#' \item{"default"}{Deterministic emulators with no covariance structure}
#' }
#'
#' The "default" behaviour will apply if the \code{emulator_type} argument is not supplied, or
#' does not match any of the above options. If the data provided looks to display stochasticity,
#' but default behaviour is used, a warning will be generated and only the first model result
#' for each individual parameter set will be used in training.
#'
#' For examples of this function's usage (including optional argument behaviour), see the examples.
#'
#' @param input_data Required. A data.frame containing parameter and output values
#' @param output_names Required. A character vector of output names
#' @param ranges Required if input_names is not given. A named list of input parameter ranges
#' @param input_names Required if ranges is not given. The names of the parameters
#' @param emulator_type Selects between deterministic, variance, covariance, and multistate emulation
#' @param specified_priors A collection of user-determined priors (see description)
#' @param order To what polynomial order should regression surfaces be fitted?
#' @param beta.var Should uncertainty in the regression coefficients be included?
#' @param corr_name If not exp_sq, the name of the correlation structures to fit
#' @param adjusted Should the return emulators be Bayes linear adjusted?
#' @param discrepancies Any known internal or external discrepancies of the model
#' @param verbose Should status updates be provided?
#' @param na.rm If TRUE, removes output values that are NA
#' @param check.ranges If TRUE, modifies ranges to a conservative minimum enclosing hyperrectangle
#' @param targets If provided, outputs are checked for consistent over/underestimation
#' @param has.hierarchy Internal - distinguishes deterministic from hierarchical emulators
#' @param covariance_opts User-specified options for emulating covariance matrices
#' @param ... Any additional parameters for custom correlators or additional verbosity options
#'
#' @importFrom rlang hash
#' @importFrom cluster daisy fanny
#' @importFrom stats coef formula model.matrix var
#' @importFrom purrr map map_dbl map_lgl map_chr map2_dbl
#'
#' @return An appropriately structured list of \code{\link{Emulator}} objects
#' @export
#'
#' @examples
#' # Deterministic: use the SIRSample training dataset as an example.
#' ranges <- list(aSI = c(0.1, 0.8), aIR = c(0, 0.5), aSR = c(0, 0.05))
#' out_vars <- c('nS', 'nI', 'nR')
#' ems_linear <- emulator_from_data(SIRSample$training, out_vars, ranges, order = 1)
#' ems_linear # Printout of the key information.
#'
#' # Stochastic: use the BirthDeath training dataset
#' v_ems <- emulator_from_data(BirthDeath$training, c("Y"),
#'  list(lambda = c(0, 0.08), mu = c(0.04, 0.13)), emulator_type = 'variance')
#'
#' # If different specifications are wanted for variance/expectation ems, then
#' # enter a list with entries 'variance', 'expectation'. Eg corr_names
#' v_ems_corr <- emulator_from_data(BirthDeath$training, c("Y"),
#'  list(lambda = c(0, 0.08), mu = c(0.4, 0.13)), emulator_type = 'variance',
#'  corr_name = list(variance = "matern", expectation = "exp_sq")
#' )
#'
#' \donttest{ # Excessive runtime
#'   ems_quad <- emulator_from_data(SIRSample$training, out_vars, ranges)
#'   ems_quad # Now includes quadratic terms
#'   ems_cub <- emulator_from_data(SIRSample$training, out_vars, ranges, order = 3)
#'   ems_cub # Up to cubic order in the parameters
#'
#'   ems_unadjusted <- emulator_from_data(SIRSample$training, out_vars, ranges, adjusted = FALSE)
#'   ems_unadjusted # Looks the same as ems_quad, but the emulators are not Bayes Linear adjusted
#'
#'   # Reproduce the linear case, but with slightly adjusted beta values
#'   basis_f <- list(
#'    c(function(x) 1, function(x) x[[1]], function(x) x[[2]]),
#'    c(function(x) 1, function(x) x[[1]], function(x) x[[2]]),
#'    c(function(x) 1, function(x) x[[1]], function(x) x[[3]])
#'   )
#'   beta_val <- list(
#'    list(mu = c(550, -400, 250)),
#'    list(mu = c(200, 200, -300)),
#'    list(mu = c(200, 200, -50))
#'   )
#'   ems_custom_beta <- emulator_from_data(SIRSample$training, out_vars, ranges,
#'    specified_priors = list(func = basis_f, beta = beta_val)
#'   )
#'   # Custom correlation functions
#'   corr_structs <- list(
#'    list(sigma = 83, corr = Correlator$new('exp_sq', list(theta = 0.5), nug = 0.1)),
#'    list(sigma = 95, corr = Correlator$new('exp_sq', list(theta = 0.4), nug = 0.25)),
#'    list(sigma = 164, corr = Correlator$new('matern', list(theta = 0.2, nu = 1.5), nug = 0.45))
#'   )
#'   ems_custom_u <- emulator_from_data(SIRSample$training, out_vars, ranges,
#'   specified_priors = list(u = corr_structs))
#'   # Allowing the function to choose hyperparameters for 'non-standard' correlation functions
#'   ems_matern <- emulator_from_data(SIRSample$training, out_vars, ranges, corr_name = 'matern')
#'   # Providing hyperparameters directly
#'   matern_hp <- list(
#'    list(theta = 0.8, nu = 1.5),
#'    list(theta = 0.6, nu = 2.5),
#'    list(theta = 1.2, nu = 0.5)
#'   )
#'   ems_matern2 <- emulator_from_data(SIRSample$training, out_vars, ranges, corr_name = 'matern',
#'    specified_priors = list(hyper_p = matern_hp))
#'   # "Custom" correaltion function with user-specified ranges: gamma exponential
#'   # Any named, defined, correlation function can be passed. See Correlator documentation
#'   ems_gamma <- emulator_from_data(SIRSample$training, out_vars, ranges, corr_name = 'gamma_exp',
#'    specified_priors = list(hyper_p = list(gamma = c(0.01, 2), theta = c(1/3, 2))))
#'
#'   # Multistate emulation: use the stochastic SIR dataset
#'   SIR_names <- c("I10", "I25", "I50", "R10", "R25", "R50")
#'   b_ems <- emulator_from_data(SIR_stochastic$training, SIR_names,
#'    ranges, emulator_type = 'multistate')
#'
#'   # Covariance emulation, with specified non-zero matrix elements
#'   which_cov <- matrix(rep(TRUE, 16), nrow = 4)
#'   which_cov[2,3] <- which_cov[3,2] <- which_cov[1,4] <- which_cov[4,1] <- FALSE
#'   c_ems <- emulator_from_data(SIR_stochastic$training, SIR_names[-c(3,6)], ranges,
#'    emulator_type = 'covariance', covariance_opts = list(matrix = which_cov))
#' }
#'
emulator_from_data <- function(input_data, output_names, ranges,
                               input_names = names(ranges), emulator_type = NULL,
                               specified_priors = NULL, order = 2, beta.var = FALSE,
                               corr_name = "exp_sq", adjusted = TRUE, discrepancies = NULL,
                               verbose = interactive(), na.rm = FALSE, check.ranges = TRUE,
                               targets = NULL, has.hierarchy = FALSE, covariance_opts = NULL, ...) {
  if (is.data.frame(input_data)) {
    ## Data cleaning and checking
    if (!all(output_names %in% names(input_data)))
      stop("output_names do not match data. Check data.frame.")
    if (!is.null(targets) && length(intersect(names(targets), output_names) == length(output_names))) {
      do_preflight <- preflight(input_data, targets[output_names], verbose = verbose, na.rm = na.rm)
      if (do_preflight && verbose) {
        cat("Some outputs may not be adequately emulated", #nocov start
            "due to consistent over/underestimation of outputs in training data.\n")
        cat("Consider looking at the outputs (e.g. using behaviour_plot);",
            "some outputs may require extra runs and/or transformation applied.\n") #nocov end
      }
    }
  }
  if (missing(ranges)) {
    if (is.null(input_names))
      stop("One of input_names or ranges must be provided.")
    warning("No ranges provided: inputs assumed to be in ranges [-1,1].")
    ranges <- setNames(map(input_names, ~c(-1,1)), input_names)
  }
  else {
    ranges <- convertRanges(ranges)
  }
  if (is.null(ranges) || any(sapply(ranges, diff) <= 0)) stop("Ranges either not specified, or misspecified.")
  if (any(grepl("\u00a3", names(ranges)))) {
    stop("Character \u00a3 not permitted in input names - please rename.")
  }
  if (is.null(emulator_type) || !is.character(emulator_type))
    emulator_type <- "default"
  if (!emulator_type %in% c("default", "variance", "covariance", "multistate"))
    emulator_type <- "default"
  if (is.data.frame(input_data))
    output_ranges <- map_dbl(output_names, ~diff(range(input_data[,.])))
  else if (all(output_names %in% names(input_data)))
    output_ranges <- map_dbl(output_names, ~diff(range(input_data[[.]][,.])))
  else
    output_ranges <- rep(1e-2, length(output_names))
  output_ranges[is.na(output_ranges)] <- 1e-2
  if (emulator_type == "multistate")
    input_data_backup <- input_data
  if (is.data.frame(input_data)) {
    data_by_point <- split_dataset(input_data, input_names)
    # unique_hash <- unique(apply(input_data[, input_names, drop = FALSE], 1, hash))
    # data_by_point <- map(unique_hash, function(x) {
    #   input_data[apply(input_data[,names(ranges),drop = FALSE], 1, hash) == x,]
    # })
    if (any(map_dbl(data_by_point, nrow) > 1) && emulator_type == "default") {
      if (FALSE) { #nocov start
        ## Hidden for now. Might change this at some point!
        # if (verbose) {
        want_variance <- readline(paste("emulator_type is default but multiple runs provided for some parameter sets.\n",
                                        "If variance or covariance was wanted, enter 'variance' or 'covariance'.\n",
                                        "Else hit RETURN.\n "))
        if (want_variance == "variance" || want_variance == "covariance") {
          cat("Changed\n")
          emulator_type <- want_variance
        }
        else
          input_data <- do.call('rbind.data.frame', map(data_by_point, ~.[1,,drop=FALSE]))
      } #nocov end
      else {
        warning("emulator_type is default but multiple runs provided for some parameter sets.
            Training to the mean of realisations.")
        input_data <- do.call('rbind.data.frame', map(data_by_point, ~c(.[1,names(ranges)], apply(.[,output_names,drop=FALSE], 2, mean)))) |>
          setNames(names(input_data))
        variabilities <- do.call('rbind.data.frame', map(data_by_point, ~apply(.[,output_names,drop=FALSE], 2, var))) |> setNames(output_names)
        ave_vars <- apply(variabilities, 2, mean)
        if (is.null(discrepancies))
          discrepancies <- map(ave_vars, ~list(external = 0, internal = sqrt(.))) |> setNames(output_names)
        else {
          for (i in output_names) {
            if (is.null(discrepancies[[i]])) discrepancies[[i]] <- list(external = 0, internal = sqrt(ave_vars[[i]]))
            else if (is.null(discrepancies[[i]]$internal)) discrepancies[[i]]$internal <- sqrt(ave_vars[[i]])
            else discrepancies[[i]]$internal <- sqrt(discrepancies[[i]]$internal^2 + ave_vars[[i]])
          }
        }
      }
    }
    if (all(map_dbl(data_by_point, nrow) == 1) && emulator_type != "default") {
      warning("emulator_type is not default but only one model run per point.
            Changing to emulator_type = 'default'.")
      emulator_type <- "default"
    }
    if (na.rm) {
      input_data <- map(output_names, function(o) {
        new_df <- input_data[,c(names(ranges), o)]
        return(new_df[apply(new_df, 1, function(x) !any(is.na(x))),])
      }) |> setNames(output_names)
    }
    else
      input_data <- map(output_names, ~input_data[,c(names(ranges), .)])
  }
  if (check.ranges) {
    ranges <- setNames(
      map(
        names(ranges), function(nm) {
          all_pts <- do.call('c', map(input_data, ~.[,nm]))
          c(
            max(ranges[[nm]][1], min(all_pts)-0.05*diff(range(all_pts))),
            min(ranges[[nm]][2], max(all_pts)+0.05*diff(range(all_pts)))
          )
        }
      ),
    names(ranges))
    ranges <- convertRanges(ranges)
  }
  not_enough_points <- map_lgl(input_data, ~nrow(.) < 10*length(ranges))
  if (verbose && any(not_enough_points)) {
    not_enough_output <- output_names[which(not_enough_points)]
    print_message <- paste("Fewer than", 10*length(ranges), #nocov start
                           "valid points in", length(ranges),
                           "dimensions for outputs", paste0(not_enough_output, collapse = ", "),
                           "- treat these emulated outputs with caution or include more",
                           "training points (min 10 times number of input parameters)." #nocov end
                           )
  }
  data <- map(input_data, function(dat) {
    temp_names <- names(dat)
    cbind.data.frame(eval_funcs(scale_input, dat[,names(ranges)], ranges), dat[,length(dat)]) |>
      setNames(temp_names)
  })
  if (length(ranges) != length(input_names)) {
    input_names <- intersect(names(ranges), input_names)
    ranges <- ranges[input_names]
  }
  if (is.null(list(...)[['more_verbose']]))
    more_verbose <- (length(output_names) > 10)
  else
    more_verbose <- list(...)[['more_verbose']]
  if (!is.null(list(...)[['add']]))
    this_add <- list(...)[['add']]
  else
    this_add <- FALSE
  model_beta_mus <- model_u_sigmas <- model_u_corrs <- NULL
  if (emulator_type == "default") {
    if (is.null(specified_priors$func)) {
      if (verbose) cat("Fitting regression surfaces...\n") #nocov
      models <- map(data, ~get_coefficient_model(., ranges, names(.)[length(.)],
                                                        order = order, verbose = more_verbose, add = this_add))
      output_names <- output_names[!map_lgl(models, is.null)]
      models <- models[!map_lgl(models, is.null)]
      if (length(models) == 0)
        stop("No outputs can be robustly represented by regression surfaces at this order.")
      model_beta_mus <- map(models, coef)
      model_basis_funcs <- map(models, function(m) {
        map(names(m$coefficients), name_to_function, names(ranges))
      })
      if (!beta.var)
        model_beta_sigmas <- map(models, ~diag(0, nrow = length(coef(.))))
      else
        model_beta_sigmas <- map(models, ~vcov(.))
    }
    else {
      models <- NULL
      if (!(is.null(specified_priors$beta) ||
            any(map_lgl(specified_priors$beta, ~is.null(.$mu))))) {
        beta_priors <- specified_priors$beta
        if (any(map_lgl(seq_along(beta_priors), ~length(beta_priors[[.]]$mu) != length(specified_priors$func[[.]]))))
          stop("Provided regression function and coefficient specifications do not match.")
        model_beta_mus <- map(beta_priors, "mu")
        if (!any(map_lgl(beta_priors, ~is.null(.$sigma))))
          model_beta_sigmas <- map(beta_priors, "sigma")
        else
          model_beta_sigmas <- map(beta_priors, function(bp) {
            if (is.null(bp$sigma) || nrow(bp$sigma) != length(bp$mu) || ncol(bp$sigma) != length(bp$mu))
              return(diag(0, nrow = length(bp$mu)))
            return(bp$sigma)
          })
        model_basis_funcs <- specified_priors$func
      }
      else {
        model_basis_funcs <- specified_priors$func
        model_beta_sigmas <- map(model_basis_funcs, ~diag(0, nrow = length(.)))
      }
    }
    if (!is.null(specified_priors$delta)) model_deltas <- specified_priors$delta
    else model_deltas <- NULL
    if (!(is.null(specified_priors$u) || any(map_lgl(specified_priors$u, ~is.null(.$sigma) || is.null(.$corr))))) {
      model_u_sigmas <- map(specified_priors$u, "sigma")
      model_u_corrs <- map(specified_priors$u, "corr")
    }
    if (verbose) cat("Building correlation structures...\n") #nocov
    if (is.null(model_beta_mus) || is.null(model_u_sigmas) || is.null(model_u_corrs)) {
      corr_func <- tryCatch(
        get(corr_name),
        error = function(e) {
          warning(paste("Can't find correlation function of type",
                        corr_name, "- reverting to 'exp_sq'"))
          return(NULL)
        }
      )
      if (is.null(specified_priors$hyper_p)) {
        if (is.null(corr_func)) corr_name <- "exp_sq"
        else {
          if (!corr_name %in% c("exp_sq", "orn_uhl", "matern", "rat_quad")) {
            th_ra <- list(...)[["theta_ranges"]]
            if (is.null(th_ra)) {
              warning(paste("User-defined correlation function", corr_name,
                            "found but no corresponding hyperparameter ranges
                            via theta_ranges. Reverting to 'exp_sq'"))
              corr_name <- 'exp_sq'
            }
            else {
              theta_ranges <- list()
              for (i in seq_along(model_basis_funcs))
                theta_ranges[[length(theta_ranges)+1]] <- th_ra
            }
          }
        }
        if (corr_name == "exp_sq" || corr_name == "orn_uhl")
          theta_ranges <- map(model_basis_funcs,
                                     ~list(theta = c(min(2/(order+1), 1/3), 2/order)))
        else if (corr_name == "matern")
          theta_ranges <- map(model_basis_funcs,
                                     ~list(theta = c(min(2/(order+1), 1/3), 2/order),
                                           nu = c(0.5, 2.5)))
        else if (corr_name == "rat_quad")
          theta_ranges <- map(model_basis_funcs,
                                     ~list(theta = c(min(2/(order+1), 1/3), 2/order),
                                           alpha = c(-1, 1)))
      }
      else {
        if (corr_name == "exp_sq" || corr_name == "orn_uhl") {
          if (length(specified_priors$hyper_p) == 1)
            theta_ranges <- map(model_basis_funcs,
                                       ~list(theta = specified_priors$hyper_p))
          else
            theta_ranges <- map(seq_along(model_basis_funcs),
                                       ~list(theta = specified_priors$hyper_p[[.]]))
        }
        else {
          if (is.null(names(specified_priors$hyper_p)))
            theta_ranges <- map(seq_along(model_basis_funcs),
                                       ~specified_priors$hyper_p[[.]])
          else
            theta_ranges <- map(seq_along(model_basis_funcs),
                                       ~specified_priors$hyper_p)
        }
      }
      if (is.null(models))
        specs <- map(seq_along(model_basis_funcs),
                            ~hyperparameter_estimate(
                              data[[.]][,input_names],
                              data[[.]][,length(data),drop=FALSE],
                              model_basis_funcs[[.]],
                              corr_name = corr_name,
                              hp_range = theta_ranges[[.]],
                              beta = model_beta_mus[[.]],
                              delta = model_deltas,
                              verbose = more_verbose
                            ))
      else
        specs <- map(seq_along(model_basis_funcs),
                            ~hyperparameter_estimate(
                              data[[.]][,input_names,drop=FALSE],
                              data[[.]][,length(data[[.]]),drop=FALSE],
                              models[[.]],
                              corr_name = corr_name,
                              hp_range = theta_ranges[[.]],
                              beta = model_beta_mus[[.]],
                              delta = model_deltas,
                              verbose = more_verbose
                            ))
      if (is.null(model_u_sigmas)) model_u_sigmas <-
          tryCatch(
            map(seq_along(specs), ~max(output_ranges[[.]]*1e-3, as.numeric(specs[[.]]$sigma))),
            error = function(e) {
              map(seq_along(specs), ~as.numeric(specs[[.]]$sigma))
            }
          )
      if (is.null(model_beta_mus)) model_beta_mus <- map(specs, ~.$beta)
      if (is.null(model_u_corrs)) model_u_corrs <- map(specs, ~Correlator$new(corr_name, hp = .$hp, nug = .$delta))
    }
    model_us <- map(seq_along(model_u_corrs),
                           ~list(sigma = model_u_sigmas[[.]], corr = model_u_corrs[[.]]))
    model_betas <- map(seq_along(model_beta_mus),
                              ~list(mu = model_beta_mus[[.]], sigma = model_beta_sigmas[[.]]))
    if (!is.null(discrepancies)) {
      if (is.numeric(discrepancies)) discrepancies <- map(discrepancies,
                                                                 ~list(internal = ., external = 0))
    }
    else {
      discrepancies <- map(model_betas, ~list(internal = 0, external = 0))
    }
    if (verbose) cat("Creating emulators...\n") #nocov
    if (!has.hierarchy) {
      out_ems <- setNames(map(seq_along(model_us),
                                     ~Emulator$new(basis_f = model_basis_funcs[[.]],
                                                   beta = model_betas[[.]],
                                                   u = model_us[[.]],
                                                   ranges = ranges,
                                                   model = tryCatch(models[[.]], error = function(e) NULL),
                                                   discs = discrepancies[[.]],
                                                   )),
                          output_names)
    } else {
      out_ems <- setNames(map(seq_along(model_us),
                                     ~HierarchicalEmulator$new(basis_f = model_basis_funcs[[.]],
                                                               beta = model_betas[[.]],
                                                               u = model_us[[.]],
                                                               ranges = ranges,
                                                               model = tryCatch(models[[.]], error = function(e) NULL),
                                                               discs = discrepancies[[.]])),
                          output_names)
    }
    if (!is.null(model_deltas))
      for (i in seq_along(model_deltas)) out_ems[[i]]$corr$nugget <- model_deltas[[i]]
    for (i in seq_along(out_ems)) out_ems[[i]]$output_name <- output_names[[i]]
    if (adjusted) {
      if (verbose) cat("Performing Bayes linear adjustment...\n") #nocov
      out_ems <- map(seq_along(out_ems), ~out_ems[[.]]$adjust(input_data[[.]], out_ems[[.]]$output_name))
    }
    return(setNames(out_ems, output_names))
  }
  ## Variance emulation starts here
  if (verbose) cat("Multiple model runs per point detected. Splitting by input parameters...\n") #nocov
  unique_points <- unique(do.call('rbind.data.frame', map(input_data, ~.[,input_names])))
  unique_uids <- apply(unique_points, 1, hash)
  data_by_point <- map(input_data, split_dataset, input_names)
  data_by_point <- map(data_by_point, function(dat) {
    dat[map_lgl(dat, ~nrow(.) > 1)]
  })
  split_hashes <- map(data_by_point, function(d) {
    map_chr(d, ~apply(.[,input_names], 1, hash)[1])
  })
  param_sets <- map(unique_uids, function(uid) {
    output_vals <- map(seq_along(split_hashes), function(sh) {
      which_matches <- which(split_hashes[[sh]] == uid)
      if (length(which_matches) == 0) return(NULL)
      return(c(data_by_point[[sh]][[which_matches[1]]][,length(data_by_point[[sh]][[which_matches[1]]])]))
    })
    largest_length <- max(map_dbl(output_vals, length))
    output_vals_padded <- map(output_vals, ~c(., rep(NA, largest_length-length(.))))
    which_matches <- which(unique_uids == uid)[1]
    which_point <- unique_points[which_matches,]
    return(cbind.data.frame(which_point[rep(1,largest_length),], do.call('cbind.data.frame', output_vals_padded)) |>
             setNames(c(input_names, output_names)))
  })
  if (verbose) cat("Separated dataset by unique points...\n") #nocov
  ## Multistate emulation goes here, so that if it isn't multistate we can go to variance emulation
  if (emulator_type == "multistate") {
    proportion <- map_dbl(param_sets, function(x) {
      p_clust <- suppressWarnings(fanny(suppressWarnings(daisy(x[,output_names, drop = FALSE])), k = 2)$clustering)
      return(sum(p_clust == 1)/length(p_clust))
    })
    unique_points <- unique(input_data_backup[,names(ranges)])
    prop_df <- cbind.data.frame(unique_points, proportion) |> setNames(c(names(unique_points), "prop"))
    has_bimodality <- do.call('rbind.data.frame', map(param_sets, function(x) {
      map_lgl(output_names, function(y) {
        if (length(unique(x[,y])) == 1) return(FALSE)
        clust1 <- suppressWarnings(fanny(suppressWarnings(daisy(x[,y,drop=FALSE])), k = 1))
        clust2 <- suppressWarnings(fanny(suppressWarnings(daisy(x[,y,drop=FALSE])), k = 2))
        if (clust1$objective[["objective"]] < clust2$objective[["objective"]]) return(FALSE)
        return(TRUE)
      })
    })) |> setNames(output_names)
    is_bimodal_target <- apply(has_bimodality, 2, function(x) sum(x)/length(x) >= 0.1)
    if (!any(is_bimodal_target)) {
      if (verbose) cat("No targets appear to be multistate. Reverting to variance emulation.\n") #nocov
      emulator_type <- "variance"
    }
    if (emulator_type == "multistate") {
      if (verbose) cat("Training an emulator to proportion in each mode.\n") #nocov
      prop_em <- emulator_from_data(prop_df, c('prop'), ranges, verbose = FALSE,
                                    specified_priors = specified_priors$prop, order = order,
                                    beta.var = beta.var, corr_name = corr_name, adjusted = adjusted,
                                    discrepancies = discrepancies$prop, na.rm = na.rm, check.ranges = check.ranges,
                                    targets = targets$prop, has.hierarchy = FALSE, covariance_opts = NULL, ...)
      if (any(!is_bimodal_target)) {
        if (verbose) cat("Training to single-state targets.\n") #nocov
        non_bimodal <- emulator_from_data(input_data_backup, output_names[!is_bimodal_target], ranges, verbose = FALSE,
                                          specified_priors = specified_priors, order = order, emulator_type = "variance",
                                          beta.var = beta.var, corr_name = corr_name, adjusted = adjusted,
                                          discrepancies = discrepancies, na.rm = na.rm, check.ranges = check.ranges,
                                          targets = targets, has.hierarchy = TRUE, covariance_opts = covariance_opts, ...)

      }
      else {
        if (verbose) {
          cat("No targets appear to be single-state.\n") #nocov
        }
        non_bimodal <- NULL
      }
      if (verbose) cat("Training to multistate targets.\n") #nocov
      bimodal <- map(output_names[is_bimodal_target], function(x) {
        c1_data <- list()
        c2_data <- list()
        param_bimodal <- has_bimodality[,x]
        for (i in seq_along(param_bimodal)) {
          if (!param_bimodal[[i]]) {
            c1_data[[length(c1_data)+1]] <- param_sets[[i]]
            c2_data[[length(c2_data)+1]] <- param_sets[[i]]
          }
          else {
            this_clust <- fanny(suppressWarnings(daisy(param_sets[[i]][,x,drop=FALSE])), k=2)$clustering
            c1_data[[length(c1_data)+1]] <- param_sets[[i]][this_clust == 1,]
            c2_data[[length(c2_data)+1]] <- param_sets[[i]][this_clust == 2,]
          }
        }
        mode1_dat <- do.call('rbind.data.frame', c1_data)[,c(names(ranges), x)]
        mode2_dat <- do.call('rbind.data.frame', c2_data)[,c(names(ranges), x)]
        return(list(mode1 = mode1_dat, mode2 = mode2_dat))
      }) |> setNames(output_names[is_bimodal_target])
      mode1_dats <- map(bimodal, "mode1") |> setNames(output_names[is_bimodal_target])
      mode2_dats <- map(bimodal, "mode2") |> setNames(output_names[is_bimodal_target])
      mode1_ems <- emulator_from_data(mode1_dats, names(mode1_dats), ranges, verbose = FALSE, emulator_type = "variance",
                                      specified_priors = specified_priors, order = order, beta.var = beta.var,
                                      corr_name = corr_name, adjusted = adjusted, discrepancies = discrepancies,
                                      na.rm = na.rm, check.ranges = check.ranges, targets = targets, has.hierarchy = TRUE,
                                      covariance_opts = covariance_opts, more_verbose = FALSE, ...)
      mode2_ems <- emulator_from_data(mode2_dats, names(mode2_dats), ranges, verbose = FALSE, emulator_type = "variance",
                                      specified_priors = specified_priors, order = order, beta.var = beta.var,
                                      corr_name = corr_name, adjusted = adjusted, discrepancies = discrepancies,
                                      na.rm = na.rm, check.ranges = check.ranges, targets = targets, has.hierarchy = TRUE,
                                      covariance_opts = covariance_opts, more_verbose = FALSE, ...)
      if (verbose) cat("Trained emulators. Collating.\n") #nocov
      m1exps <- m2exps <- m1vars <- m2vars <- list()
      for (i in output_names) {
        if (i %in% names(non_bimodal$expectation)) {
          m1exps <- c(m1exps, non_bimodal$expectation[[i]])
          m2exps <- c(m2exps, non_bimodal$expectation[[i]])
          m1vars <- c(m1vars, non_bimodal$variance[[i]])
          m2vars <- c(m2vars, non_bimodal$variance[[i]])
        }
        else {
          m1exps <- c(m1exps, mode1_ems$expectation[[i]])
          m2exps <- c(m2exps, mode2_ems$expectation[[i]])
          m1vars <- c(m1vars, mode1_ems$variance[[i]])
          m2vars <- c(m2vars, mode2_ems$variance[[i]])
        }
      }
      names(m1exps) <- names(m1vars) <-
        names(m2exps) <- names(m2vars) <- output_names
      return(list(
        mode1 = list(expectation = m1exps, variance = m1vars),
        mode2 = list(expectation = m2exps, variance = m2vars),
        prop = prop_em[[1]]
      ))
    }
  }
  if (length(output_names) == 1 && emulator_type == "covariance") {
    warning("Covariance emulation selected, but only one output. Changing to emulator_type = 'variance")
    emulator_type <- "variance"
  }
  if (emulator_type == "variance") {
    ## This is a bit of a cludge to deal with bimodal emulators where
    ## one parameter doesn't contribute to an output
    data_by_point <- map(data_by_point, function(dat) {
      dat_out_name <- names(dat)[length(names(dat))]
      unique_points_in_dat <- map_chr(dat, function(subdat) {
        apply(subdat[,names(ranges)], 1, hash)[1]
      })
      which_missing <- which(!unique_uids %in% unique_points_in_dat)
      if (length(which_missing) == 0) return(dat)
      map(seq_along(unique_uids), function(i) {
        if (!i %in% which_missing) return(dat[[which(unique_points_in_dat == unique_uids[i])]])
        else {
          ## This ALMOST works. But I think it's having downstream effects.
          fake_df <- data.frame(matrix(
            c(rep(0, length(ranges)), NA, rep(0, length(ranges)), NA),
            nrow = 2, byrow = TRUE
          )) |> setNames(c(names(ranges), dat_out_name))
          return(fake_df)
        }
      })
    })
    collected_stats <- map(data_by_point, function(dat) {
      lapply(dat, function(x) {
        n_points <- nrow(x)
        out_vals <- x[,length(x)]
        mu <- mean(out_vals, na.rm = TRUE)
        sigsq <- var(out_vals, na.rm = TRUE)
        kurt <- kurtosis(out_vals, na.rm = TRUE)
        vofv <- sigsq^2 * (kurt - 1 + 2/(n_points-1))/n_points
        kurt_rel <- ifelse(!is.nan(kurt) && !is.na(kurt) && (vofv/sigsq^2) <= 1, kurt, NA)
        return(list(
          point = as.numeric(x[1,input_names], use.names = FALSE),
          mean = mu,
          var = sigsq,
          kurt = kurt_rel,
          np = n_points
        ))
      })
    })
    collected_df_pts <- do.call('rbind.data.frame', map(collected_stats[[1]], "point")) |>
      setNames(input_names)
    collected_df_var <- cbind.data.frame(
      collected_df_pts,
      do.call('cbind.data.frame', map(collected_stats, ~map_dbl(., "var")))
    ) |>
      setNames(c(input_names, output_names))
    collected_df_mean <- cbind.data.frame(
      collected_df_pts,
      do.call('cbind.data.frame', map(collected_stats, ~map_dbl(., "mean")))
    ) |>
      setNames(c(input_names, output_names))
    collected_df_kurt <- cbind.data.frame(
      collected_df_pts,
      do.call('cbind.data.frame', map(collected_stats, ~map_dbl(., "kurt")))
    ) |>
      setNames(c(input_names, output_names))
    collected_df_kurt <- cbind.data.frame(collected_df_kurt,
                                          do.call('cbind.data.frame', map(collected_stats, ~map_dbl(., "np")))) |>
      setNames(c(input_names, output_names, paste0(output_names, "_n")))
  }
  if (emulator_type == "covariance") {
    cov_out_names <- outer(output_names, output_names, paste0)
    data_by_point <- map(seq_along(data_by_point[[1]]), function(i) {
      cbind.data.frame(data_by_point[[1]][[i]][,names(ranges)],
                       do.call('cbind.data.frame', map(data_by_point, ~.[[i]][,length(.[[i]])]))) |>
        setNames(c(names(ranges), output_names))
    })
    collected_stats <- lapply(data_by_point, function(x) {
      n_points <- nrow(x)
      means <- apply(x[,output_names], 2, mean)
      covs <- cov(x[,output_names])
      kurts <- apply(x[,output_names], 2, kurtosis)
      vofv <- diag(covs)^2 * (kurts - 1 + 2/(n_points-1))/n_points
      kurt_relev <- map_dbl(seq_along(kurts), function(y) {
        if (!is.nan(kurts[y]) && (vofv/diag(covs)^2)[y] <= 1) kurts[y] else NA
      })
      if (!is.null(covariance_opts$logged) && covariance_opts$logged) {
        c_est <- eigen(covs)
        c_vals <- c_est$values
        c_vals[c_vals <= 0] <- 1e-8
        c_vals <- log(c_vals)
        covs <- c_est$vectors %*% diag(c_vals) %*% t(c_est$vectors)
      }
      return(
        list(
          point = as.numeric(x[1,input_names], use.names = FALSE),
          means = means,
          covs = covs,
          kurt = kurt_relev,
          np = n_points
        )
      )
    })
    collected_df_var <- do.call('rbind.data.frame',
                                map(collected_stats, function(x) {
                                  c(
                                    x$point, diag(x$covs)
                                  )
                                })) |>
      setNames(c(input_names, output_names))
    collected_df_cov <- do.call('rbind.data.frame',
                                map(collected_stats, function(x) {
                                  c(
                                    x$point, c(x$covs[upper.tri(x$covs)])
                                  )
                                })) |>
      setNames(c(input_names, cov_out_names[upper.tri(cov_out_names)]))
    collected_df_mean <- do.call('rbind.data.frame',
                                 map(collected_stats, function(x) {
                                   c(
                                     x$point, x$means
                                   )
                                 })) |>
      setNames(c(input_names, output_names))
    collected_df_kurt <- do.call('rbind.data.frame',
                                  map(collected_stats, function(x) {
                                    c(
                                      x$point, x$kurt, x$np
                                    )
                                  })) |>
      setNames(c(input_names, output_names, "n"))
  }
  if (verbose) cat("Computed summary statistics...\n") #nocov
  if (verbose) cat("Building variance emulator priors...\n") #nocov
  which_high_rep <- do.call('cbind.data.frame',
                            map(output_names,
                                       ~!is.na(collected_df_kurt[,.]))) |>
    setNames(output_names)
  which_high_rep[!which_high_rep] <- NA
  collected_df_var_model <- collected_df_var
  collected_df_var_model[,output_names] <- collected_df_var_model[,output_names] * which_high_rep
  if (!is.character(corr_name) && !is.null(corr_name$variance)) corr_name_var <- corr_name$variance
  else corr_name_var <- corr_name
  variance_emulators <- emulator_from_data(collected_df_var_model, output_names, ranges,
                                           input_names = input_names, emulator_type = "default",
                                           specified_priors = specified_priors$variance,
                                           order = max(order-1, 1), beta.var = beta.var,
                                           corr_name = corr_name_var, adjusted = FALSE,
                                           discrepancies = discrepancies$variance,
                                           verbose = FALSE, na.rm = TRUE, check.ranges = FALSE,
                                           targets = targets$variance, has.hierarchy = TRUE, ...)
  kurt_aves <- apply(collected_df_kurt[,output_names,drop=FALSE], 2, function(x) {
    if (sum(x[!is.na(x)]) < 2) return(3)
    return(mean(x[!is.na(x)]))
  })
  trained_var_ems <- map(names(variance_emulators), function(v_name) {
    if (round(variance_emulators[[v_name]]$u_sigma, 10) <= 0) {
      s_vars <- collected_df_var[!is.na(collected_df_kurt[,v_name]), v_name]
      if (emulator_type == "covariance")
        s_n <- collected_df_kurt[!is.na(collected_df_kurt[,v_name]), "n"]
      else
        s_n <- collected_df_kurt[!is.na(collected_df_kurt[,v_name]), paste0(v_name, "_n")]
      sig_est <- s_vars^2 * (kurt_aves[v_name]-1+2/(s_n-1))/s_n
      variance_emulators[[v_name]]$u_sigma <- sqrt(mean(sig_est))
    }
    var_mod <- function(x, n) {
      if (n > 1)
        return((variance_emulators[[v_name]]$get_exp(x)^2 + variance_emulators[[v_name]]$get_cov(x))/n *
                 (kurt_aves[v_name] - 1 + 2/(n-1)))
      return(0)
    }
    variance_emulators[[v_name]]$s_diag <- var_mod
    variance_emulators[[v_name]]$em_type <- "variance"
    valid_cov_indices <- which(!is.na(collected_df_var[,v_name]))
    if (emulator_type == "covariance")
      variance_emulators[[v_name]]$samples <- collected_df_kurt$n
    else
      variance_emulators[[v_name]]$samples <- collected_df_kurt[valid_cov_indices,paste0(v_name, "_n")]
    adjust_df <- collected_df_var[valid_cov_indices ,c(input_names, v_name)]
    return(variance_emulators[[v_name]]$adjust(adjust_df, v_name))
  }) |> setNames(output_names)
  if (verbose && emulator_type != "covariance") cat("Completed variance emulators...\n") #nocov
  if (verbose) cat("Creating prior (untrained) mean emulators...\n") #nocov
  if (!is.character(corr_name) && !is.null(corr_name$expectation)) corr_name_exp <- corr_name$expectation
  else corr_name_exp <- corr_name
  mean_emulators <- emulator_from_data(collected_df_mean, output_names, ranges,
                                       input_names = input_names, emulator_type = "default",
                                       specified_priors = specified_priors$expectation,
                                       order = order, beta.var = beta.var,
                                       corr_name = corr_name_exp, adjusted = FALSE,
                                       discrepancies = discrepancies$expectation,
                                       verbose = FALSE, na.rm = TRUE, check.ranges = FALSE,
                                       targets = targets$expectation, has.hierarchy = TRUE, ...)
  if (verbose && emulator_type == "covariance") cat("Variance emulators trained. Creating covariance emulators...\n") #nocov
  ### Covariance emulation here...
  if (emulator_type == "covariance") {
    name_to_time <- function(names) {
      t_names <- sub(".*[^\\d](\\d+)$", "\\1", names, perl = TRUE)
      return(suppressWarnings(as.numeric(t_names)))
    }
    multi_point_cov <- function(i, j, x, theta.t, rho, v_ems) {
      part_indices <- partition_by_output(data, output_names, FALSE, TRUE)
      i_name <- v_ems[[i,i]]$output_name
      j_name <- v_ems[[j,j]]$output_name
      if (nrow(rho) == 1) group1 <- group2 <- 1
      else {
        group1 <- which(map_lgl(part_indices, ~i_name %in% .))
        group2 <- which(map_lgl(part_indices, ~j_name %in% .))
      }
      t_nam <- name_to_time(c(i_name, j_name))
      if (any(is.na(t_nam))) t_nam <- rep(0, 2)
      rho[group1, group2] * exp(-diff(t_nam)^2/theta.t^2) * sqrt(
        v_ems[[i,i]]$get_cov(x) * v_ems[[j,j]]$get_cov(x)
      )
    }
    comb_rv <- function(i, j, x, v_ems, k_est = 3) {
      if (length(k_est) == 1) k_est <- c(k_est, k_est)
      rvi <- (k_est[1]-1)*(v_ems[[i,i]]$get_cov(x) + v_ems[[i,i]]$get_exp(x)^2)
      rvj <- (k_est[2]-1)*(v_ems[[j,j]]$get_cov(x) + v_ems[[j,j]]$get_exp(x)^2)
      return(map2_dbl(rvi, rvj, min))
    }
    cov_vt <- function(x, i, j, n, cov_ems, theta.t, rho, k_est = 3) {
      if (i > j) {
        temp <- j
        j <- i
        i <- temp
      }
      1/n * comb_rv(i, j, x, cov_ems, k_est) + 1/(n*(n-1)) * (
        multi_point_cov(i, j, x, theta.t, rho, cov_ems) +
          cov_ems[[i,i]]$get_exp(x) * cov_ems[[j,j]]$get_exp(x) +
          cov_ems[[i,j]]$get_exp(x)^2
      )
    }
    if (is.null(covariance_opts$matrix))
      covariance_opts$matrix <- matrix(TRUE, nrow = length(variance_emulators), ncol = length(variance_emulators))
    which_outputs <- cov_out_names[upper.tri(cov_out_names) & covariance_opts$matrix]
    init_cov_ems <- emulator_from_data(collected_df_cov, which_outputs, ranges, input_names = input_names,
                                       emulator_type = "default", specified_priors = specified_priors$covariance,
                                       order = max(1, order - 1), beta.var = beta.var, corr_name = corr_name,
                                       adjusted = FALSE, discrepancies = discrepancies$covariance,
                                       verbose = FALSE, more_verbose = FALSE, na.rm = TRUE,
                                       check.ranges = FALSE, targets = targets$covariance, has.hierarchy = TRUE,
                                       ...)
    zero_em <- Proto_emulator$new(
      output_name = "zero_emulator",
      ranges = init_cov_ems[[1]]$ranges,
      predict_func <- function(x) return(rep(0, max(1,nrow(x)))),
      variance_func <- function(x) return(rep(1, max(1,nrow(x))))
    )
    ## Got to at least here
    init_cov_mat <- matrix(nrow = length(variance_emulators), ncol = length(variance_emulators))
    init_cov_mat[upper.tri(init_cov_mat) & covariance_opts$matrix] <- init_cov_ems
    init_cov_mat <- matrix(init_cov_mat, nrow = length(variance_emulators), ncol = length(variance_emulators))
    how_many_zero <- sum(upper.tri(init_cov_mat) & !covariance_opts$matrix)
    if (how_many_zero != 0)
      init_cov_mat[upper.tri(init_cov_mat) & !covariance_opts$matrix] <- list(zero_em)
    init_cov_mat[col(init_cov_mat) == row(init_cov_mat)] <- variance_emulators
    recomb_input_data <- cbind.data.frame(
      input_data[[1]][,input_names],
      do.call('cbind.data.frame', map(input_data, ~.[,length(.)]))
    ) |> setNames(c(input_names, output_names))
    if (is.null(covariance_opts$rho)) rho_mat <- get_mpc_rho_est(recomb_input_data, output_names)
    else rho_mat <- covariance_opts$rho
    if (is.null(covariance_opts$theta)) theta_val <- get_mpc_theta_est(recomb_input_data, output_names, variance_emulators, rho_mat)
    else theta_val <- covariance_opts$theta
    if (theta_val == 0 || is.nan(theta_val)) theta_val <- 1
    trained_cov_ems <- map(1:length(init_cov_ems), function(i) {
      indices <- as.numeric(which(cov_out_names == init_cov_ems[[i]]$output_name, arr.ind = TRUE))
      kurts <- kurt_aves[indices]
      init_cov_ems[[i]]$s_diag <- function(x, n) {
        cov_vt(x, indices[1], indices[2], n, init_cov_mat, theta_val, rho_mat, kurts)
      }
      init_cov_ems[[i]]$samples <- collected_df_kurt$n
      init_cov_ems[[i]]$em_type <- "covariance"
      init_cov_ems[[i]]$adjust(collected_df_cov, init_cov_ems[[i]]$output_name)
    })
    trained_cov_mat <- matrix(nrow = length(trained_var_ems), ncol = length(trained_var_ems))
    trained_cov_mat[upper.tri(trained_cov_mat) & covariance_opts$matrix] <- trained_cov_ems
    trained_cov_mat <- matrix(trained_cov_mat, nrow = length(trained_var_ems), ncol = length(trained_var_ems))
    trained_cov_mat[upper.tri(trained_cov_mat) & !covariance_opts$matrix] <- list(zero_em)
    trained_cov_mat[col(trained_cov_mat) == row(trained_cov_mat)] <- trained_var_ems
  }
  ## Couldn't have got to here
  ## Mean emulators
  if (verbose) {
    if (emulator_type == "covariance")
      cat("Completed covariance emulators. Training mean emulators...\n")
    else
      cat("Training mean emulators...\n")
  }
  trained_mean_ems <- map(output_names, function(m_name) {
    if (!is.null(covariance_opts$logged) && covariance_opts$logged)
      mean_emulators[[m_name]]$s_diag <- function(x, n) exp(trained_var_ems[[m_name]]$get_exp(x))/n
    else
      mean_emulators[[m_name]]$s_diag <- function(x, n) trained_var_ems[[m_name]]$get_exp(x)/n
    valid_mean_indices <- which(!is.nan(collected_df_mean[,m_name]))
    if (emulator_type == "covariance")
      mean_emulators[[m_name]]$samples <- collected_df_kurt$n
    else
      mean_emulators[[m_name]]$samples <- collected_df_kurt[valid_mean_indices, paste0(m_name, "_n")]
    valid_mean <- collected_df_mean[valid_mean_indices, c(input_names, m_name)]
    return(mean_emulators[[m_name]]$adjust(valid_mean, m_name))
  }) |> setNames(output_names)
  if (emulator_type == "covariance")
    return(list(variance = EmulatedMatrix$new(trained_cov_mat, theta_val, rho_mat, logged = (!is.null(covariance_opts$logged) && covariance_opts$logged)),
                expectation = trained_mean_ems))
  return(list(variance = trained_var_ems, expectation = trained_mean_ems))
}

#' Variance Emulator Creation (Deprecated)
#'
#' Trains hierarchical emulators to stochastic systems
#'
#' This function is deprecated in favour of using \code{\link{emulator_from_data}}
#' with argument \code{emulator_type = "variance"}. See the associated help file.
#'
#' For stochastic systems, one may emulate the variance as well as the function itself.
#' This is particularly true if one expects the variance to be very different in different
#' areas of the parameter space (for example, in an epidemic model). This function performs
#' the requisite two-stage Bayes Linear update.
#'
#' All observations are required (including replicates at points) - this function collects
#' them into the required chunks and calculates the summary statistics as required.
#'
#' All other parameters passed to this function are equivalent to those in
#' emulators are the Bayes Linear adjusted forms.
#'
#' @importFrom stats var
#'
#' @param input_data All model runs at all points.
#' @param output_names The observation names.
#' @param ranges A named list of parameter ranges
#' @param input_names The names of the parameters (if \code{ranges} is not provided).
#' @param verbose Should status updates be printed to console?
#' @param na.rm Should NA values be removed before training?
#' @param ... Optional parameters that can be passed to \code{link{emulator_from_data}}.
#'
#' @return A list of lists: one for the variance emulators and one for the function emulators.
#'
#' @references Goldstein & Vernon (2016) in preparation
#'
#' @examples
#' \donttest{ # Excessive runtime
#'  # A simple example using the BirthDeath dataset
#'  v_ems <- variance_emulator_from_data(BirthDeath$training, c("Y"),
#'   list(lambda = c(0, 0.08), mu = c(0.04, 0.13)), c_lengths = c(0.75))
#' }
#'
#' @export
variance_emulator_from_data <- function(input_data, output_names, ranges,
                                        input_names = names(ranges),
                                        verbose = interactive(), na.rm = FALSE, ...) {
  .Deprecated(new = "emulator_from_data", msg = "variance_emulator_from_data(...) is deprecated in favour of emulator_from_data(..., emulator_type = 'variance').")
  if (na.rm) input_data <- input_data[apply(
    input_data, 1, function(x) !any(is.na(x))),]
  unique_points <- unique(input_data[, input_names])
  uids <- apply(unique_points, 1, hash)
  data_by_point <- map(uids, function(x) {
    input_data[apply(input_data[,names(ranges)], 1, hash) == x,]
  })
  data_by_point <- data_by_point[map_lgl(data_by_point, ~nrow(.)>1)]
  if (verbose) cat("Separated dataset by unique points...\n") #nocov
  collected_stats <- do.call('rbind', lapply(data_by_point, function(x) {
    n_points <- nrow(x)
    if (length(output_names) == 1) {
      means <- mean(x[,output_names])
      vars <- var(x[,output_names])
      kurts <- kurtosis(x[,output_names])
    }
    else {
      means <- apply(x[,output_names], 2, mean)
      vars <- apply(x[,output_names], 2, var)
      kurts <- apply(x[,output_names], 2, kurtosis)
    }
    vofv <- vars^2 * (kurts - 1 + 2/(n_points-1))/n_points
    kurt_relevant <- map_dbl(seq_along(kurts), function(x) {
      if(!is.nan(kurts[x]) && (vofv/vars^2)[x] <= 1) kurts[x] else NA
    })
    return(c(x[1, input_names], means, vars, kurt_relevant,
             n_points, use.names = FALSE))
  }))
  collected_df <- setNames(data.frame(collected_stats),
                           c(input_names, paste0(output_names, "mean"),
                             paste0(output_names, "var"),
                             paste0(output_names, "kurt"), "n"))
  collected_df <- data.frame(apply(collected_df, 2, unlist))
  if (length(output_names) == 1)
    collected_df_var <- collected_df[
      !is.na(collected_df[,paste0(output_names, "var")]),]
  else
    collected_df_var <- collected_df[
      apply(collected_df[,paste0(output_names, "var")], 1,
            function(a) !any(is.na(a))),]
  if (verbose) cat("Computed summary statistics...\n") #nocov
  variance_emulators <- map(output_names, function(i) {
    is_high_rep <- !is.na(collected_df_var[,paste0(i,"kurt")])
    all_var <- setNames(
      collected_df_var[,c(input_names, paste0(i, 'var'))], c(input_names, i))
    all_n <- collected_df_var$n
    if (all(is_high_rep)) kurt_ave <- mean(collected_df_var[,paste0(i,'kurt')])
    else if (!any(is_high_rep)) kurt_ave <- 3
    else kurt_ave <- mean(collected_df_var[is_high_rep, paste0(i, 'kurt')])
    # if (verbose) print(paste0(i, " kurtosis:", kurt_ave))
    if (all(is_high_rep) || any(is_high_rep)) {
      var_df <- setNames(
        collected_df_var[is_high_rep, c(input_names, paste0(i, "var"))],
        c(input_names, i))
      npoints <- collected_df_var[is_high_rep, 'n']
      if (sum(is_high_rep) == 1) {
        point <- all_var[is_high_rep,]
        temp_corr <- Correlator$new(hp = list(theta = 0.5))
        variance_em <- HierarchicalEmulator$new(
          basis_f = c(function(x) 1),
          beta = list(mu = c(point[1,i]), sigma = matrix(0, nrow = 1, ncol = 1)),
          u = list(sigma = point[1,i]^2, corr = temp_corr),
          ranges = ranges, out_name = i, verbose = FALSE)
      }
      else {
        variance_em <- emulator_from_data(
          var_df, i, ranges,
          adjusted = FALSE, has.hierarchy = TRUE, verbose = FALSE, ...)[[1]]
      }
    }
    else {
      variance_em <- emulator_from_data(
        all_var, i, ranges,
        adjusted = FALSE, has.hierarchy = TRUE, verbose = FALSE, ...)[[1]]
    }
    if (round(variance_em$u_sigma, 10) <= 0) {
      s_vars <- all_var[is_high_rep, i]
      s_n <- all_n[is_high_rep]
      sig_est <- s_vars^2 * (kurt_ave-1+2/(s_n-1))/s_n
      variance_em$u_sigma <- sqrt(mean(sig_est))
    }
    var_mod <- function(x, n) {
      if (n > 1)
        return((variance_em$get_exp(x)^2 +
                  variance_em$get_cov(x))/n * (kurt_ave - 1 + 2/(n-1)))
      return(0)
    }
    variance_em$s_diag <- var_mod
    variance_em$em_type <- "variance"
    if (all(is_high_rep) || !any(is_high_rep) || sum(!is_high_rep) == 1) {
      variance_em$samples <- all_n
      v_em <- variance_em$adjust(all_var, i)
    }
    else {
      variance_em$samples <- all_n[!is_high_rep]
      v_em <- variance_em$adjust(
        setNames(
          collected_df[!is_high_rep, c(input_names, paste0(i, 'var'))],
          c(input_names, i)), i)
    }
    return(v_em)
  })
  variance_emulators <- setNames(variance_emulators, output_names)
  if (verbose) cat("Completed variance emulators. Training mean emulators...\n") #nocov
  exp_mods <- map(variance_emulators, ~function(x, n) .$get_exp(x)/n)
  exp_data <- setNames(
    collected_df[,c(input_names, paste0(output_names, 'mean'))],
    c(input_names, output_names))
  exp_em <- emulator_from_data(
    exp_data, output_names, ranges, input_names,
    adjusted = FALSE, has.hierarchy = TRUE,
    verbose = verbose, more_verbose = FALSE, ...)
  for (i in seq_along(exp_em)) {
    exp_em[[i]]$s_diag <- exp_mods[[i]]
    exp_em[[i]]$samples <- collected_df$n
  }
  expectation_emulators <- setNames(
    map(
      seq_along(exp_em),
      ~exp_em[[.]]$adjust(exp_data, output_names[[.]])),
    output_names)
  return(list(variance = variance_emulators,
              expectation = expectation_emulators))
}

#' Bimodal Emulation
#'
#' Performs emulation of bimodal outputs and/or systems.
#'
#' This function is deprecated in favour of using \code{\link{emulator_from_data}}
#' with argument \code{emulator_type = "multistate"}. See the associated help file.
#'
#' In many stochastic systems, particularly disease models, the outputs exhibit bimodality - a
#' familiar example is where a disease either takes off or dies out. In these cases, it is not
#' sensible to emulate the outputs based on all realisations, and instead we should emulate each
#' mode separately.
#'
#' This function first tries to identify bimodality. If detected, it determines which of the
#' outputs in the data exhibits the bimodality: to these two separate emulators are trained, one
#' to each mode. The emulators are provided with any data that is relevant to their training; for
#' example, bimodality can exist in some regions of parameter space but not others. Points where
#' bimodality is present have their realisations allocated between the two modes while points
#' where no bimodality exists have their realisations provided to both modes. Targets that do not
#' exhibit bimodality are trained as a normal stochastic output: that is, using the default of
#' \code{\link{variance_emulator_from_data}}.
#'
#' The function also estimates the proportion of realisations in each mode for the set of outputs.
#' This value is also emulated as a deterministic emulator and included in the output.
#'
#' The output of the function is a list, containing three objects: \code{mode1}, \code{mode2}, and
#' \code{prop}. The first two objects have the form produced by \code{variance_emulator_from_data}
#' while \code{prop} has the form of an \code{emulator_from_data} output.
#'
#' @importFrom rlang hash
#' @importFrom cluster daisy fanny
#'
#' @param data The data to train emulators on (as in variance_emulator_from_data)
#' @param output_names The names of the outputs to emulate
#' @param ranges The parameter ranges
#' @param input_names The names of the parameters (by default inferred from \code{ranges})
#' @param verbose Should status updates be provided?
#' @param na.rm Should NA values be removed before training?
#' @param ... Any other parameters to pass to emulator training
#'
#' @return A list \code{(mode1, mode2, prop)} of emulator lists and objects.
#' @export
#'
#' @examples
#'  \donttest{ # Excessive runtime
#'   # Use the stochastic SIR dataset
#'   SIR_ranges <- list(aSI = c(0.1, 0.8), aIR = c(0, 0.5), aSR = c(0, 0.05))
#'   SIR_names <- c("I10", "I25", "I50", "R10", "R25", "R50")
#'   b_ems <- bimodal_emulator_from_data(SIR_stochastic$training, SIR_names, SIR_ranges)
#'  }
#'
bimodal_emulator_from_data <- function(data, output_names, ranges,
                                       input_names = names(ranges),
                                       verbose = interactive(), na.rm = FALSE, ...) {
  .Deprecated(new = "emulator_from_data", msg = "bimodal_emulator_from_data(...) is deprecated in favour of emulator_from_data(..., emulator_type = 'multistate').")
  if (na.rm) input_data <- input_data[apply(
    input_data, 1, function(x) !any(is.na(x))),]
  unique_points <- unique(data[,input_names])
  uids <- apply(unique_points, 1, hash)
  param_sets <- map(uids, function(x) {
    data[apply(data[,input_names], 1, hash) == x,]
  })
  if(verbose) cat("Separated dataset by unique points.\n") #nocov
  proportion <- map_dbl(param_sets, function(x) {
    p_clust <- suppressWarnings(fanny(suppressWarnings(daisy(x[, output_names, drop = FALSE])), k = 2)$clustering)
    return(sum(p_clust == 1)/length(p_clust))
  })
  prop_df <- setNames(
    data.frame(cbind(unique_points, proportion)),
    c(names(unique_points), 'prop'))
  if (verbose) cat("Training emulator to proportion in modes.\n") #nocov
  prop_em <- emulator_from_data(prop_df,
                                c('prop'), ranges, verbose = FALSE, ...)
  if (verbose) cat("Performing clustering to identify modes.\n") #nocov
  has_bimodality <- do.call('rbind.data.frame', map(param_sets, function(x) {
    map_lgl(output_names, function(y) {
      if (length(unique(x[,y])) == 1) return(FALSE)
      clust1 <- suppressWarnings(fanny(suppressWarnings(daisy(x[,y, drop = FALSE])), k = 1))
      clust2 <- suppressWarnings(fanny(suppressWarnings(daisy(x[,y, drop = FALSE])), k = 2))
      if (clust1$objective[["objective"]] < clust2$objective[["objective"]]) return(FALSE)
      return(TRUE)
    })
  })) |> setNames(output_names)
  is_bimodal_target <- apply(
    has_bimodality, 2,
    function(x) sum(x)/length(x) >= 0.1)
  if (!any(is_bimodal_target))
    return(variance_emulator_from_data(
      data, output_names, ranges, verbose = FALSE, ...))
  if (!all(is_bimodal_target)) {
    if (verbose) cat("Training to unimodal targets.\n") #nocov
    non_bimodal <- variance_emulator_from_data(
      data, output_names[!is_bimodal_target], ranges, verbose = FALSE, ...)
  }
  else {
    if (verbose) cat("No targets appear to be unimodal.\n") #nocov
    non_bimodal <- NULL
  }
  if (verbose) cat("Training to bimodal targets.\n") #nocov
  bimodal <- map(output_names[is_bimodal_target], function(x) {
    c1_data <- list()
    c2_data <- list()
    param_bimodal <- has_bimodality[,x]
    for (i in seq_along(param_bimodal)) {
      if (!param_bimodal[[i]]) {
        c1_data[[length(c1_data)+1]] <- param_sets[[i]]
        c2_data[[length(c2_data)+1]] <- param_sets[[i]]
      }
      else {
        this_clust <- suppressWarnings(fanny(suppressWarnings(daisy(param_sets[[i]][,x, drop = FALSE])), k = 2)$clustering)
        c1_data[[length(c1_data)+1]] <- param_sets[[i]][this_clust == 1,]
        c2_data[[length(c2_data)+1]] <- param_sets[[i]][this_clust == 2,]
      }
    }
    mode1_dat <- do.call('rbind', c1_data)
    mode2_dat <- do.call('rbind', c2_data)
    m1em <- tryCatch(
      variance_emulator_from_data(mode1_dat, x, ranges, verbose = FALSE, ...),
      error = function(e) {
        cat("Problem training mode 1 emulator for target ", x, "\n", e, sep = "")
        return(list(expectation = NA, variance = NA))
      }
    )
    m2em <- tryCatch(
      variance_emulator_from_data(mode2_dat, x, ranges, verbose = FALSE, ...),
      error = function(e) {
        cat("Problem training mode 2 emulator for target ", x, "\n", e, sep = "")
        return(list(expectation = NA, variance = NA))
      }
    )
    return(list(m1 = m1em, m2 = m2em))
  })
  bimodals <- setNames(bimodal, output_names[is_bimodal_target])
  if (verbose) cat("Trained emulators. Collating.\n") #nocov
  m1exps <- m2exps <- m1vars <- m2vars <- list()
  for (i in output_names) {
    if (i %in% names(non_bimodal$expectation)) {
      m1exps <- c(m1exps, non_bimodal$expectation[[i]])
      m2exps <- c(m2exps, non_bimodal$expectation[[i]])
      m1vars <- c(m1vars, non_bimodal$variance[[i]])
      m2vars <- c(m2vars, non_bimodal$variance[[i]])
    }
    else {
      m1exps <- c(m1exps, bimodals[[i]]$m1$expectation)
      m2exps <- c(m2exps, bimodals[[i]]$m2$expectation)
      m1vars <- c(m1vars, bimodals[[i]]$m1$variance)
      m2vars <- c(m2vars, bimodals[[i]]$m2$variance)
    }
  }
  names(m1exps) <- names(m1vars) <- output_names
  names(m2exps) <- names(m2vars) <- output_names
  return(list(
    mode1 = list(expectation = m1exps, variance = m1vars),
    mode2 = list(expectation = m2exps, variance = m2vars),
    prop = prop_em[[1]]
  ))
}
Tandethsquire/hmer documentation built on Oct. 25, 2024, 11:09 a.m.