R/diagnostics.R
In hmer: History Matching and Emulation Package

Documented in analyze_diagnostic classification_diag comparison_diag get_diagnostic individual_errors residual_diag space_removal standard_errors summary_diag validation_diagnostics

#' Data Cleaning for Stochastic Emulators
#'
#' Reforms the data to allow for meaningful diagnostics on variance emulators.
#'
#' Takes a data.frame of points (with replicates), and generates a set of
#' meaningful statistics based on them and dependent on the type of emulator.
#'
#' @param data The data to reform
#' @param in_names The names of the input parameters
#' @param out_name The output value
#' @param is.variance Is the emulator a mean or variance emulator?
#' @param boots The number of samples for bootstrapping (if required)
#'
#' @importFrom stats sd
#' @importFrom rlang hash
#'
#' @keywords internal
#' @noRd
#'
#' @return A data.frame consisting of the summary stats (mean, sd, n)
clean_data <- function(data, in_names, out_name,
                       is.variance = FALSE, boots = 1000) {
  unique_points <- unique(data[,in_names])
  uids <- apply(unique_points, 1, hash)
  summary_stats <- map(uids, function(i) {
    relev <- data[apply(data[,in_names], 1, hash) == i, out_name]
    if (!is.variance)
      return(c(mean(relev), sd(relev)^2/length(relev), length(relev)))
    var_mean <- sd(relev)^2
    bootstrap <- map_dbl(1:boots, function(a) {
      repsamp <- sample(relev, length(relev), replace = TRUE)
      sd(repsamp)^2
    })
    return(c(var_mean, sd(bootstrap)^2, length(relev)))
  })
  out_data <- setNames(
    cbind.data.frame(
      unique_points, do.call('rbind', summary_stats)),
    c(in_names, 'mean', 'var', 'n'))
  return(out_data[out_data$n > 1, ])
}

# A function for calculating k-fold cross-validation statistics.
k_fold_measure <- function(em, target = NULL, k = 1, ...) {
  train_data <- setNames(
    cbind.data.frame(
      eval_funcs(
        scale_input, data.frame(em$in_data), em$ranges, FALSE),
      em$out_data), c(names(em$ranges), em$output_name))
  ordering <- seq_len(nrow(train_data))
  if (k > 1) {
    if (nrow(train_data) %% k != 0) k <- 1
    else {
      ordering <- sample(seq_len(nrow(train_data)), nrow(train_data))
      loo_data <- map(1:(nrow(train_data)/k), function(x) {
        relev_em <- em$o_em$clone()
        relev_pts <- ordering[((x-1)*k+1):(k*x)]
        adjust_data <- train_data[-relev_pts,]
        adj_em <- relev_em$adjust(adjust_data, em$output_name)
        outp <- cbind(
          cbind(
            train_data[relev_pts,],
            adj_em$get_exp(train_data[relev_pts, names(em$ranges)])),
          adj_em$get_cov(train_data[relev_pts, names(em$ranges)]))
        if (inherits(em, "Hierarchical"))
          outp <- cbind(
            outp, map_dbl(relev_pts,
                                 ~em$s_diag(train_data[., names(em$ranges)],
                                            1)))
        if (!is.null(target))
          outp <- cbind(
            outp, adj_em$implausibility(train_data[relev_pts, names(em$ranges)],
                                        target))
        return(c(outp, use.names = FALSE))
      })
    }
  }
  if (k == 1) {
    loo_data <- map(seq_len(nrow(train_data)), function(x) {
      relev_em <- em$o_em$clone()
      adjust_data <- train_data[-x,]
      adj_em <- relev_em$adjust(adjust_data, em$output_name)
      outp <- c(train_data[x,], adj_em$get_exp(train_data[x,names(em$ranges)]),
                adj_em$get_cov(train_data[x,names(em$ranges)]))
      if (inherits(em, "Hierarchical"))
        outp <- c(outp, em$s_diag(train_data[x, names(em$ranges)], 1))
      if (!is.null(target))
        outp <- c(outp, adj_em$implausibility(train_data[x, names(em$ranges)],
                                              target))
      return(c(outp, use.names = FALSE))
    })
  }
  res_names <- c(names(em$ranges), em$output_name, "E", "V")
  if (inherits(em, "Hierarchical")) res_names <- c(res_names, "Vest")
  if (!is.null(target)) res_names <- c(res_names, "I")
  out_df <- setNames(
    data.frame(apply(do.call('rbind.data.frame', loo_data), 2, unlist)),
    res_names)
  return(out_df[order(as.numeric(row.names(out_df))),])
}

#' Summary Statistics for Emulators
#'
#' Generates measures for emulator quality
#'
#' A couple of summary statistics can be generated for emulators, based on their
#' prediction errors on a validation set. This function produces the test statistic
#' for a comparison to a relevant chi-squared distribution, and the similar test
#' statistic for an F-distribution. In both cases, the expectation and standard
#' deviation of the underlying distribution are also provided.
#'
#' The output of this function is a logical vector stating whether the derived
#' value lies within 3-sigma of the expected value. In systems where errors are
#' expected to be correlated, higher weight should be given to the Mahalanobis
#' measure than the chi-squared measure. Any anomalous results can be investigated
#' in more depth using the \code{\link{individual_errors}} function.
#'
#' @param emulator The emulator to test
#' @param validation The validation set, consisting of points and output(s)
#' @param verbose Should statistics be printed out?
#'
#' @return Whether the observed value lies within 3-sigma of the expected value.
#'
#' @references Bastos & O'Hagan (2009) <doi:10.1198/TECH.2009.08019>
#' @family diagnostic functions
#' @export
#'
#' @examples
#'  summary_diag(SIREmulators$ems$nR, SIRSample$validation)
summary_diag <- function(emulator, validation, verbose = interactive()) {
  if (inherits(emulator, "EmProto"))
    stop("summary_diag not applicable for Proto_emulator objects.")
  points <- validation[,names(emulator$ranges)]
  outputs <- validation[,emulator$output_name]
  m <- nrow(validation)
  n <- nrow(emulator$in_data)
  q <- length(emulator$basis_f)
  indiv_errs <- outputs - emulator$get_exp(points)
  chi_sq_measure <- sum(indiv_errs^2/emulator$get_cov(points))
  if(verbose) cat("Chi-squared:", round(chi_sq_measure,4), #nocov start
                          "against mean", m,
                          "with standard deviation", round(sqrt(2*m),4), "\n") #nocov end
  cov_mat <- emulator$get_cov(points, full = TRUE)
  cov_inv <- tryCatch(
    chol2inv(chol(cov_mat)),
    error = function(e) {ginv(cov_mat)}
  )
  mahal_measure <- t(indiv_errs) %*% cov_inv %*% indiv_errs
  if (verbose) cat("Mahalanobis:", round(mahal_measure,4), #nocov start
                           "against mean", m, "with standard deviation",
                           round(sqrt(2*m*(m+n-q-2)/(n-q-4)),4), "\n") #nocov end
  chi_valid <- (abs(chi_sq_measure - m)/sqrt(2*m) <= 3)
  mahal_valid <- (abs(mahal_measure - m)/sqrt(2*m*(m+n-q-2)/(n-q-4)) <= 3)
  return(c(chi_valid, mahal_valid))
}

#' Emulator Regression Residuals
#'
#' Plots the emulator residuals.
#'
#' An emulator is composed of two parts: a global regression surface, and a local
#' correlation structure. It can sometimes be informative to examine the residuals
#' of the regression surface on the training set, to determine the extent to which
#' the regression surface is being `corrected for' by the correlation structure.
#'
#' @param emulator The emulator to consider.
#' @param histogram Should a histogram or a scatter plot be shown? Default: FALSE
#' @param ... Any additional arguments (used internally)
#'
#' @return A set of residuals, standardised by the regression surface residual standard error.
#'
#' @family diagnostic functions
#' @export
#'
#' @examples
#' residual_diag(SIREmulators$ems$nS)
#' residual_diag(SIREmulators$ems$nI, TRUE)
#'
residual_diag <- function(emulator, histogram = FALSE, ...) {
  if (inherits(emulator, "EmProto"))
    stop("residual_diag not applicable for Proto_emulator objects.")
  in_points <- eval_funcs(scale_input, data.frame(emulator$in_data),
                          emulator$ranges, FALSE)
  if (is.numeric(emulator$u_sigma))
    standardised_residuals <- emulator$model$residuals/emulator$u_sigma
  else
    standardised_residuals <- emulator$model$residuals/
      apply(in_points, 1, emulator$u_sigma)
  if (histogram) hist(standardised_residuals, xlab = "Standadized Residual", #nocov start
                      ylab = "Frequency",
                      main = paste(emulator$output_name, "Residual Histogram")) #nocov end
  else plot(standardised_residuals, xlab = "Point",
            ylab = "Standardized Residual", pch = 16,
            main = emulator$output_name,
            panel.first = abline(h = c(-3, 0, 3), lty = c(2, 1, 2)))
  return(in_points[abs(standardised_residuals) > 3,])
}

#' Percentage of Space Removed
#'
#' For a wave of emulators, estimates the proportion of space removed at this wave.
#'
#' Given a collection of emulators corresponding to a wave, we can look at an estimate of
#' the proportion of points from previous waves that will be accepted at this wave, either
#' on an emulator-by-emulator basis (to see which outputs are most restrictive) or as an all-wave
#' determination.
#'
#' Naturally, such a statement will be an estimate of the restriction on the full space (which will
#' become more unreliable for higher dimensions), but it can give an order-of-magnitude statement,
#' or useful comparators between different emulators in a wave.
#'
#' If no points are provided, the training points for the emulators are used. For best results, a
#' good number of points should be given: typically one should consider using as many points as one
#' knows to be in the NROY space (including any validation points, if accessible).
#'
#' @param ems The emulators to compute over, as a list
#' @param targets The output target values
#' @param points The points to test against
#' @param ppd If no points are provided and uniform grid is wanted, the number of
#'            points per parameter dimension.
#' @param cutoff The cutoff value for implausibility
#' @param individual If true, gives emulator-by-emulator results; otherwise works with maximum implausibility
#'
#' @return A numeric corresponding to the proportions of points removed.
#'
#' @seealso \code{\link{space_removed}} for a visualisation of the space removal.
#' @export
#'
#' @examples
#'  space_removal(SIREmulators$ems, SIREmulators$targets,
#'   rbind(SIRSample$training, SIRSample$validation))
#'  space_removal(SIREmulators$ems, SIREmulators$targets,
#'   rbind(SIRSample$training, SIRSample$validation), individual = FALSE)
space_removal <- function(ems, targets, points = NULL, ppd = NULL,
                          cutoff = 3, individual = TRUE) {
  if (is.null(points)) {
    if (is.null(ppd)) {
      if (inherits(ems[[1]], "EmProto"))
        stop("Cannot access emulator training points as basis for space removal calculation for proto_ems.")
      input_points <- eval_funcs(scale_input, data.frame(ems[[1]]$in_data),
                                 ems[[1]]$ranges, FALSE)
      output_points <- setNames(do.call('cbind.data.frame',
                                        map(ems, ~.$out_data)),
                                map_chr(ems, ~.$output_name))
      points <- data.frame(cbind(input_points, output_points))
    }
    else {
      maxpoints <- 50000
      ranges <- ems[[1]]$ranges
      if (ppd^length(ranges) > maxpoints) {
        points <- setNames(do.call('cbind.data.frame',
                                   map(ranges, ~runif(maxpoints, .[[1]], .[[2]]))),
                           names(ranges))
      }
      else {
        points <- setNames(
          expand.grid(map(ranges, ~seq(.[[1]], .[[2]], length.out = ppd))),
          names(ranges))
      }
    }
  }
  if (individual) {
    imps <- setNames(
      do.call(
        'cbind.data.frame',
        map(ems, ~.$implausibility(points, targets[[.$output_name]],
                                          cutoff = cutoff))),
      map_chr(ems, ~.$output_name))
    return(1-apply(imps, 2, sum)/nrow(points))
  }
  nth_imps <- nth_implausible(ems, points, targets, cutoff = cutoff)
  return(1-sum(nth_imps)/nrow(points))
}

#' Diagnostic Tests for Emulators
#'
#' Given an emulator, return a diagnostic measure.
#'
#' An emulator's suitability can be checked in a number of ways. This function combines all
#' current diagnostics available in the package, returning a context-dependent data.frame
#' containing the results.
#'
#' Comparison Diagnostics (cd): Given a set of points, the emulator expectation and variance
#' are calculated. This gives a predictive range for the input point, according to the
#' emulator. We compare this against the actual value given by the simulator: points whose
#' emulator prediction is further away from the simulator prediction are to be investigated.
#' This 'distance' is given by \code{stdev}, and an emulator prediction correspondingly
#' should not be further away from the simulator value than stdev*uncertainty.
#'
#' Classification Error (ce): Given a set of targets, the emulator can determine implausibility
#' of a point with respect to the relevant target, accepting or rejecting it as appropriate.
#' We can define a similar `implausibility' function for the simulator: the combination of
#' the two rejection schemes gives four classifications of points. Any point where the
#' emulator would reject the point but the simulator would not should be investigated.
#'
#' Standardized Error (se): The known value at a point, combined with the emulator expectation
#' and uncertainty, can be combined to provide a standardized error for a point. This error
#' should not be too large, in general. but the diagnostic is more useful when looking at
#' a collection of such measures, where systematic bias or over/underconfidence can be seen.
#'
#' Which of the diagnostics is performed can be controlled by the \code{which_diag} argument.
#' If performing classification error diagnostics, a set of targets must be provided; for all
#' diagnostics, a validation (or holdout) set can be provided. If no such set is given, then
#' the emulator diagnostics are performed with respect to its training points, using k-fold
#' cross-validation.
#'
#' @param emulator An object of class Emulator
#' @param targets If desired, the target values for the output(s) of the system
#' @param validation If provided, the emulator is tested against the outputs of these points
#' @param which_diag Which diagnostic measure to use (choosing from cd, ce, se above)
#' @param stdev For `cd', a measure of the allowed distance from prediction and reality
#' @param cleaned Internal for stochastic emulators
#' @param warn Should a warning be shown if ce is chosen and no targets provided?
#' @param kfold Mainly internal: pre-computed k-fold diagnostic results for output
#' @param ... Any other parameters to be passed through to subfunctions.
#'
#' @return A data.frame consisting of the input points, output values, and diagnostic measures.
#' @export
#'
#' @family diagnostic functions
#' @seealso validation_diagnostics
#'
#' @examples
#'  # Use the simple SIR model via SIREmulators
#'  get_diagnostic(SIREmulators$ems$nS, validation = SIRSample$validation)
#'  # Classification error fails without the set of targets
#'  get_diagnostic(SIREmulators$ems$nI, SIREmulators$targets, SIRSample$validation, 'ce')
#'  # No validation set: k-fold cross-validation will be used.
#'  get_diagnostic(SIREmulators$ems$nR, which_diag = 'se')
#'
get_diagnostic <- function(emulator, targets = NULL, validation = NULL,
                           which_diag = 'cd', stdev = 3, cleaned = NULL,
                           warn = TRUE, kfold = NULL, ...) {
  if (is.null(validation) && inherits(emulator, "EmProto"))
    stop("Proto_emulator object requires validation set.")
  if (is.null(targets) && which_diag == 'ce')
    stop("Targets must be provided for classification error diagnostics.")
  if (is.null(validation)) {
    if (is.null(kfold)) {
      if (which_diag == "ce"){
        dat <- k_fold_measure(emulator, targets[[emulator$output_name]], ...)
      }
      else dat <- k_fold_measure(emulator, ...)
    }
    else dat <- kfold
    input_points <- dat[,names(emulator$ranges)]
    output_points <- dat[,emulator$output_name]
    if (which_diag == 'ce')
      em_imp <- dat[,"I"]
    else {
      em_exps <- dat[,"E"]
      em_vars <- dat[,"V"]
      if (inherits(emulator, "Hierarchical"))
        point_vars <- dat[,"Vest"]
    }
  }
  else {
    if (inherits(emulator, "Hierarchical")) {
      if (!is.null(cleaned)) cleaned_data <- cleaned
      else cleaned_data <- clean_data(validation, names(emulator$ranges),
                                      emulator$output_name,
                                      emulator$em_type == "variance")
      input_points <- cleaned_data[,names(emulator$ranges)]
      output_points <- cleaned_data$mean
      if (which_diag == "ce")
        em_imp <- emulator$implausibility(input_points,
                                          targets[[emulator$output_name]])
      else {
        em_exps <- emulator$get_exp(input_points)
        em_vars <- emulator$get_cov(input_points)
        point_vars <- cleaned_data$var/(cleaned_data$n-1)
      }
    }
    else {
      input_points <- validation[,names(emulator$ranges)]
      output_points <- validation[,emulator$output_name]
      if (which_diag == "ce")
        em_imp <- emulator$implausibility(input_points,
                                          targets[[emulator$output_name]])
      else {
        em_exps <- emulator$get_exp(input_points)
        em_vars <- emulator$get_cov(input_points)
      }
    }
  }
  if (which_diag == 'se') {
    num <- em_exps - output_points
    if (inherits(emulator, "Hierarchical"))
      denom <- sqrt(em_vars + point_vars +
                      emulator$disc$internal^2 + emulator$disc$external^2)
    else
      denom <- sqrt(em_vars +
                      emulator$disc$internal^2 + emulator$disc$external^2)
    errors <- num/denom
    out_data <- setNames(
      do.call(
        'cbind.data.frame',
        list(input_points, output_points, errors)),
      c(names(input_points), emulator$output_name, 'error'))
  }
  if (which_diag == 'cd') {
    if (is.null(stdev)) stdev <- 3
    if (inherits(emulator, "Hierarchical"))
      emulator_unc <- stdev * sqrt(em_vars + point_vars +
                                     emulator$disc$internal^2 +
                                     emulator$disc$external^2)
    else
      emulator_unc <- stdev * sqrt(em_vars +
                                     emulator$disc$internal^2 +
                                     emulator$disc$external^2)
    out_data <- setNames(
      do.call(
        'cbind.data.frame',
        list(input_points, output_points, em_exps, emulator_unc)),
      c(names(input_points), emulator$output_name, 'exp', 'unc'))
  }
  if (which_diag == 'ce') {
    this_target <- targets[[emulator$output_name]]
    if (is.atomic(this_target))
      sim_imp <- abs(output_points -
                       rep(mean(this_target), length(output_points)))/
        rep(diff(this_target)/2, length(output_points))
    else
      sim_imp <- map_dbl(output_points,
                                ~sqrt((this_target$val-.)^2/
                                        this_target$sigma^2))
    out_data <- setNames(
      do.call(
        'cbind.data.frame',
        list(input_points, output_points, em_imp, sim_imp)),
      c(names(input_points), emulator$output_name, 'em', 'sim'))
  }
  return(out_data)
}

#' Diagnostic Analysis for Emulators
#'
#' Produces summary and plots for diagnostics
#'
#' Given diagnostic information (almost certainly provided from \code{\link{get_diagnostic}}),
#' we can plot the results and highlight the points that are worthy of concern or further
#' consideration. Each diagnostic available has a plot associated with it which can be produced
#' here:
#'
#' Standardized Error: A histogram of standardized errors. Outliers should be considered, as well
#' as whether very many points have either large or small errors.
#'
#' Comparison Diagnostics: Error bars around points, corresponding to emulator prediction plus or
#' minus emulator uncertainty. A green line indicates where the emulator and simulator prediction
#' would be in complete agreement: error bars that do not overlap with this line (coloured red) are
#' to be considered. Where targets are provided, the colouration is limited only to points where
#' the simulator prediction would be close to the targets.
#'
#' Classification Error: A point plot comparing emulator implausibility to simulator
#' implausibility, sectioned into regions horizontally and vertically by \code{cutoff}. Points
#' that lie in the lower right quadrant (i.e. emulator would reject; simulator would not) should
#' be considered.
#'
#' This function takes a data.frame that contains the input points, simulator values and, depending
#' on the diagnostic, a set of summary measures. It returns a data.frame of any points that failed
#' the diagnostic.
#'
#' We may also superimpose the target bounds on the comparison diagnostics, to get a sense of
#' where it is most important that the emulator and simulator agree. The \code{target_viz}
#' argument controls this, and has three options: 'interval' (a horizontal interval); 'solid'
#' (a solid grey box whose dimensions match the target region in both vertical and horizontal
#' extent); and 'hatched' (similar to solid, but a semi-transparent box with hatching inside).
#' Any such vizualisation has extent equal to the target plus/minus 4.5 times the target
#' uncertainty. By default, \code{target_viz = NULL}, indicating that no superposition is shown.
#'
#' @importFrom graphics abline arrows hist rect
#' @importFrom grDevices rgb
#'
#' @param in_data The data to perform the analysis on
#' @param output_name The name of the output emulated
#' @param targets If required or desired, the targets for the system outputs
#' @param plt Whether or not to plot the analysis
#' @param cutoff The implausibility cutoff for diagnostic `ce'
#' @param target_viz How to show the targets on the diagnostic plots
#' @param ... Any other parameters to pass to subfunctions
#'
#' @return A data.frame of failed points
#'
#' @family diagnostic functions
#'
#' @references Jackson (2018) <http://etheses.dur.ac.uk/12826>
#' @export
#'
#' @seealso \code{\link{get_diagnostic}}
analyze_diagnostic <- function(in_data, output_name, targets = NULL,
                               plt = interactive(), cutoff = 3, target_viz = NULL, ...) {
  if (!is.null(target_viz))
    if (!target_viz %in% c("interval", "solid", "hatched")) target_viz <- NULL
  output_points <- in_data[,output_name]
  input_points <- in_data[, !names(in_data) %in% c(output_name,
                                                   'error', 'em',
                                                   'sim', 'exp', 'unc')]
  if (!is.null(in_data$error)) {
    if (plt) {
      h1 <- hist(in_data$error, plot = FALSE)
      plot(h1, xlab = "Standardised Error", main = output_name)
    }
    emulator_invalid <- abs(in_data$error) > 3
    if (!is.null(targets)) {
      this_target <- targets[[output_name]]
      if (is.atomic(this_target))
        point_invalid <- ((output_points < this_target[1]-diff(this_target)/2) |
                            (output_points > this_target[2]+diff(this_target)/2))
      else
        point_invalid <- ((output_points < this_target$val - 6*this_target$sigma) |
                            (output_points > this_target$val + 6*this_target$sigma))
      if (plt) {
        errors_restricted <- in_data[!point_invalid, 'error']
        h2 <- hist(errors_restricted, breaks = h1$breaks, plot = FALSE)
        plot(h2, add = TRUE, col = 'blue')
      }
      emulator_invalid <- (!point_invalid & emulator_invalid)
    }
  }
  if (!is.null(in_data$exp)) {
    em_ranges <- range(c(in_data$exp + in_data$unc, in_data$exp - in_data$unc))
    emulator_invalid <- (output_points > in_data$exp + in_data$unc) |
      (output_points < in_data$exp - in_data$unc)
    if (!is.null(targets)) {
      this_target <- targets[[output_name]]
      if (is.atomic(this_target)) {
        point_invalid <- ((output_points < this_target[1]-diff(this_target)/2) |
                            (output_points > this_target[2]+diff(this_target)/2))
        panlims <- c(this_target[1]-diff(this_target)/4,
                   this_target[2]+diff(this_target)/4)
      }
      else {
        point_invalid <- ((output_points < this_target$val - 6*this_target$sigma) |
                            (output_points > this_target$val + 6*this_target$sigma))
        panlims <- c(this_target$val - 4.5*this_target$sigma,
                     this_target$val + 4.5*this_target$sigma)
      }
      emulator_invalid <- (!point_invalid & emulator_invalid)
    }
    else panlims <- NULL
    if (plt) {
      plot(output_points, in_data$exp, pch = 16, asp = list(...)[["asp"]],
           col = ifelse(emulator_invalid, 'red', 'black'),
           xlim = range(output_points), ylim = range(em_ranges),
           xlab = 'f(x)', ylab = 'E[f(x)]',
           panel.first = c(
             if (!is.null(target_viz)) {
               if (target_viz == "interval")
                 arrows(x0 = panlims[1], x1 = panlims[2],
                        y0 = min(in_data$exp)+0.05*diff(range(in_data$exp)),
                        y1 = min(in_data$exp)+0.05*diff(range(in_data$exp)),
                        length = 0.05, code = 3, angle = 90)
               else if (target_viz == "solid")
                 rect(xleft = panlims[1], xright = panlims[2],
                      ybottom = panlims[1], ytop = panlims[2],
                      col = rgb(40, 40, 40, 51, maxColorValue = 255))
               else
                 rect(xleft = panlims[1], xright = panlims[2],
                      ybottom = panlims[1], ytop = panlims[2],
                      col = rgb(40, 40, 40, 102, maxColorValue = 255),
                      density = 15, angle = 135)
             }
             else NULL,
               abline(a = 0, b = 1, col = 'green')),
           main = output_name)
      for (i in seq_along(input_points[,1])) {
        if (in_data$unc[[i]] < 1e-6) next
        suppressWarnings(arrows(x0 = output_points[[i]],
               y0 = in_data$exp[[i]] - in_data$unc[[i]],
               x1 = output_points[[i]],
               y1 = in_data$exp[[i]] + in_data$unc[[i]],
               col = ifelse(emulator_invalid[[i]], 'red', 'blue'),
               code = 3, angle = 90, length = 0.05))
      }
    }
  }
  if (!is.null(in_data$em)) {
    if (is.null(targets)) stop("Require target to check classification error.")
    if (is.null(cutoff)) cutoff <- 3
    this_target <- targets[[output_name]]
    t_cutoff <- if (is.atomic(this_target)) 1 * cutoff/3 else cutoff
    emulator_invalid <- (in_data$em > cutoff) & (in_data$sim <= t_cutoff)
    if (plt) {
      plot(in_data$em, in_data$sim, pch = 16,
           col = ifelse(emulator_invalid, 'red', 'black'),
           xlab = "Emulator Implausibility", ylab = "Simulator Implausibility",
           main = output_name,
           panel.first = c(abline(h = t_cutoff), abline(v = cutoff)))
    }
  }
  return(input_points[emulator_invalid,])
}

#' Emulator Diagnostics
#'
#' Performs the standard set of validation diagnostics on emulators.
#'
#' All the diagnostics here can be performed with or without a validation (or `holdout')
#' set of data. The presence of a set of targets is optional for some checks but
#' mandatory for others: the appropriate warnings will be given in the event that some
#' checks cannot be applied.
#'
#' The current options for diagnostics (with the codes for \code{which_diag}) are:
#'
#'   Standardised Errors (se)
#'
#'   Comparison Diagnostics (cd)
#'
#'   Classification Errors (ce)
#'
#'   All of the above (all)
#'
#' For details of each of the tests, see the help file for
#' \code{\link{get_diagnostic}}.
#'
#' @importFrom graphics par
#'
#' @param emulators A list of \code{\link{Emulator}} objects.
#' @param targets The list of observations for the outputs
#' @param validation The validation set, containing all inputs and outputs.
#' @param which_diag Which diagnostics should be performed (see description)
#' @param analyze Should plotting and/or failing points be returned?
#' @param diagnose For bimodal systems, should the expectation or variance be considered?
#' @param ... Any additional parameters to pass to the diagnostics (eg sd, cutoff, ...)
#'
#' @return A data.frame containing points that failed one or more diagnostic tests.
#'
#' @family diagnostic functions
#' @export
#'
#' @examples
#' validation_diagnostics(SIREmulators$ems, SIREmulators$targets, SIRSample$validation)
#' # data.frame of failed points (empty) and a 3x3 set of plots
#' validation_diagnostics(SIREmulators$ems, SIREmulators$targets, SIRSample$validation,
#'  c('ce','cd'))
#' # empty data.frame and a 3x2 set of plots
#' validation_diagnostics(SIREmulators$ems, SIREmulators$targets, SIRSample$validation,
#'  cutoff = 2, sd = 2)
#' # k-fold (with k = 3)
#' validation_diagnostics(SIREmulators$ems, SIREmulators$targets, k = 3)
validation_diagnostics <- function(emulators, targets = NULL,
                                   validation = NULL,
                                   which_diag = c('cd', 'ce', 'se'),
                                   analyze = TRUE,
                                   diagnose = "expectation", ...) {
  oldpar <- par(no.readonly = TRUE)
  on.exit(par(oldpar), add = TRUE)
  if (inherits(emulators, "Emulator"))
    emulators <- setNames(list(emulators), emulators$output_name)
  if ("EmProto" %in% unlist(map(emulators, class), use.names = FALSE) && is.null(validation))
    stop("Proto_emulator objects require a validation set.")
  if (length(which_diag) == 1 && which_diag == 'all') actual_diag <- c('cd', 'ce', 'se')
  else {
    actual_diag <- which_diag[which_diag %in% c('cd', 'ce', 'se')]
    if (length(actual_diag) != length(which_diag))
      warning(paste("Unrecognised diagnostics:",
                    paste0(which_diag[!which_diag %in% actual_diag],
                           collapse = ","),
                    "\n\tValid diagnostics labels are 'cd', 'se', 'ce', or 'all'."))
  }
  actual_diag <- unique(actual_diag)
  if (('ce' %in% actual_diag) && is.null(targets)) {
    warning("No targets provided; cannot perform classification diagnostics.")
    actual_diag <- actual_diag[-which(actual_diag == 'ce')]
  }
  if (!is.null(validation))
    validation <- validation[apply(validation[,names(targets),drop=FALSE], 1, function(x) all(!is.na(x))),]
  if (!is.null(emulators$mode1) && !is.null(emulators$mode2)) {
    if (diagnose == "variance") {
      m1_ems <- emulators$mode1$variance
      m2_ems <- emulators$mode2$variance
    }
    else {
      m1_ems <- emulators$mode1$expectation
      m2_ems <- emulators$mode2$expectation
    }
    if (!is.null(validation)) {
      v_outputs <- validation[,
                              unique(c(map_chr(m1_ems, ~.$output_name),
                                       map_chr(m2_ems, ~.$output_name))), drop = FALSE]
      v_class <- fanny(suppressWarnings(daisy(v_outputs)), k = 2)$clustering
      #v_class <- Mclust(v_outputs, G = 1:2, verbose = FALSE)$classification
      valid_one <- validation[v_class == 1,]
      valid_two <- validation[v_class == 2,]
      cleaning_dat1 <- map(m1_ems, function(x) {
        if (inherits(x, "Hierarchical"))
          valid_dat <- clean_data(valid_one,
                                  names(x$ranges),
                                  x$output_name, x$em_type == "variance")
        else valid_dat <- NULL
        valid_dat
      })
      cleaning_dat2 <- map(m1_ems, function(x) {
        if (inherits(x, "Hierarchical"))
          valid_dat <- clean_data(valid_two,
                                  names(x$ranges),
                                  x$output_name, x$em_type == "variance")
        else valid_dat <- NULL
        valid_dat
      })
      match_modes <- map_dbl(m1_ems, function(x) {
        outputs1 <- c(x$out_data, cleaning_dat1[[x$output_name]]$mean)
        outputs2 <- c(x$out_data, cleaning_dat2[[x$output_name]]$mean)
        if (sd(outputs1) < sd(outputs2)) return(1)
        return(2)
      })
      cluster1 <- map(
        seq_along(m1_ems),
        ~list(emulator = m1_ems[[.]],
              validation = list(valid_one, valid_two)[[match_modes[.]]]))
      cluster2 <- map(
        seq_along(m2_ems),
        ~list(emulator = m2_ems[[.]],
              validation = list(valid_two, valid_one)[[match_modes[.]]]))
      res_one <- setNames(unlist(map(cluster1, function(x) {
        suppressWarnings(
          validation_diagnostics(x$emulator, targets, x$validation,
                                 which_diag, analyze = FALSE, ...))
      }), recursive = FALSE), map_chr(cluster1, ~.$emulator$output_name))
      res_two <- setNames(unlist(map(cluster2, function(x) {
        suppressWarnings(
          validation_diagnostics(x$emulator, targets, x$validation,
                                 which_diag, analyze = FALSE, ...))
      }), recursive = FALSE), map_chr(cluster2, ~.$emulator$output_name))
    }
    else {
      res_one <- setNames(
        unlist(
          map(
            m1_ems,
            ~suppressWarnings(
              validation_diagnostics(., targets, validation,
                                     which_diag, analyze = FALSE, ...))),
          recursive = FALSE), map_chr(m1_ems, ~.$output_name))
      res_two <- setNames(
        unlist(
          map(
            m2_ems,
            ~suppressWarnings(
              validation_diagnostics(., targets, validation,
                                     which_diag, analyze = FALSE, ...))),
          recursive = FALSE), map_chr(m2_ems, ~.$output_name))
    }
    diag_results <- setNames(
      map(unique(c(names(res_one), names(res_two))), function(x) {
      if (!is.null(res_one[[x]]) && !is.null(res_two[[x]]))
        return(map(seq_along(res_one[[x]]), ~rbind(res_one[[x]][[.]],
                                                          res_two[[x]][[.]])))
      if (is.null(res_one[[x]])) return(res_two[[x]])
      return(res_one[[x]])
    }), unique(c(names(res_one), names(res_two))))
  }
  else {
    if (!is.null(emulators$expectation)) {
      emulators <- emulators[[ifelse(diagnose == "variance", "variance", "expectation")]]
    }
    if (!is.null(validation)) {
      cleaning_dat <- map(emulators, function(x) {
        if (inherits(x, "Hierarchical"))
          valid_dat <- clean_data(validation, names(x$ranges),
                                  x$output_name, x$em_type == "variance")
        else valid_dat <- NULL
        valid_dat
      })
      kfolded <- NULL
    }
    else {
      cleaning_dat <- NULL
      kfolded <- setNames(
        map(
          emulators,
          function(x) k_fold_measure(x, targets[[x$output_name]], ...)),
        map_chr(emulators, ~.$output_name))
    }
    diag_results <- map(
      emulators,
      function(x) map(actual_diag, function(y)
        get_diagnostic(x, targets, validation, y,
                       cleaned = cleaning_dat[[x$output_name]],
                       kfold = kfolded[[x$output_name]], ...)))
  }
  if (!analyze) return(diag_results)
  fail_point_list <- list()
  mf <- length(actual_diag)
  n_row <- list(...)[['row']]
  if (!is.null(n_row)) op <- par(mfrow = c(n_row, mf))
  else op <- par(mfrow = c(3, mf))
  for (i in seq_along(diag_results)) {
    for (j in seq_along(diag_results[[i]])) {
      fail_point_list[[length(fail_point_list)+1]] <- analyze_diagnostic(
        diag_results[[i]][[j]], names(diag_results)[[i]], targets, ...)
    }
  }
  failed_points <- unique(do.call('rbind', fail_point_list))
  return(failed_points)
}


#' Predictive Error Plots
#'
#' Plots the predictive error with respect to a variety of quantities.
#'
#' The choice of errors to plot is controlled by \code{errtype}, and can be one
#' of four things: normal, corresponding to the regular standardised errors; eigen,
#' corresponding to the errors after reordering given by the eigendecomposition
#' of the emulator covariance matrix; chol, similarly deriving errors after Cholesky
#' decomposition; and cholpivot, deriving the errors after pivoted Cholesky decomposition.
#'
#' What the errors are plotted with respect to is controlled by \code{xtype}. The options
#' are index, which plots them in their order in the validation set; em, which plots errors
#' with respect to the emulator prediction at that point; and any named parameter of the
#' model, which plots with respect to the values of that parameter.
#'
#' Finally, the plot type is controlled by \code{plottype}: this can be one of normal,
#' which plots the errors; or qq, which produces a Q-Q plot of the errors.
#'
#' The default output is to plot the standardised errors (with no decomposition)
#' against the ordering in the validation set; i.e. \code{errtype = "normal"},
#' \code{xtype = "index"}, \code{plottype = "normal"}.
#'
#' Some combinations are not permitted, as the output would not be meaningful. Errors
#' arising from an eigendecomposition cannot be plotted against either emulator prediction
#' or a particular parameter (due to the transformation induced by the eigendecomposition);
#' Q-Q plots are not plotted for a non-decomposed set of errors, as the correlation
#' between errors makes it much harder to interpret.
#'
#' @importFrom stats qqnorm qqline
#'
#' @param em The emulator to perform diagnostics on
#' @param validation The validation set of points with output(s)
#' @param errtype The type of individual error to be plotted.
#' @param xtype The value to plot against
#' @param plottype Whether to plot a standard or Q-Q plot.
#'
#' @return The relevant plot.
#'
#' @references Bastos & O'Hagan (2009) <doi:10.1198/TECH.2009.08019>
#'
#' @family diagnostic functions
#' @export
#'
#' @examples
#' i1 <- individual_errors(SIREmulators$ems$nS, SIRSample$validation)
#' i2 <- individual_errors(SIREmulators$ems$nS, SIRSample$validation, "chol", "em")
#' i3 <- individual_errors(SIREmulators$ems$nS, SIRSample$validation, "eigen", plottype = "qq")
#' i4 <- individual_errors(SIREmulators$ems$nS, SIRSample$validation, "cholpivot", xtype = "aSI")
#'
individual_errors <- function(em, validation, errtype = "normal",
                              xtype = "index", plottype = "normal") {
  if (inherits(em, "EmProto"))
    stop("individual_errors not applicable for Proto_emulator objects.")
  if (!errtype %in% c("normal", "eigen", "chol", "cholpivot"))
    stop(paste("Error type not recognised",
    "(options are normal, eigen, chol, or cholpivot)."))
  if (!xtype %in% c("index", "em", names(em$ranges)))
    stop(paste("x-axis measure not recognised",
    "(options are index, em, or a parameter name."))
  if (!plottype %in% c("normal", "qq"))
    stop("Plot type not recognised (options are normal or qq).")
  if (plottype == "qq" && errtype == "normal") {
    warning(
      paste("Not meaningful to create Q-Q plot with untransformed errors.",
            "Changing to pivoted Cholesky."))
    errtype <- "cholpivot"
  }
  if (xtype %in% c(names(em$ranges), 'em') && errtype == "eigen") {
    warning(paste(
      "Not meaningful to plot parameter or emulator prediction",
      "against eigendecomposed errors. Changing to pivoted Cholesky."))
    errtype <- "cholpivot"
  }
  points <- validation[,names(em$ranges)]
  outputs <- validation[,em$output_name]
  em_pred <- em$get_exp(points)
  em_cov <- em$get_cov(points, full = TRUE)
  ## Ensure positive-definiteness
  em_cov_struct <- eigen(em_cov)
  em_cov_eval <- em_cov_struct$values
  em_cov_eval[em_cov_eval < 0] <- 1e-6
  em_cov <- em_cov_struct$vectors %*% diag(em_cov_eval, nrow = length(em_cov_eval)) %*% t(em_cov_struct$vectors)
  if (errtype == "normal")
    indiv_errors <- (outputs - em_pred)/sqrt(diag(em_cov))
  else {
      if (errtype == "eigen")
        G <- eigen(em_cov)$vectors %*% diag(sqrt(eigen(em_cov)$values))
      else if (errtype == "chol")
        G <- t(chol(em_cov))
      else {
        Q <- chol(em_cov, pivot = TRUE)
        P <- diag(1, nrow = nrow(Q))[,attr(Q, 'pivot')]
        G <- P %*% t(Q)
      }
      G_inv <- tryCatch(
        chol2inv(chol(G)),
        error = function(e) {ginv(G)}
      )
      indiv_errors <- G_inv %*% (outputs - em_pred)
  }
  if (xtype == "index") x_vals <- seq_along(outputs)
  else if (xtype == "em") x_vals <- em_pred
  else x_vals <- validation[,xtype]
  if (plottype == "normal") {
    x_lab <- switch(xtype, "index" = "Index",
                    "em" = "Emulator Prediction", xtype)
    appended <- switch(errtype, "normal" = "",
                       "eigen" = " (eigendecomposition)",
                       "chol" = " (Choleksy decomposition)",
                       "cholpivot" = " (pivoted Cholesky decomposition)")
    plot(x_vals, indiv_errors, pch = 16, xlab = x_lab, ylab = "Error",
         main = paste0("Errors against ", x_lab, appended),
         panel.first = abline(h = c(-2,2), lty = 2))
  }
  if (plottype == "qq") {
    appended <- switch(errtype, "eigen" = " (eigendecomposition)",
                       "chol" = " (Choleksy decomposition)",
                       "cholpivot" = " (pivoted Cholesky decomposition)")
    qqnorm(indiv_errors, pch = 16,
           main = paste0("Q-Q plot for Errors", appended),
           panel.first = qqline(indiv_errors, col = "steelblue"))
  }
  output <- data.frame(variable = x_vals, error = indiv_errors)
  return(output)
}

#' Classification Diagnostics
#'
#' Shorthand function for diagnostic test `ce'.
#'
#' For details of the function, see \code{\link{get_diagnostic}} and for the plot
#' see \code{\link{analyze_diagnostic}}.
#'
#' @param emulator The emulator in question
#' @param targets The output targets
#' @param validation The validation set
#' @param cutoff The implausibility cutoff
#' @param plt Whether to plot or not
#'
#' @return A data.frame of failed points
#'
#' @family diagnostic functions
#' @export
#'
#' @references Jackson (2018) <http://etheses.dur.ac.uk/12826>
#'
#' @seealso \code{\link{get_diagnostic}}, \code{\link{analyze_diagnostic}},
#'  \code{\link{validation_diagnostics}}
#'
classification_diag <- function(emulator, targets, validation, cutoff = 3,
                                plt = interactive()) {
  analyze_diagnostic(get_diagnostic(emulator, targets, validation, 'ce'),
                     emulator$output_name, targets, plt, cutoff)
}

#' Comparison Diagnostics
#'
#' Shorthand function for diagnostic test `cd'.
#'
#' For details of the function, see \code{\link{get_diagnostic}} and for the plot
#' see \code{\link{analyze_diagnostic}}.
#'
#' @param emulator The emulator in question
#' @param targets The output targets
#' @param validation The validation set
#' @param sd The range of uncertainty allowed
#' @param plt Whether to plot or not
#'
#' @return A data.frame of failed points
#'
#' @family diagnostic functions
#' @export
#'
#' @references Jackson (2018) <http://etheses.dur.ac.uk/12826>
#'
#' @seealso \code{\link{get_diagnostic}}, \code{\link{analyze_diagnostic}},
#'  \code{\link{validation_diagnostics}}
#'
comparison_diag <- function(emulator, targets, validation, sd = 3,
                            plt = interactive()) {
  analyze_diagnostic(get_diagnostic(emulator, targets, validation, 'cd', sd),
                     emulator$output_name, targets, plt)
}

#' Standardized Error Diagnostics
#'
#' Shorthand function for diagnostic test `se'.
#'
#' For details of the function, see \code{\link{get_diagnostic}} and for the plot
#' see \code{\link{analyze_diagnostic}}.
#'
#' @param emulator The emulator in question
#' @param targets The output targets
#' @param validation The validation set
#' @param plt Whether to plot or not
#'
#' @return A data.frame of failed points
#'
#' @family diagnostic functions
#' @export
#'
#' @references Jackson (2018) <http://etheses.dur.ac.uk/12826>
#'
#' @seealso \code{\link{get_diagnostic}}, \code{\link{analyze_diagnostic}},
#'  \code{\link{validation_diagnostics}}
#'
standard_errors <- function(emulator, targets = NULL, validation = NULL,
                            plt = interactive()) {
  analyze_diagnostic(get_diagnostic(emulator, targets, validation, 'se'),
                     emulator$output_name, targets, plt)
}

#' Binomial Comparison Diagnostic Test
#'
#' Checks whether the number of failed points at different sds agrees with expectation.
#'
#' We do not expect every validation point to be perfectly accurately classified.
#' For a given number of standard deviations, Pukelsheim's rule gives an expectation
#' of the proportion of points lying outside the predictive bounds; we compute the
#' observed number of such points for each emulator at a collection of sigma values
#' and ensure that the error rate is aligned with these bounds, up to uncertainty
#' induced by the size of the validation set.
#'
#' @param ems The emulators in question
#' @param validation A list of validation points of size equal to \code{length(ems)}
#' @param n The multiples of sigma to test at
#' @param df.out Whether to return a single diagnostic TRUE/FALSE, or a detailed data.frame
#' @param allow.zero Allow for over-precise (potentially overfitted) emulators?
#'
#' @return Either a boolean, or a data.frame thereof.
#'
#' @noRd
#' @keywords internal
binomial_diagnostic_test <- function(ems, validation, n = 2:4, df.out = FALSE, allow.zero = FALSE, ...) {
  N <- map_dbl(validation, nrow)
  Xn <- map_dbl(n, ~4/(9*.^2))
  diag_check <- map(seq_along(ems), function(i) {
    diag_percents <- map_dbl(n, ~nrow(comparison_diag(ems[[i]], targets = NULL, validation = validation[[i]], sd = ., plt = FALSE)))
    if (allow.zero)
      bounds <- map(Xn, ~c(0, min(N[[i]], N[[i]]*. + 3*sqrt(N*.*(1-.)))))
    else
      bounds <- map(Xn, ~c(max(0, N[[i]]*. - 3*sqrt(N[[i]]*.*(1-.))), min(N[[i]], N[[i]]*. + 3*sqrt(N*.*(1-.)))))
    map_lgl(seq_along(diag_percents), ~diag_percents[.] >= bounds[[.]][1] && diag_percents[.] <= bounds[[.]][2])
  })
  collect_df <- do.call('cbind.data.frame', diag_check) |> setNames(map_chr(ems, "output_name"))
  if (df.out) return(collect_df)
  return(all(apply(collect_df, 2, all)))
}
#' All-emulator Mitigated Diagnostics
#'
#' Check for flagged points conditioned on overall point suitability
#'
#' A failing point may not be considered problematic if, ultimately, other emulators
#' would justifiability rule this point out as implausible. For each point that fails
#' diagnostics for one or more emulators, we compute the nth-maximum implausibility at
#' that point with respect to the remaining emulators. If the point would not be ruled
#' out as implausible by the residual emulators, we consider this point to have failed
#' diagnostics; else we dismiss it as unimportant.
#'
#' @importFrom rlang hash
#'
#' @param ems The list of emulators to consider
#' @param targets The output targets
#' @param validation A list of validation sets of same size as \code{length(ems)}
#' @param test Either 'cd' (comparison) or 'ce' (classification) diagnostics
#'
#' @return Either a data.frame or list thereof containing failed points.
#'
#' @noRd
#' @keywords internal
points_of_concern <- function(ems, targets, validation, test = 'cd', ...) {
  if (test == 'cd')
    invalid_pts_by_em <- map(seq_along(ems), ~comparison_diag(ems[[.]], targets, validation[[.]], plt = FALSE))
  else
    invalid_pts_by_em <- map(seq_along(ems), ~classification_diag(ems[[.]], targets, validation[[.]], plt = FALSE))
  all_fail_pts <- do.call('rbind.data.frame', invalid_pts_by_em)
  if (nrow(all_fail_pts) == 0)
    return(all_fail_pts)
  all_uids <- apply(all_fail_pts[,names(ems[[1]]$ranges)], 1, hash)
  all_fail_pts <- all_fail_pts[which(!duplicated(all_uids)),]
  uids <- all_uids[which(!duplicated(all_uids))]
  uids_by_em <- map(invalid_pts_by_em, function(df) apply(df[,names(ems[[1]]$ranges)], 1, hash))
  pt_em_corr <- map(uids, function(u) {
    which(map_lgl(uids_by_em, ~u %in% .))
  })
  points_dfs <- map(unique(pt_em_corr), function(upec) {
    logicals <- map_lgl(pt_em_corr, function(pec) {
      if (length(pec) != length(upec)) return(FALSE)
      return(all(pec == upec))
    })
    all_fail_pts[which(logicals),]
  })
  points_dfs_em_idents <- unique(pt_em_corr)
  actual_points <- map(seq_along(points_dfs), function(i) {
    if (length(points_dfs_em_idents[[i]]) == length(ems)) return(points_dfs[[i]])
    points_dfs[[i]][nth_implausible(ems[-points_dfs_em_idents[[i]]], points_dfs[[i]], targets, n = 1) < 3,]
  })
  actual_point_df <- do.call('rbind.data.frame', actual_points)
  if (nrow(actual_point_df) == 0) return(actual_point_df)
  validation_collect <- do.call('rbind.data.frame', validation)
  full_points <- do.call('rbind.data.frame', map(apply(actual_point_df[,names(ems[[1]]$ranges)], 1, hash), function(h) {
    validation_collect[which(apply(validation_collect[,names(ems[[1]]$ranges)], 1, hash) == h)[1],]
  }))
  return(full_points)
}
#' Structured Error Checking
#'
#' Check for relationships between flagged points and position in input/output space
#'
#' We would expect any points that flag diagnostic checks to be spread across the space;
#' if we instead have a cluster of failed points in one part of space we may think that
#' there is a problem with the regression surface of an emulator. Points that have
#' been flagged by diagnostics are collected for each emulator, and a linear model is
#' fitted to the categorical pass/fail property of the points against either the
#' point's output value or position in input space. If the model suggests a strong
#' relationship between the pass/failure and the position in space, this is flagged.
#' The adjusted R-squared of the linear model is considered a proxy for 'strong
#' relationship'.
#'
#' @param ems The emulators to test
#' @param validation A list of validation points, with size equal to \code{length(ems)}
#' @param compare_output Whether to compare against output value or input position
#' @param threshhold What level of fit do we consider problematic?
#'
#' @return A vector of booleans, one for each emulator.
#'
#' @noRd
#' @keywords internal
structured_error_check <- function(ems, validation, compare_output = TRUE, threshhold = 0.3, ...) {
  for (i in seq_along(validation))
    row.names(validation[[i]]) <- 1:nrow(validation[[i]])
  failed_points <- map(seq_along(ems), ~comparison_diag(ems[[.]], targets = NULL, validation = validation[[.]], plt = FALSE))
  em_fail_indices <- map(seq_along(failed_points), ~row.names(validation[[.]]) %in% row.names(failed_points[[.]]))
  if (compare_output) {
    em_preds <- map(seq_along(ems), ~ems[[.]]$get_exp(validation[[.]]))
    fail_dfs <- map(seq_along(em_preds), ~cbind.data.frame(em_preds[[.]], em_fail_indices[[.]]) |> setNames(c('pred', 'fail')))
    fail_lms <- map(fail_dfs, ~lm(data = ., fail ~ pred))
  }
  else {
    fail_dfs <- map(seq_along(em_fail_indices), ~cbind.data.frame(validation[[.]], em_fail_indices[[.]]) |> setNames(c(names(validation[[.]]), "fail")))
    fail_lms <- map(fail_dfs, ~lm(data = ., as.formula(paste0("fail ~ ", paste0(names(ems[[1]]$ranges), collapse = "+")))))
  }
  map_lgl(fail_lms, ~!is.nan(summary(.)$adj.r.squared) && summary(.)$adj.r.squared > threshhold)
}


#' Automated Diagnostics and Modifications
#'
#' Perform a set of diagnostics on emulators, changing them if needed.
#'
#' NOTE: Automated diagnostics are currently only supported for deterministic
#' emulators.
#'
#' There are a number of different characteristics that emulators might possess
#' that give rise to diagnostic flags. This function collects together some of those
#' whose resulting modifications can be automated. The tests, and consequences, are
#' as follows.
#'
#' \describe{
#'  \item{Structured Input Space}{Looks for errors with dependence on input parameters. If
#'   found, the emulator's correlation length is reduced (to a minimum of 1/3);}
#'  \item{Structured Output Space}{Looks for errors with dependence on output value. If
#'   found, the training and validation data is resampled and emulators are retrained,
#'   to try to incorporate/remove high leverage points;}
#'  \item{Misclassification}{Checks agreement between emulator and simulator implausibility
#'   classification. If they do not match, emulator uncertainty is inflated;}
#'  \item{Comparison}{Checks that the emulator predictions agree with the simulator
#'   predictions at the validation points, allowing for expected margin of error.}
#' }
#'
#' If the automated modifications are not sufficient to remove problems, then
#' offending emulators are removed from the set under consideration. Emulators in
#' this category should be carefully considered and their outputs analyzed: they may
#' require manual determination of the regression surface or additional training
#' points in the neightbourhood of the problematic inputs.
#'
#' The validation set can also be checked for suitability independent of emulator
#' structure: if \code{check_output = TRUE} then the outputs of the validation set
#' will be compared against targets, as well as checking the implausibility of the
#' points with respect to the emulators. If any outputs are consistent under- or
#' over-estimates, or if all points are to be ruled out as implausible, the emulators
#' corresponding to these outputs are removed. This option should be used with care:
#' such a situation could be informative for considering the model structure and
#' whether one should expect a match to observational data.
#'
#' @param ems The emulators to consider
#' @param targets The output targets to compare implausibility against
#' @param validation The set of validation points (either a single data.frame or one per emulator)
#' @param verbose Whether messages should be printed while running
#' @param check_output Whether to check for suitability of outputs re. targets
#' @param ... Other arguments to pass to helper functions
#'
#' @return A collection of modified emulators, potentially a subset of the original collection
#'
#' @export
#'
#' @examples
#'  new_ems <- diagnostic_pass(SIREmulators$ems, SIREmulators$targets, SIRSample$validation)
diagnostic_pass <- function(ems, targets, validation, check_output = FALSE, verbose = interactive(), ...) {
  if (!is.null(ems$variance) || !is.null(ems$expectation) || !is.null(ems$mode1) || !is.null(ems$mode2))
    stop("Automated diagnostics not supported for stochastic or multistate emulators.")
  original_outputs <- map_chr(ems, "output_name")
  if (is.data.frame(validation)) {
    validation <- validation[apply(validation, 1, function(x) !any(is.na(x))),]
    validation_per_em <- map(ems, ~validation)
  }
  else {
    validation_per_em <- map(validation, function(v) {
      v[apply(v, 1, function(x) !any(is.na(x))),]
    })
  }
  if (check_output) {
    if (verbose) cat("Checking consistent implausibility or over/underestimation of outputs..\n") #nocov
    imps <- map(seq_along(ems), function(i) {
      training_runs <- eval_funcs(scale_input, ems[[i]]$input_data, ems[[i]]$ranges, FALSE)
      ems[[i]]$implausibility(rbind.data.frame(validation_per_em[[i]][,names(ems[[i]]$ranges)], training_runs), targets[[ems[[i]]$output_name]], cutoff = 3)
    })
    fail_imp <- map_lgl(imps, ~sum(.) == 0)
    all_pts <- do.call('rbind.data.frame', validation_per_em)
    consistent_under <- map_lgl(map_chr(ems, "output_name"), function(tn) {
      if (is.atomic(targets[[tn]])) check_val <- targets[[tn]][1]
      else check_val <- targets[[tn]]$val - 3*targets[[tn]]$sigma
      trained_outputs <- ems[[tn]]$out_data
      return(all(c(all_pts[,tn], trained_outputs) < check_val))
    })
    consistent_over <- map_lgl(map_chr(ems, "output_name"), function(tn) {
      if (is.atomic(targets[[tn]])) check_val <- targets[[tn]][2]
      else check_val <- targets[[tn]]$val + 3*targets[[tn]]$sigma
      trained_outputs <- ems[[tn]]$out_data
      return(all(c(all_pts[,tn], trained_outputs) > check_val))
    })
    failed <- fail_imp | consistent_under | consistent_over
    if (any(failed) && verbose) cat(paste("Some outputs unsuitable for targets:", #nocov
                               paste0(map_chr(ems, "output_name")[failed], collapse = "; "), "\n")) #nocov
    ems <- subset_emulators(ems, map_chr(ems, "output_name")[!failed])
    validation_per_em <- validation_per_em[!failed]
    if (length(ems) == 0) {
      if (verbose) cat("No emulators deemed suitable.\n") #nocov
      return(ems)
    }
  }
  if (verbose) cat("Checking for structured errors in input space..\n") #nocov
  input_structured <- structured_error_check(ems, validation_per_em, FALSE, ...)
  if (any(input_structured) && verbose)
    cat("Structured input errors detected. Reducing correlation length.\n") #nocov
  while(any(input_structured & map_lgl(ems, ~.$corr$hyper_p$theta > 1/3))) {
    ems <- map(seq_along(ems), function(i) {
      if (input_structured[i]) {
        hyper_p_spec <- ems[[i]]$corr$hyper_p
        hyper_p_spec$theta <- hyper_p_spec$theta * 0.9
        if (hyper_p_spec$theta < 1/3) hyper_p_spec$theta <- 1/3
        return(ems[[i]]$set_hyperparams(hyper_p_spec))
      }
      else return(ems[[i]])
    }) |> setNames(map_chr(ems, "output_name"))
    input_structured <- structured_error_check(ems, validation_per_em, ...)
  }
  if (any(input_structured)) {
    if (verbose) cat(paste("Some emulators show unresolved structured input errors:", #nocov
              paste0(map_chr(ems, "output_name")[input_structured], collapse = "; "), "\n")) #nocov
    ems <- subset_emulators(ems, map_chr(ems, "output_name")[!input_structured])
    validation_per_em <- validation_per_em[!input_structured]
  }
  if (length(ems) == 0) {
    if (verbose) cat("No emulators deemed suitable.\n") #nocov
    return(ems)
  }
  if (verbose) cat("Checking for structured errors in output space..\n") #nocov
  output_structured <- structured_error_check(ems, validation_per_em, ...)
  if (any(output_structured) && verbose)
    cat("Structured output errors detected. Resampling training and validation points.\n") #nocov
  structure_fix_attempts <- 0
  while(any(output_structured) && structure_fix_attempts < 5) {
    structure_fix_attempts <- structure_fix_attempts + 1
    ems <- map(seq_along(ems), function(i) {
      if (output_structured[i]) {
        if (verbose) cat(paste0("Retraining emulator for output ", ems[[i]]$output_name, "..\n"))
        old_training <- cbind.data.frame(eval_funcs(scale_input, ems[[i]]$in_data, ems[[i]]$ranges, FALSE), ems[[i]]$out_data) |>
          setNames(c(names(ems[[i]]$ranges), ems[[i]]$output_name))
        all_data <- rbind.data.frame(validation[,c(names(ems[[i]]$ranges), ems[[i]]$output_name)], old_training)
        new_sample <- sample(nrow(all_data), nrow(old_training))
        new_train <- all_data[new_sample,]
        validation_per_em[[i]] <- all_data[-new_sample,]
        return(emulator_from_data(new_train, ems[[i]]$output_name, ems[[i]]$ranges, verbose = FALSE)[[1]])
      }
      else return(ems[[i]])
    }) |> setNames(map_chr(ems, "output_name"))
    output_structured <- structured_error_check(ems, validation_per_em, ...)
  }
  if (any(output_structured)) {
    if (verbose) cat(paste("Some emulators show unresolved structured output errors:", #nocov
              paste0(map_chr(ems, "output_name")[output_structured], collapse = "; "), "\n")) #nocov

    ems <- subset_emulators(ems, map_chr(ems, "output_name")[!output_structured])
    validation_per_em <- validation_per_em[!output_structured]
  }
  if (length(ems) == 0) {
    if (verbose) cat("No emulators deemed suitable.\n") #nocov
    return(ems)
  }
  if (verbose) cat("Checking for problematic implausibility misclassifications..\n") #nocov
  classified_wrong <- points_of_concern(ems, targets, validation_per_em, test = 'ce', ...)
  if (nrow(classified_wrong) > 0 && verbose)
    cat("Classification diagnostic failures. Inflating relevant emulator uncertainties.\n") #nocov
  while (nrow(classified_wrong) > 0) {
    number_per_em <- map_dbl(ems, ~nrow(classification_diag(., targets, classified_wrong, plt = FALSE)))
    ems <- map(seq_along(ems), function(i) {
      if (number_per_em[i] == 0) return(ems[[i]])
      return(ems[[i]]$mult_sigma(1.1))
    }) |> setNames(map_chr(ems, "output_name"))
    classified_wrong <- points_of_concern(ems, targets, validation_per_em, test = 'ce', ...)
  }
  if (verbose) cat("Checking for issues in comparison diagnostics..\n") #nocov
  comparison_wrong <- points_of_concern(ems, targets, validation_per_em, test = 'cd', ...)
  if (nrow(comparison_wrong) > 0) {
    if (verbose) cat("Comparison diagnostic issues. Inflating relevant emulator uncertainties..\n") #nocov
    while(nrow(comparison_wrong) > 0) {
      number_per_em <- map_dbl(ems, ~nrow(comparison_diag(., targets = NULL, validation = comparison_wrong, plt = FALSE)))
      ems <- map(seq_along(ems), function(i) {
        if (number_per_em[i] == 0) return(ems[[i]])
        return(ems[[i]]$mult_sigma(1.1))
      }) |> setNames(map_chr(ems, "output_name"))
      comparison_wrong <- points_of_concern(ems, targets, validation_per_em, test = 'cd', ...)
    }
    binomial_problems <- apply(binomial_diagnostic_test(ems, validation_per_em, df.out = TRUE, ...), 2, all)
    if (!all(binomial_problems)) {
      if (verbose) cat(paste("Some emulators show scaling issues:", #nocov
                paste0(map_chr(ems, "output_name")[!binomial_problems], collapse = "; "), "\n")) #nocov
      ems <- subset_emulators(ems, map_chr(ems, "output_name")[binomial_problems])
    }
  }
  if (length(ems) == 0) {
    if (verbose) cat("No emulators deemed suitable.\n") #nocov
    return(ems)
  }
  if (length(original_outputs) != length(ems))
    if (verbose) { #nocov start
      cat("Some emulators did not pass diagnostics: ", paste0(original_outputs[!original_outputs %in% map_chr(ems, "output_name")], collapse = "; "), ".\n")
      cat("Investigate these outputs carefully and consider adding in additional training points near problematic regions of space.\n")
    } #nocov end
  return(ems)
}
Any scripts or data that you put into this service are public.
hmer documentation built on June 22, 2024, 9:22 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
hmer
History Matching and Emulation Package

R/diagnostics.R
In hmer: History Matching and Emulation Package

Defines functions points_of_concern binomial_diagnostic_test standard_errors comparison_diag classification_diag individual_errors validation_diagnostics analyze_diagnostic get_diagnostic space_removal residual_diag summary_diag k_fold_measure clean_data

Documented in analyze_diagnostic classification_diag comparison_diag get_diagnostic individual_errors residual_diag space_removal standard_errors summary_diag validation_diagnostics

Try the hmer package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hmer History Matching and Emulation Package

R/diagnostics.R In hmer: History Matching and Emulation Package

Defines functions points_of_concern binomial_diagnostic_test standard_errors comparison_diag classification_diag individual_errors validation_diagnostics analyze_diagnostic get_diagnostic space_removal residual_diag summary_diag k_fold_measure clean_data

Documented in analyze_diagnostic classification_diag comparison_diag get_diagnostic individual_errors residual_diag space_removal standard_errors summary_diag validation_diagnostics

Try the hmer package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

hmer
History Matching and Emulation Package

R/diagnostics.R
In hmer: History Matching and Emulation Package