R/evsi.R
In voi: Expected Value of Information

Documented in evsi

##' Calculate the expected value of sample information from a decision-analytic
##' model
##'
##' Calculate the expected value of sample information from a decision-analytic
##' model
##'
##' See the \href{https://chjackson.github.io/voi/articles/voi.html#evsi}{package overview / Get Started vignette} for some examples of using this function. 
##'
##' @inheritParams evppi
##'
##' @param study Name of one of the built-in study types supported by this
##'   package for EVSI calculation.  If this is supplied, then the columns of
##'   \code{inputs} that correspond to the parameters governing the study data
##'   should be identified in \code{pars}.
##'
##'   Current built-in studies are
##'
##'   \code{"binary"} A study with a binary outcome observed on one sample of
##'   individuals.   Requires one parameter: the probability of the outcome. The
##'   sample size is specifed in the \code{n} argument to \code{evsi()}, and the
##'   binomially-distributed outcome is named \code{X1}.
##'
##'   \code{"trial_binary"} Two-arm trial with a binary outcome.   Requires two
##'   parameters: the probability of the outcome in arm 1 and 2 respectively.
##'   The sample size is the same in each arm, specifed in the \code{n} argument
##'   to \code{evsi()}, and the binomial outcomes are named \code{X1} and
##'   \code{X2} respectively.  
##'
##'   \code{"normal_known"} A study of a normally-distributed outcome, with a
##'   known standard deviation, on one sample of individuals.  Likewise the
##'   sample size is specified in the \code{n} argument to \code{evsi()}.  The
##'   standard deviation defaults to 1, and can be changed by specifying
##'   \code{sd} as a component of the \code{aux_pars} argument, e.g.
##'   \code{evsi(..., aux_pars=list(sd=2))}.   
##'
##'   Either \code{study} or \code{datagen_fn} should be supplied to
##'   \code{evsi()}.
##'
##'   For the EVSI calculation methods where explicit Bayesian analyses of the
##'  simulated data are performed, the prior parameters for these built-in studies
##'  are supplied in the \code{analysis_args} argument to \code{evsi()}.  These
##'   assume Beta priors for probabilities, and Normal priors for the mean of a
##' normal outcome.
##'
##'
##' @param datagen_fn If the proposed study is not one of the built-in types
##'   supported, it can be specified in this argument as an R function to sample
##'   predicted data from the study.  This function should have the following
##'   specification:
##'
##'   1. the function's first argument should be a data frame of parameter
##'   simulations, with one row per simulation and one column per parameter.
##'   The parameters in this data frame must all be found in \code{inputs},
##'   but need not necessarily be in the same order or include all of them. 
##'
##'   2. the function should return a data frame.
##'
##'   3. the returned data frame should have number of rows equal to the number
##'   of parameter simulations in \code{inputs}.
##'
##'   4. if \code{inputs} is considered as a sample from the posterior, then
##'   \code{datagen_fn(inputs)} returns a corresponding sample from the
##'   posterior predictive distribution, which includes two sources of
##'   uncertainty: (a) uncertainty about the parameters and (b) sampling
##'   variation in observed data given fixed parameter values.
##'
##'   5. the function can optionally have more than one argument. If so, these
##'   additional arguments should be given default values in the definition of
##'   \code{datagen_fn}.  If there is an argument called \code{n}, then it is
##'   interpreted as the sample size for the proposed study.
##'
##' @param pars Character vector identifying which parameters are learned from the proposed study.
##' This is required for the moment matching and importance sampling methods,
##' and these should be columns of \code{inputs}.   This is not required for the nonparametric
##' regression methods.  
##' 
##' @param pars_datagen Character vector identifying which columns of \code{inputs} are
##'   the parameters required to generate data from the proposed study.
##'   These should be columns of \code{inputs}.   
##' 
##'   If \code{pars_datagen} is not supplied, then it is assumed to be the same as \code{pars}.
##'   Note that these can be different.  Even if the study data are generated by a particular parameter,
##'   when analysing the data we could choose to ignore the information that the data provides about
##'   that parameter.
##'
##' @param n Sample size of future study, or vector of alternative sample sizes.
##'   This is understood by the built-in study designs.  For studies specified
##'   by the user with \code{datagen_fn}, if \code{datagen_fn} has an argument
##'   \code{n}, then this is interpreted as the sample size.  However if
##'   calling \code{evsi} for a user-specified design where 
##'   \code{datagen_fn} does not have an \code{n} argument, then any \code{n}
##'   argument supplied to \code{evsi} will be ignored. 
##'
##'   Currently this
##'   shortcut is not supported if more than one quantity is required to
##'   describe the sample size, for example, trials with unbalanced arms.  In
##'   that case, you will have to hard-code the required sample sizes into
##'   `datagen_fn`.
##'
##'   For the nonparametric regression and importance sampling methods, the
##'   computation is simply repeated for each sample size supplied here.
##'
##'   The moment matching method uses a regression model to estimate the
##'   dependency of the EVSI on the sample size, hence to enable EVSI to be
##'   calculated efficiently for any number of sample sizes (Heath et al. 2019).
##'
##' @param aux_pars A list of additional fixed arguments to supply to the
##'   function to generate the data, whether that is a built-in study design or user-defined
##'   function supplied in \code{datagen_fn}.  For example, \code{evsi(..., aux_pars = list(sd=2))} defines the fixed
##'   standard deviation in the \code{"normal_known"} model.
##'
##' @param method Character string indicating the calculation method.  Defaults to \code{"gam"}.
##'
##'   All the nonparametric regression methods supported for
##'   \code{\link{evppi}}, that is \code{"gam","gp","earth","inla"}, can also be
##'   used for EVSI calculation by regressing on a summary statistic of the
##'   predicted data (Strong et al 2015). 
##'
##'   \code{"is"} for importance sampling (Menzies 2016)
##'
##'   \code{"mm"} for moment matching (Heath et al 2018)
##'
##'   Note that the  \code{"is"} and \code{"mm"} methods are used in conjunction
##'   with nonparametric regression, and the \code{gam_formula} argument can be
##'   supplied to \code{evsi} to specify this regression - see
##'   \code{\link{evppi}} for documentation of this argument.
##'
##' @param likelihood Likelihood function, required (and only required) for the
##'   importance sampling method when a study design other than one of the
##'   built-in ones is used.  This should have two arguments, named as follows:
##'
##'   \code{Y}: a one-row data frame of predicted data. Columns are defined by different
##'   outcomes in the data, with names matching the names of the data frame returned by
##'   \code{datagen_fn}.  
##'
##'   \code{inputs}. a data frame of simulated parameter values. Columns should correspond
##'   to different variables in \code{inputs}.  The column names should all be
##'   found in the names of \code{inputs}, though they do not have to be in the same
##'   order, or include everything in \code{inputs}. The number or rows should be the same as
##'   the number of rows in \code{inputs}.
##'
##'   The function should return a vector whose length matches the number of
##'   rows of the parameters data frame given as the second argument.   Each
##'   element of the vector gives the likelihood of the corresponding set of
##'   parameters, given the data in the first argument.  An example is given in
##'   the vignette.
##' 
##'   The likelihood can optionally have a \code{n} argument, which is interpreted
##'   as the sample size of the study.   If the \code{n} 
##'   argument to \code{evsi} is used then this is passed to the likelihood function.
##'   Conversely any \code{n} argument to \code{evsi} will be ignored by a likelihood
##'   function that does not have its own \code{n} argument.
##'
##'   Note the definition of the likelihood should agree with the definition of
##'   \code{datagen_fn} to define a consistent sampling distribution for the
##'   data. No automatic check is performed for this.
##'
##' @param analysis_fn Function which fits a Bayesian model to the generated
##'   data.   Required for \code{method="mm"} if a study design other than one
##'   of the built-in ones is used.  This should be a function that takes the
##'   following arguments:
##'
##'   `data`: A data frame with names matching the output of `datagen_fn`
##'
##'   `args`: A list with constants required in the Bayesian analysis, e.g.
##'   prior parameters, or options for the analysis, e.g. number of MCMC
##'   simulations. The component of this list called \code{n} is assumed to
##'   contain the sample size of the study.
##'
##'   `pars` Names of the parameters whose posterior is being sampled.
##'
##'   The function should return a data frame with names matching `pars`,
##'   containing a sample from the posterior distribution of the parameters
##'   given data supplied through `data`.
##'
##'   `analysis_fn` is required to have all three of these arguments, but you do
##'   not need to use any elements of `args` or `pars` in the body of
##'   `analysis_fn`.  Instead, sample sizes, prior parameters, MCMC options and
##'   parameter names can alternatively be hard-coded inside `analysis_fn`. Passing these
##'   through the function arguments (via the \code{analysis_args} argument to
##'   \code{evsi}) is only necessary if we want to use the same `analysis_fn` to
##'   do EVSI calculations with different sample sizes or other settings.
##'
##' @param analysis_args List of arguments required for the Bayesian analysis of
##'   the predicted data, e.g. definitions of the prior and options to control
##'   sampling.  Only used in \code{method="mm"}.  This is required if the study
##'   design is one of the built-in ones specified in \code{study}.  If a custom
##'   design is specifed through \code{analysis_fn}, then any constants needed
##'   in `analysis_fn` can either be supplied in `analysis_args`, or hard-coded
##'   in `analysis_fn` itself.
##'
##'   For the built-in designs, the lists should have the following named
##'   components. An optional component `niter` in each case defines the
##'   posterior sample size (default 1000).
##'
##'   `study="binary"`: `a` and `b`: Beta shape parameters
##'
##'   `study="trial_binary"`: `a1` and `b1`: Beta shape parameters for the prior
##'   for the first arm,  `a2` and `b2`: Beta shape parameters for the prior for
##'   the second arm.
##'
##'   `study="normal_known"`: `prior_mean`, `prior_sd` (mean and standard deviation 
##'   deviation of the Normal prior) and `sampling_sd` (SD of an individual-level normal
##'   observation, so that the sampling SD of the mean outcome over the study is
##'   `sampling_sd/sqrt(n)`.
##'
##' @param model_fn Function which evaluates the decision-analytic model, given
##'   parameter values.  Required for \code{method="mm"}.  See
##'   \code{\link{evppi_mc}} for full documentation of the required specification
##'   of this function.
##'
##' @param par_fn Function to simulate values from the uncertainty distributions
##'   of parameters needed by the decision-analytic model.  Should take one
##'   argument and return a data frame with one row for each simulated value,
##'   and one column for each parameter.  See \code{\link{evppi_mc}} for full
##'   specification.
##'
##' @param Q Number of quantiles to use in \code{method="mm"}.
##'
##' @param npreg_method Method to use to calculate the EVPPI, for those methods
##'   that require it.  This is passed to \code{\link{evppi}} as the
##'   \code{method} argument.
##'
##' @param nsim Number of simulations from the model to use for calculating
##'   EVPPI.  The first \code{nsim} rows of the objects in \code{inputs} and
##'   \code{outputs} are used.
##'
##' @param ... Other arguments understood by specific methods, e.g. \code{gam_formula}
##' and other controlling options (see \code{\link{evppi}}) can be passed to the
##' nonparametric regression used inside the moment matching method.
##'
##' @return A data frame with a column \code{pars}, indicating the
##'   parameter(s), and a column \code{evsi}, giving the corresponding
##'   EVPPI.  If the EVSI for multiple sample sizes was requested,
##'   then the sample size is returned in the column \code{n}, and if
##'   \code{outputs} is of "cost-effectiveness analysis" form, so that
##'   there is one EVPPI per willingness-to-pay value, then a column
##'   \code{k} identifies the willingness-to-pay.
##'
##' @references
##' 
##' Heath, A., Kunst, N., & Jackson, C. (eds.). (2024). Value of Information for Healthcare Decision-Making. CRC Press.
##'
##' Strong, M., Oakley, J. E., Brennan, A., & Breeze, P. (2015). Estimating the
##' expected value of sample information using the probabilistic sensitivity
##' analysis sample: a fast, nonparametric regression-based method. Medical
##' Decision Making, 35(5), 570-583.
##'
##' Menzies, N. A. (2016). An efficient estimator for the expected value of
##' sample information. Medical Decision Making, 36(3), 308-320.
##'
##' Heath, A., Manolopoulou, I., & Baio, G. (2018). Efficient Monte Carlo
##' estimation of the expected value of sample information using moment
##' matching. Medical Decision Making, 38(2), 163-173.
##'
##' Heath, A., Manolopoulou, I., & Baio, G. (2019). Estimating the expected
##' value of sample information across different sample sizes using moment
##' matching and nonlinear regression. Medical Decision Making, 39(4), 347-359.
##'
##' @export
evsi <- function(outputs,
                 inputs,
                 study=NULL,
                 datagen_fn=NULL,
                 pars=NULL,
                 pars_datagen=NULL,
                 n=100,
                 aux_pars=NULL, 
                 method=NULL,
                 likelihood=NULL,
                 analysis_fn=NULL,
                 analysis_args=NULL,
                 model_fn=NULL,
                 par_fn=NULL,
                 Q=50,
                 npreg_method="gam",
                 nsim=NULL,
                 verbose=FALSE, 
                 check=FALSE,
                 ...)
{
    check_inputs(inputs)
    check_ss(n)
    outputs <- check_outputs(outputs, inputs)
    if (is.null(method))
        method <- default_evsi_method()

    if (is.null(nsim)) nsim <- nrow(inputs)
    outputs <- subset_outputs(outputs, nsim)
    inputs <- inputs[1:nsim,,drop=FALSE]

    datagen_fn <- form_datagen_fn(study, datagen_fn, inputs, aux_pars)
    ## Could use any nonparametric regression method to regress on a summary statistic, in identical way to EVPPI estimation.
    
    pars_datagen <- check_pars_datagen(pars, pars_datagen, inputs)
    npreg_method <- default_npreg_method(pars)
    if (method %in% npreg_methods) { 
        res <- evsi_npreg(outputs=outputs, inputs=inputs, 
                   datagen_fn=datagen_fn, pars=pars_datagen, n=n, 
                   method=method, verbose=verbose, check=check, 
                   aux_pars = aux_pars, ...)
    } else if (method=="is") {
        if (verbose) message("Processing likelihood...")
        likelihood <- form_likelihood(study, likelihood, inputs, datagen_fn, pars_datagen)
        if (verbose) message("Entering evsi_is...")
        res <- evsi_is(outputs=outputs, inputs=inputs, 
                       pars=pars, pars_datagen=pars_datagen,
                       datagen_fn=datagen_fn, n=n,
                       aux_pars=aux_pars, likelihood=likelihood,
                       npreg_method=npreg_method, verbose=verbose, ...)
    } else if (method=="mm") {
        res <- evsi_mm(outputs=outputs, inputs=inputs, 
                       pars=pars, pars_datagen=pars_datagen,
                       datagen_fn=datagen_fn,
                       study=study, 
                       analysis_fn=analysis_fn,
                       analysis_args=analysis_args,
                       model_fn=model_fn, 
                       par_fn=par_fn, 
                       n=n, Q=Q,
                       npreg_method=npreg_method,
                       verbose=verbose, ...)
    }
    else stop("Other methods not implemented yet")
    attr(res, "method") <- method
    attr(res, "outputs") <- class(outputs)[1]
    class(res) <- c("evsi", attr(res,"class"))
    res
}

evsi_npreg <- function(outputs, inputs, datagen_fn, pars, n, method=NULL, se=FALSE, B=500, verbose, check, aux_pars=NULL, ...){
    nn <- length(n)
    res <- vector(nn, mode="list")
    validate_char(pars, "pars")
    for (i in seq_along(n)){
        Tdata <- generate_data(inputs, datagen_fn, n[i], pars, aux_pars)
        res[[i]] <- evppi_npreg(outputs=outputs, inputs=Tdata, 
                           pars=names(Tdata), method=method, se=se, B=B, verbose=verbose, ...)
        names(res[[i]])[names(res[[i]])=="evppi"] <- "evsi"
        rownames(res[[i]]) <- NULL
    }
    resall <- do.call("rbind", res)
    resall <- cbind(n=rep(n,each=nrow(res[[1]])), resall)
    if (check){
      attr(resall, "models") <- lapply(res, function(x)attr(x, "models"))
      names(attr(resall,"models")) <- as.character(n)
    }
    resall
}

generate_data <- function(inputs, datagen_fn, n=100, pars, aux_pars=NULL){
    check_datagen_fn(datagen_fn, inputs, pars)
    args <- list(inputs=inputs, n=n, pars=pars)
    args <- c(args, aux_pars)
    do.call(datagen_fn, args)
}

default_evsi_method <- function(){
    "gam"
}

check_study <- function(study) {
    if (!is.character(study) || (!(study %in% studies_builtin)))
      stop("``study` should be a character string matching one of the supported study designs")
}

form_datagen_fn <- function(study, datagen_fn, inputs, aux_pars=NULL){
  if (!is.null(study)) {
    check_study(study)
    if (!is.null(datagen_fn))
      warning("Ignoring `datagen_fn`, since a built-in study design was requested")
    datagen_fn <- get(sprintf("datagen_%s", study))
  } else {
    if (is.null(datagen_fn)) stop("`datagen_fn` should be supplied if `study` is not supplied")
    if (!is.function(datagen_fn)) stop("`datagen_fn` should be a function")
    if (!("n" %in% names(formals(datagen_fn))))
      formals(datagen_fn) <- c(formals(datagen_fn), list(n=100))
    formals(datagen_fn) <- c(formals(datagen_fn), list(pars=NULL))
    check_datagen_fn(datagen_fn, inputs, aux_pars)
  }
  datagen_fn
}

check_datagen_fn <- function(datagen_fn, inputs, pars=NULL, aux_pars=NULL){
    ## If there's more than one argument, check that those args have
    ## defaults (e.g. sample sizes). Give error for now if not, but
    ## consider relaxing if this becomes a problem
    extra_args <- formals(datagen_fn)[-1]
    extra_args <- extra_args[names(extra_args) != "pars"]
    if (length(extra_args)>0){
        no_defaults <- sapply(extra_args, is.symbol)
        if (any(no_defaults)){
            stop(sprintf("Arguments \"%s\" of `datagen_fn` do not have default values", paste(names(no_defaults),collapse=",")))
        }
    }
    ret <- datagen_fn(inputs, pars=pars)
    if (!is.data.frame(ret)) stop("`datagen_fn` should return a data frame")
    parnames <- names(ret)[names(ret) %in% names(inputs)]
    if (length(parnames)>0) {
        stop(sprintf("`datagen_fn` returns variables with the same names as parameters (%s).  It should return simulated data", paste(parnames, collapse=",")))
    }
    if (nrow(ret) != nrow(inputs)){
        stop(sprintf("`datagen_fn` returns a data frame with %s rows. There should be %s rows, the same number of rows as `inputs`", nrow(ret), nrow(inputs)))
    }
}

form_analysis_fn <- function(study, analysis_fn, analysis_args, datagen_fn, inputs, n, pars, pars_datagen){
  if (!is.null(study)){
    check_study(study)
    analysis_fn <- get(sprintf("analysis_%s", study))
  } else {
    if (is.null(analysis_fn)) stop("`analysis_fn` should be supplied if `study` is not supplied")
    check_analysis_fn(analysis_fn, analysis_args, datagen_fn, inputs, n, pars, pars_datagen)
  }
  analysis_fn
}

check_analysis_fn <- function(analysis_fn, analysis_args, datagen_fn, inputs, n, pars, pars_datagen){
  if (!is.function(analysis_fn)) stop("`analysis_fn` should be a function")
  if (!identical(names(formals(analysis_fn)), c("data","args","pars")))
    stop("`analysis_fn` should have arguments `data`,`args`,`pars` in that order")
  testdata <- datagen_fn(inputs = inputs[1,,drop=FALSE], n = n, pars=pars_datagen)
  post_pars <- analysis_fn(data = testdata, args = analysis_args, pars=pars)
  missing_pars <- setdiff(pars, names(post_pars))
  if (length(missing_pars) > 0){
    badpars <- paste(missing_pars, collapse=",")
    stop(sprintf("Parameters %s not found in data frame returned by `analysis_fn`", badpars))
  }
}

check_pars_in_modelfn <- function(pars, model_fn){
  mfargs <- names(formals(model_fn))
  missing_args <- setdiff(pars, mfargs)
  if (length(missing_args) > 0){
    badpars <- paste(missing_args, collapse=",")
    stop(sprintf("Parameters %s are not arguments to model_fn. Should model_fn be reparameterised?", badpars))
  } 
}

check_ss <- function(n){
    if (!is.numeric(n))
        stop("sample size `n` should be a numeric vector")
    if (any(n < 0))
        stop("sample sizes `n` should all be positive integers")
}

form_analysis_args <- function(analysis_args, study, n){
  if (!is.null(study)){
    if (study %in% studies_builtin){
      if (!is.list(analysis_args))
        stop("analysis_args should be supplied as a named list if using one of the built-in study designs")
    }
  }
  if (is.null(analysis_args))
    analysis_args <- list() # for testing
  if (is.null(analysis_args$n))
    analysis_args$n <- n[1]
  analysis_args
}

check_pars_datagen <- function(pars, pars_datagen, inputs){
  if (is.null(pars_datagen))
    pars_datagen <- pars
  check_pars(pars_datagen, inputs, evppi=FALSE)
  pars_datagen
}