#' Bayesian aggregate treatment effects model
#'
#' Bayesian inference on parameters of an average treatment effects model
#' that's appropriate to the supplied
#' individual- or group-level data, using Hamiltonian Monte Carlo in Stan.
#' (For overall package help file see [baggr-package])
#'
#'
#' @param data data frame with summary or individual level data to meta-analyse;
#' see Details section for how to format your data
#' @param model if \code{NULL}, detected automatically from input data;
#' otherwise choose from
#' \code{"rubin"}, \code{"mutau"}, \code{"rubin_full"}, \code{"quantiles"}
#' (see Details).
#' @param pooling Type of pooling;
#' choose from \code{"none"}, \code{"partial"} (default) and \code{"full"}.
#' If you are not familiar with the terms, consult the vignette;
#' "partial" can be understood as random effects and "full" as fixed effects
#' @param pooling_control Pooling for group-specific control mean terms in models using
#' individual-level data. Typically we use either `"none"` or `"partial"`,
#' but if you want to remove the group-specific intercept altogether,
#' set this to `"remove"`.
#' @param effect_label How to label the effect(s). These labels are used in various print and plot outputs.
#' Will default to `"mean"` in most models, `"log OR"` in logistic model etc.
#' If you plan on comparing models (see [baggr_compare]), use the same labels.
#' @param covariates Character vector with column names in `data`. The corresponding columns are used as
#' covariates (fixed effects) in the meta-regression model (in case of aggregate data).
#' In the case of individual level data the model does not differentiate between group-level
#' variables (same values of the covariate for all rows related to a given group) and
#' individual-level covariates.
#' @param prior_hypermean prior distribution for hypermean; you can use "plain text" notation like
#' `prior_hypermean=normal(0,100)` or `uniform(-10, 10)`.
#' See _Details:Priors_ section below for more possible specifications.
#' If unspecified, the priors will be derived automatically based on data
#' (and printed out in the console).
#' @param prior_hypersd prior for hyper-standard deviation, used
#' by Rubin and `"mutau"` models;
#' same rules apply as for `prior_hypermean`
#' @param prior_hypercor prior for hypercorrelation matrix, used by the `"mutau"` model
#' @param prior_beta prior for regression coefficients if `covariates` are specified; will default to
#' an experimental normal(0, 10^2) distribution
#' @param prior_control prior for the mean in the control arm (baseline), currently
#' used in `"logit"` model only;
#' if `pooling_control = "partial"`, the prior is a hyperprior
#' for all baselines; if `"none"`,
#' then it is an independent prior for each baseline
#' @param prior_control_sd prior for the SD in the control arm (baseline), currently
#' used in `"logit"` model only;
#' this can only be used if `pooling_control = "partial"`
#' @param prior_sigma prior for error terms in linear regression models (`"rubin_full"` or `"mutau_full"`)
#' @param prior alternative way to specify all priors as a named list with `hypermean`,
#' `hypersd`, `hypercor`, `beta`, `control`, `control_sd`, analogous to the `prior_` arguments above,
#' e.g. `prior = list(hypermean = normal(0,10), beta = uniform(-50, 50))`
#' @param ppd logical; use prior predictive distribution? (_p.p.d._)
#' If `ppd=TRUE`, Stan model will sample from the prior distribution(s)
#' and ignore `data` in inference. However, the `data` argument may still
#' be used to infer the correct model (if `model=NULL`) and to set the
#' default priors, so you must specify it.
#' @param outcome character; column name in (individual-level)
#' \code{data} with outcome variable values
#' @param group character; column name in \code{data} with grouping factor;
#' it is required for individual-level data; for summarised data
#' it will be used to label the groups when displaying results
#' @param treatment character; column name in (individual-level) \code{data} with treatment factor;
#' @param quantiles if \code{model = "quantiles"}, a vector indicating which quantiles of data to use
#' (with values between 0 and 1)
#' @param test_data data for cross-validation; NULL for no validation, otherwise a data frame
#' with the same columns as `data` argument. See "Cross-validation" section below.
#' @param silent Whether to silence messages about prior settings and about other automatic behaviour.
#' @param warn print an additional warning if Rhat exceeds 1.05
#' @param ... extra options passed to Stan function, e.g. \code{control = list(adapt_delta = 0.99)},
#' number of iterations etc.
#' @return `baggr` class structure: a list including the Stan model fit
#' alongside input data, pooling metrics, and various model properties.
#' If test data is used, the mean value of -2*lpd is reported as `mean_lpd`
#'
#'
#' @details
#'
#' Below we briefly discuss 1/ data preparation, 2/ choice of model, 3/ choice of priors.
#' All three are discussed in more depth in the package vignette, `vignette("baggr")`.
#'
#' __Data.__ For aggregate data models you need a data frame with columns
#' `tau` and `se` (Rubin model) or `tau`, `mu`, `se.tau`, `se.mu` ("mu & tau" model).
#' An additional column can be used to provide labels for each group
#' (by default column `group` is used if available, but this can be
#' customised -- see the example below).
#' For individual-level data, three columns are needed: outcome, treatment, group. These
#' are identified by using the `outcome`, `treatment` and `group` arguments.
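#'
#' For instance, a minimal aggregate-data input for the Rubin model could look
#' like this (the values are purely illustrative):
#'
#' ```r
#' df_agg <- data.frame(group = c("A", "B", "C"),
#'                      tau   = c(0.20, -0.10, 0.35),  # estimated treatment effects
#'                      se    = c(0.10,  0.12, 0.15))  # their standard errors
#' ```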
#'
#' Many data preparation steps can be done through a helper function [prepare_ma].
#' It can convert individual to summary-level data, calculate
#' odds/risk ratios (with/without corrections) in binary data, standardise variables and more.
#' Using it will automatically format data inputs to work with `baggr()`.
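#'
#' As a sketch, assuming an individual-level data frame `df_ind` with columns
#' `outcome`, `treatment` and `group`, the conversion could look like this:
#'
#' ```r
#' df_summary <- prepare_ma(df_ind, outcome = "outcome")  # group-level summaries
#' baggr(df_summary)  # the appropriate model is then detected automatically
#' ```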
#'
#'
#' __Models.__ Available models are:
#'
#' * for the __continuous variable__ means:
#' `"rubin"` model for average treatment effect (using summary data), `"mutau"`
#' version which takes into account means of control groups (also using summary data),
#' `"rubin_full"`, which is the same model as `"rubin"` but works with individual-level data
#' * for __continuous variable quantiles__: `"quantiles"` model
#' (see Meager, 2019 in references)
#' * for _mixture data_: `"sslab"` (experimental)
#' * for __binary data__: `"logit"` model can be used on individual-level data;
#' you can also analyse continuous statistics such as
#' log odds ratios and log risk ratios using the models listed above;
#' see `vignette("baggr_binary")` for tutorial with examples
#'
#' If no model is specified, the function tries to infer the appropriate
#' model automatically.
#' Additionally, the user can specify the type of pooling;
#' the default is always partial pooling.
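#'
#' For example, both can also be set explicitly (a sketch using the `df_pooled`
#' data frame constructed in the Examples section below):
#'
#' ```r
#' baggr(df_pooled, model = "rubin", pooling = "partial")
#' ```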
#'
#'
#' __Covariates.__
#' Both aggregate and individual-level data can include extra columns, given by the `covariates` argument
#' (specified as a character vector of column names), to be used in regression models.
#' We also refer to the impact of these covariates as _fixed effects_.
#'
#' Two types of covariates may be present in your data:
#'
#' * In `"rubin"` and `"mutau"` models, covariates that __change according to group unit__.
#' In that case, the model accounting
#' for the group covariates is a
#' [meta-regression](https://handbook-5-1.cochrane.org/chapter_9/9_6_4_meta_regression.htm)
#' model. It can be modelled on summary-level data.
#' * In `"logit"` and `"rubin_full"` models, covariates that __change according to individual unit__.
#' Then, such a model is commonly referred to as a
#' [mixed model](https://stats.stackexchange.com/questions/4700/what-is-the-difference-between-fixed-effect-random-effect-and-mixed-effect-mode/252888)
#' . It has to be fitted to individual-level data. Note that meta-regression is a special
#' case of a mixed model for individual-level data.
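#'
#' As an illustration, assuming the aggregate data frame also contains a numeric
#' group-level column (here called `year`), a meta-regression would be requested
#' simply by naming that column (the prior shown is purely illustrative):
#'
#' ```r
#' baggr(df_agg, covariates = c("year"), prior_beta = normal(0, 5))
#' ```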
#'
#'
#' __Priors.__ It is optional to specify priors yourself,
#' as the package will try to propose an appropriate
#' prior for the input data if you do not pass a `prior` argument.
#' To set the priors yourself, use `prior_` arguments. For specifying many priors at once
#' (or re-using between models), a single `prior = list(...)` argument can be used instead.
#' Meaning of the prior parameters may slightly change from model to model.
#' Details and examples are given in `vignette("baggr")`.
#' Setting `ppd=TRUE` can be used to obtain prior predictive distributions,
#' which is useful for understanding the prior assumptions,
#' especially in conjunction with [effect_plot]. You can also compare
#' different priors with [baggr_compare] by setting `baggr_compare(..., compare = "prior")`.
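#'
#' A sketch of setting several priors at once (the particular distributions are
#' illustrative, not recommendations):
#'
#' ```r
#' baggr(df_pooled, prior = list(hypermean = normal(0, 10), hypersd = uniform(0, 5)))
#' # equivalently, via individual arguments:
#' baggr(df_pooled, prior_hypermean = normal(0, 10), prior_hypersd = uniform(0, 5))
#' ```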
#'
#' __Cross-validation.__ When `test_data` are specified, an extra parameter, the
#' log predictive density, will be returned by the model.
#' (The fitted model itself is the same regardless of whether there are `test_data`.)
#' To understand this parameter, see documentation of [loocv], a function that
#' can be used to assess out of sample prediction of the model using all available data.
#' If using an individual-level data model, `test_data` should only include treatment arms
#' of the groups of interest. (This is because in cross-validation we are not typically
#' interested in the model's ability to fit heterogeneity in control arms, but
#' only heterogeneity in treatment arms.)
#' For aggregate-level data there is no such restriction.
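#'
#' As a sketch with aggregate data, one could hold out the last two groups of the
#' `df_pooled` data frame from the Examples section:
#'
#' ```r
#' fit_cv <- baggr(df_pooled[1:6, ], test_data = df_pooled[7:8, ])
#' fit_cv$mean_lpd  # -2 * mean log predictive density on the held-out groups
#' # loocv() automates this kind of check across all groups
#' ```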
#'
#' __Outputs.__ By default, some outputs are printed. There is also a
#' plot method for _baggr_ objects which you can access via [baggr_plot] (or simply `plot()`).
#' Other standard functions for working with `baggr` object are
#'
#' * [treatment_effect] for distribution of hyperparameters
#' * [group_effects] for distributions of group-specific parameters (alias: [study_effects], we use the two interchangeably)
#' * [fixed_effects] for coefficients in (meta-)regression
#' * [effect_draw] and [effect_plot] for posterior predictive distributions
#' * [baggr_compare] for comparing multiple `baggr` models
#' * [loocv] for cross-validation
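#'
#' A typical post-estimation workflow might look like this:
#'
#' ```r
#' fit <- baggr(df_pooled)
#' treatment_effect(fit)  # draws for the hyperparameters
#' group_effects(fit)     # draws for group-specific effects
#' plot(fit)              # forest-style plot of the estimates
#' ```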
#'
#' @examples
#' df_pooled <- data.frame("tau" = c(1, -1, .5, -.5, .7, -.7, 1.3, -1.3),
#' "se" = rep(1, 8),
#' "state" = datasets::state.name[1:8])
#' baggr(df_pooled) #baggr automatically detects the input data
#' # same model, but with correct labels,
#' # different pooling & passing some options to Stan
#' baggr(df_pooled, group = "state", pooling = "full", iter = 500)
#'
#' # model with non-default (and very informative) priors
#' baggr(df_pooled, prior_hypersd = normal(0, 2))
#'
#' \donttest{
#' # "mu & tau" model, using a built-in dataset
#' # prepare_ma() can summarise individual-level data
#' ms <- microcredit_simplified
#' microcredit_summary_data <- prepare_ma(ms, outcome = "consumption")
#' baggr(microcredit_summary_data, model = "mutau",
#' iter = 500, #this is just for illustration -- don't set it this low normally!
#' pooling = "partial", prior_hypercor = lkj(1),
#' prior_hypersd = normal(0,10),
#' prior_hypermean = multinormal(c(0,0),matrix(c(10,3,3,10),2,2)))
#' }
#'
#'
#' @importFrom rstan summary
#' @importFrom rstan sampling
#' @export
baggr <- function(data,
model = NULL,
pooling = c("partial", "none", "full"),
effect_label = NULL,
covariates = c(),
prior_hypermean = NULL, prior_hypersd = NULL, prior_hypercor=NULL,
prior_beta = NULL, prior_control = NULL, prior_control_sd = NULL,
prior_sigma = NULL,
# log = FALSE, cfb = FALSE, standardise = FALSE,
# baseline = NULL,
prior = NULL, ppd = FALSE,
pooling_control = c("none", "partial", "remove"),
test_data = NULL, quantiles = seq(.05, .95, .1),
outcome = "outcome", group = "group", treatment = "treatment",
silent = FALSE, warn = TRUE, ...) {
# check that it is data.frame of at least 1 row
# if(!inherits(data, "data.frame") || nrow(data) == 1)
# stop("data argument must be a data.frame of >1 rows")
# Match arguments
pooling <- match.arg(pooling)
pooling_control <- match.arg(pooling_control)
# Rename effect_label to effect (old naming convention in code)
effect <- effect_label
# For now we recommend that users format their data before passing to baggr()
# data <- prepare_ma(data,
# standardise = standardise, log = log,
# summarise = FALSE, cfb = cfb,
# treatment=treatment, group=group,
# outcome=outcome, baseline=baseline)
if(!is.null(model) && (model == "full")){
message("From v0.6 model 'full' is named 'rubin_full'. Please update your code in the future.")
model <- "rubin_full"
}
if(!is.null(model) && model == "mutau_sum") {
message("Experimental: using mu & tau model with sum specification (tau = mu + theta)")
model <- "mutau"
cumsum_mutau <- 0
} else {
cumsum_mutau <- 1
}
stan_data <- convert_inputs(data,
model,
effect,
covariates = covariates,
quantiles = quantiles,
outcome = outcome,
group = group,
treatment = treatment,
test_data = test_data,
silent = silent)
# model might've been chosen automatically (if NULL)
  # within convert_inputs(), otherwise it's unchanged
model <- attr(stan_data, "model")
# data might also change if Rubin model requested but mutau
# type inputs supplied
data <- attr(stan_data, "data")
# Lastly, effect changes if binary data were passed
# and effect arg was not specified
effect <- attr(stan_data, "effect")
attr(data, "outcome") <- outcome
attr(data, "group") <- group
attr(data, "treatment") <- treatment
# remember number of groups:
n_groups <- attr(stan_data, "n_groups")
# labels for what the effect parameters represent:
if(model == "quantiles"){
if(is.null(effect))
effect <- paste0(100*quantiles, "% quantile mean")
else if(length(effect) == 1)
effect <- paste0(100*quantiles, "% quantile on ", effect)
    else if(length(effect) != length(quantiles))
      stop("For quantile models, 'effect' must be of length 1 ",
           "or the same as the number of quantiles")
} else if(model == "logit"){
if(is.null(effect))
effect <- "logOR"
} else if(model == "sslab") {
if(is.null(effect))
effect <- c("Location of negative tail",
"Location of positive tail",
"Scale of negative tail",
"Scale of positive tail",
"LogOR on being negative",
"LogOR on being equal to 0")
else if(length(effect) != 6)
stop("For spike & slab models, 'effect' must be of length 6")
}
# In all other cases we set it to mean
if(is.null(effect))
effect <- "mean"
# Number of TE parameters
# (in the future this can be built into the models):
n_parameters <- length(effect)
# Pooling type:
stan_data[["pooling_type"]] <- switch(pooling,
"none" = 0,
"partial" = 1,
"full" = 2)
if(model %in% c("logit", "rubin_full", "mutau_full"))
stan_data[["pooling_baseline"]] <- switch(pooling_control,
"none" = 0,
"partial" = 1,
"remove" = 2)
if(model == "mutau_full"){
# For now this model only used for joint prior
stan_data[["joint_prior_mean"]] <- 1
stan_data[["joint_prior_variance"]] <- 1
}
if(model == "mutau")
stan_data[["cumsum"]] <- cumsum_mutau
# Prior settings:
if(is.null(prior)){
prior <- list(hypermean = prior_hypermean,
hypercor = prior_hypercor,
hypersd = prior_hypersd,
beta = prior_beta,
control = prior_control,
control_sd= prior_control_sd)
} else {
if(!is.null(prior_hypermean) || !is.null(prior_beta) ||
!is.null(prior_control) || !is.null(prior_control_sd) ||
!is.null(prior_hypercor) || !is.null(prior_hypersd))
message("Both 'prior' and 'prior_' arguments specified. Using 'prior' only.")
if(!inherits(prior, "list") ||
!all(names(prior) %in% c('hypermean', 'hypercor', 'hypersd',
'beta', 'control', 'control_sd')))
warning(paste("Only names used in the prior argument are:",
"'hypermean', 'hypercor', 'hypersd',
'beta', 'control', 'control_sd'"))
}
# If extracting prior from another model, we need to do a swapsie switcheroo:
stan_args <- list(...)
if("formatted_prior" %in% names(stan_args)){
formatted_prior <- stan_args$formatted_prior
stan_args$formatted_prior <- NULL
} else { # extract priors from inputs & fill in missing priors
formatted_prior <- prepare_prior(prior, data, stan_data, model,
pooling, covariates, quantiles = quantiles,
silent = silent)
}
for(nm in names(formatted_prior))
stan_data[[nm]] <- formatted_prior[[nm]]
stan_args$object <- stanmodels[[model]]
if(ppd){
if(pooling == "none")
stop("Can't use prior predictive distribution with no pooling.")
stan_data <- remove_data_for_prior_pred(stan_data)
}
stan_args$data <- stan_data
# SAMPLING IS HERE
fit <- do.call(rstan::sampling, stan_args)
result <- list(
"data" = data,
"inputs" = stan_data,
"user_prior" = prior,
"formatted_prior" = formatted_prior,
"n_groups" = n_groups,
"n_parameters" = n_parameters,
"effects" = effect,
"covariates" = covariates,
"pooling" = pooling,
"fit" = fit,
"model" = model
)
class(result) <- c("baggr")
attr(result, "ppd") <- ppd
if(grepl("individual", attr(stan_data, "data_type")))
result$summary_data <- prepare_ma(data,
# rare_event_correction = 0.5,
effect = ifelse(model == "logit", "logOR", "mean"),
group = attr(data, "group"),
treatment = attr(data, "treatment"),
outcome = attr(data, "outcome"),
pooling = TRUE)
if(model == "quantiles")
result[["quantiles"]] <- quantiles
if(!ppd){
result[["pooling_metric"]] <- pooling(result)
if(!is.null(test_data)){
result[["test_data"]] <- test_data
result[["mean_lpd"]] <- -2*mean(rstan::extract(fit, "logpd[1]")[[1]])
}
}
# Check convergence
rhat <- rstan::summary(fit)$summary[,"Rhat"]
  rhat <- rhat[!is.nan(rhat)] # drop parameters with undefined (NaN) Rhat
if(warn && any(rhat > 1.05))
warning(paste0("\nRhat statistic for ", sum(rhat > 1.05),
" parameters exceeded 1.05, with maximum equal to ",
round(max(rhat),2),
".\n This suggests lack of convergence.",
"\n No further warning will be issued.",
"\n Stan model saved as $fit in the returned object. \n"))
return(result)
}
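# Internal helper: stop with an informative error unless `bg` is a baggr model fit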
check_if_baggr <- function(bg) {
if(!inherits(bg, "baggr"))
stop("Object of class 'baggr' required.")
}
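# Internal helper used when ppd = TRUE: empties the data blocks of the Stan input
# list so that the model samples from the prior distributions only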
remove_data_for_prior_pred <- function(data) {
scalars_to0 <- c("K", "N", "Nc",
# specific to sslab (generalise this)
"N_neg", "N_pos")
vectors_to_remove <- c("theta_hat_k", "se_theta_k",
"y", "treatment", "site",
# specific to sslab
"treatment_neg", "treatment_pos", "cat",
"site_neg", "site_pos", "y_neg", "y_pos")
matrices_to_remove <- c("X")
# Specific to quantiles:
matrices_to_rescale <- c("y_0", "y_1")
arrays_to_rescale <- c("Sigma_y_k_0", "Sigma_y_k_1")
matrices_remove_rows <- c("x")
for(nm in scalars_to0)
if(!is.null(data[[nm]]))
data[[nm]] <- 0
for(nm in vectors_to_remove)
if(!is.null(data[[nm]]))
data[[nm]] <- array(0, dim = c(0))
for(nm in matrices_to_remove)
if(!is.null(data[[nm]]))
data[[nm]] <- array(0, dim = c(0,0))
for(nm in matrices_remove_rows)
if(!is.null(data[[nm]]))
data[[nm]] <- array(0, dim = c(0,ncol(data[[nm]])))
# This is for quantiles model where you have K x Nq inputs
# i.e. sites x Nquantiles
# For PPD we will need 0 x Nq
for(nm in matrices_to_rescale)
if(!is.null(data[[nm]]))
data[[nm]] <- array(0, dim = c(0, data$Nq))
for(nm in arrays_to_rescale)
if(!is.null(data[[nm]]))
data[[nm]] <- array(0, dim = c(0, data$Nq, data$Nq))
data
}