R/declare_diagnosands.R
In DeclareDesign: Declare and Diagnose Research Designs

Documented in diagnosand_handler pop.var

#' @param ... A set of new diagnosands.
#' @param subset A subset of the simulations data frame within which to calculate diagnosands e.g. \code{subset = p.value < .05}.
#' @param alpha Alpha significance level. Defaults to \code{.05}.
#' @param label Label for the set of diagnosands.
#' @param data A data.frame.
#'
#' @details
#'
#' If term is TRUE, the names of ... will be returned in a \code{term} column, and \code{inquiry}
#' will contain the step label. This can be used as an additional dimension for use in diagnosis.
#'
#' @importFrom rlang eval_tidy quos is_quosure quo_is_call %||% enquo
#' @importFrom stats na.omit
#' @rdname declare_diagnosands
diagnosand_handler <- function(data, ...,
                               subset = NULL,
                               alpha = 0.05,
                               label) {
  options <- quos(...)
  
  if(length(options) == 0) {
    default_diagnosands(data = data, alpha = alpha)
  } else {
    
    # subsetting the data -----------------------------------------------------
    
    subset <- enquo(subset)
    idx <- eval_tidy(subset, data = data)
    if (!is.null(idx)) {
      data <- data[idx, , drop = FALSE]
    }
    
    ret <- vector("list", length(options))
    
    for (i in seq_along(options)) {
      ret[i] <- eval_tidy(options[[i]], data = data)
    }
    
    ret <- simplify2array(ret)
    
    data.frame(
      diagnosand_label = names(options),
      diagnosand = ret,
      stringsAsFactors = FALSE
    )
  }
  
}


validation_fn(diagnosand_handler) <- function(ret, dots, label) {
  
  options <- names(dots)[!names(dots) %in% c("subset", "alpha", "label", "data")]
  
  # if (length(options) == 0) {
  #   declare_time_error("No diagnosands were declared.", ret)
  # }
  
  # check whether all diagnosands are named
  if (is.null(names(dots)) || "" %in% names(dots)) {
    declare_time_error("All diagnosands must be named", ret)
  }
  
  ret
}

#' Declare diagnosands
#'
#' @inheritParams declare_internal_inherit_params
#'
#' @details
#'
#' Diagnosands summarize the simulations generated by \code{\link{diagnose_design}} or \code{\link{simulate_design}}. Typically, the columns of the resulting simulations data.frame include the following variables: estimate, std.error, p.value, conf.low, conf.high, and inquiry. Many diagnosands will be a function of these variables.
#'
#' @return a function that returns a data.frame
#'
#' @importFrom rlang eval_tidy
#'
#' @export
#'
#' @examples
#'
#' # Two-arm randomized experiment
#' design <-
#'   declare_model(
#'     N = 500,
#'     gender = rbinom(N, 1, 0.5),
#'     X = rep(c(0, 1), each = N / 2),
#'     U = rnorm(N, sd = 0.25),
#'     potential_outcomes(Y ~ 0.2 * Z + X + U)
#'   ) +
#'   declare_inquiry(ATE = mean(Y_Z_1 - Y_Z_0)) +
#'   declare_sampling(S = complete_rs(N = N, n = 200)) +
#'   declare_assignment(Z = complete_ra(N = N, m = 100)) +
#'   declare_measurement(Y = reveal_outcomes(Y ~ Z)) +
#'   declare_estimator(Y ~ Z, inquiry = "ATE")
#' 
#' \dontrun{
#' # using built-in defaults:
#' diagnosis <- diagnose_design(design)
#' diagnosis
#' 
#' # You can choose your own diagnosands instead of the defaults:
#' 
#' my_diagnosands <-
#'   declare_diagnosands(median_bias = median(estimate - estimand))
#' 
#' ## You can set diagnosands within the diagnose_design function
#' ## using the 'diagnosands =' argument
#' diagnosis <- diagnose_design(design, diagnosands = my_diagnosands)
#' diagnosis
#' 
#' ## You can also set diagnosands with set_diagnosands 
#' design <- set_diagnosands(design, diagnosands = my_diagnosands)
#' diagnosis <- diagnose_design(design)
#' diagnosis
#' 
#' # If you do not specify diagnosands in diagnose_design, 
#' #   the function default_diagnosands() is used, 
#' #   which is reproduced below.
#' 
#' alpha <- 0.05
#' 
#' default_diagnosands <- 
#'   declare_diagnosands(
#'     mean_estimand = mean(estimand),
#'     mean_estimate = mean(estimate),
#'     bias = mean(estimate - estimand),
#'     sd_estimate = sqrt(pop.var(estimate)),
#'     rmse = sqrt(mean((estimate - estimand) ^ 2)),
#'     power = mean(p.value <= alpha),
#'     coverage = mean(estimand <= conf.high & estimand >= conf.low)
#'   )
#'   
#' diagnose_design(
#'   design,
#'   diagnosands = default_diagnosands
#' )
#' 
#' # A longer list of potentially useful diagnosands might include:
#' 
#' extended_diagnosands <- 
#'   declare_diagnosands(
#'     mean_estimand = mean(estimand),
#'     mean_estimate = mean(estimate),
#'     bias = mean(estimate - estimand),
#'     sd_estimate = sd(estimate),
#'     rmse = sqrt(mean((estimate - estimand) ^ 2)),
#'     power = mean(p.value <= alpha),
#'     coverage = mean(estimand <= conf.high & estimand >= conf.low),
#'     mean_se = mean(std.error),
#'     type_s_rate = mean((sign(estimate) != sign(estimand))[p.value <= alpha]),
#'     exaggeration_ratio = mean((estimate/estimand)[p.value <= alpha]),
#'     var_estimate = pop.var(estimate),
#'     mean_var_hat = mean(std.error^2),
#'     prop_pos_sig = mean(estimate > 0 & p.value <= alpha),
#'     mean_ci_length = mean(conf.high - conf.low)
#'   )
#'   
#' diagnose_design(
#'   design,
#'   diagnosands = extended_diagnosands
#' )
#' }
declare_diagnosands <- make_declarations(diagnosand_handler, "diagnosand", "diagnosands")

#' @importFrom stats na.omit
default_diagnosands <- function(data, alpha = 0.05){
  
  estimate <- data$estimate %||% NA
  estimand <- data$estimand %||% NA
  p.value <- data$p.value %||% NA
  std.error <- data$std.error %||% NA
  conf.low <- data$conf.low %||% NA
  conf.high <- data$conf.high %||% NA
  
  mean_estimand <- mean(estimand)
  mean_estimate <- mean(estimate)
  bias <- mean(estimate - estimand)
  sd_estimate <- sd(estimate)
  rmse <- sqrt(mean((estimate - estimand)^2))
  power <- mean(p.value <= alpha)
  coverage <- mean(estimand <= conf.high & estimand >= conf.low)
  
  data.frame(
    diagnosand_label = c(
      "mean_estimand",
      "mean_estimate",
      "bias",
      "sd_estimate",
      "rmse",
      "power",
      "coverage"
    ),
    diagnosand = c(
      mean_estimand,
      mean_estimate,
      bias,
      sd_estimate,
      rmse,
      power,
      coverage
    ), 
    stringsAsFactors = FALSE
  )
  
}

#' Population variance function
#'
#' @param x a numeric vector, matrix or data frame.
#' @param na.rm logical. Should missing values be removed?
#'
#' @return numeric scalar of the population variance
#' @export
#'
#' @examples
#' 
#' x <- 1:4
#' var(x) # divides by (n-1)
#' pop.var(x) # divides by n
#' 
#' 
pop.var <- function(x, na.rm = FALSE){
  # checks from the base R function var
  if (is.data.frame(x)) 
    x <- as.matrix(x)
  else stopifnot(is.atomic(x))
  
  if(na.rm) x <- na.omit(x)
  
  mean((x - mean(x, na.rm = na.rm))^2) 
}