R/utils.R

# Checkers ---------------------------------------------------------------------

#' Check inputs to a call to vim, cv_vim, or sp_vim
#'
#' @details Ensure that inputs to \code{vim}, \code{cv_vim}, and \code{sp_vim}
#'   follow the correct formats.
#'
#' @param Y the outcome
#' @param X the covariates
#' @param f1 estimator of the population-optimal prediction function
#'   using all covariates
#' @param f2 estimator of the population-optimal prediction function
#'   using the reduced set of covariates
#' @param indx the index or indices of the covariate(s) of interest
#'
#' @return None. Called for the side effect of stopping the algorithm if
#'   any inputs are in an unexpected format.
#' @export
check_inputs <- function(Y, X, f1, f2, indx) {
  if (is.null(f1) && is.null(Y)) {
    stop("You must enter either Y or fitted values for the full regression.")
  }
  if (is.null(f2) && is.null(X)) {
    stop("You must enter either X or fitted values for the reduced regression.")
  }
  # if indx is outside the range of X, stop and throw an error
  if (!is.null(X)) {
    if (any(indx > dim(X)[2])) {
      stop(paste0("One of the feature indices in 'indx' is larger than the ",
                  "total number of features in X. Please specify a new index ",
                  "subgroup in 'indx'."))
    }
  }
}
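
# Example usage (a minimal sketch, not run when this file is sourced): the
# data below are simulated purely for illustration.
# set.seed(1234)
# x <- data.frame(x1 = rnorm(100), x2 = rnorm(100))
# y <- 1 + 0.5 * x$x1 + rnorm(100)
# check_inputs(Y = y, X = x, f1 = NULL, f2 = NULL, indx = 2)       # passes silently
# try(check_inputs(Y = y, X = x, f1 = NULL, f2 = NULL, indx = 3))  # errors: indx exceeds ncol(X)
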
#' Check pre-computed fitted values for call to vim, cv_vim, or sp_vim
#'
#' @details Ensure that inputs to \code{vim}, \code{cv_vim}, and \code{sp_vim}
#'   follow the correct formats.
#'
#' @param Y the outcome
#' @param f1 estimator of the population-optimal prediction function
#'   using all covariates
#' @param f2 estimator of the population-optimal prediction function
#'   using the reduced set of covariates
#' @param cross_fitted_f1 cross-fitted estimator of the population-optimal
#'   prediction function using all covariates
#' @param cross_fitted_f2 cross-fitted estimator of the population-optimal
#'   prediction function using the reduced set of covariates
#' @param sample_splitting_folds the folds for sample-splitting (used for
#'   hypothesis testing)
#' @param cross_fitting_folds the folds for cross-fitting (used for point
#'   estimates of variable importance in \code{cv_vim} and \code{sp_vim})
#' @param cross_fitted_se logical; should cross-fitting be used to estimate
#'   standard errors?
#' @param V the number of cross-fitting folds
#' @param ss_V the number of folds for CV (if sample_splitting is TRUE)
#' @param cv a logical flag indicating whether or not to use cross-fitting
#'
#' @return None. Called for the side effect of stopping the algorithm if
#'   any inputs are in an unexpected format.
#' @export
check_fitted_values <- function(Y = NULL, f1 = NULL, f2 = NULL,
                                cross_fitted_f1 = NULL, cross_fitted_f2 = NULL,
                                sample_splitting_folds = NULL,
                                cross_fitting_folds = NULL,
                                cross_fitted_se = TRUE, V = NULL, ss_V = NULL,
                                cv = FALSE) {
  if (is.null(Y)) stop("Y must be entered.")
  if (!cv) {
    if (length(f1) == 0 || length(f2) == 0) {
      stop("Fitted values must be entered if run_regression = FALSE.")
    }
    if (length(sample_splitting_folds) != length(Y)) {
      stop("The entered folds must be the same length as the outcome of interest.")
    }
  } else {
    if (is.null(cross_fitted_f1)) {
      stop(paste0("You must specify a list of predicted values from a ",
                  "regression of Y on X."))
    }
    if (is.null(cross_fitted_f2)) {
      stop(paste0("You must specify a list of predicted values from either ",
                  "(a) a regression of the fitted values from the Y on X ",
                  "regression on the reduced set of covariates, or (b) ",
                  "a regression of Y on the reduced set of covariates."))
    }
    if (is.numeric(cross_fitted_f1)) {
      if (length(cross_fitted_f1) != length(Y)) {
        stop(paste0("There must be a predicted value for each observation ",
                    "in the dataset."))
      }
    } else {
      if (length(cross_fitted_f1) != V) {
        stop(paste0("There must be a predicted value for each observation ",
                    "in the dataset, in a list of length equal to the number ",
                    "of cross-fitting folds."))
      }
    }
    if (is.null(f1) & !cross_fitted_se) {
      stop(paste0("You must enter an estimator of the population-optimal predictor",
                  " using all covariates."))
    }
    if (is.null(f2) & !cross_fitted_se) {
      stop(paste0("You must enter an estimator of the population-optimal predictor",
                  " using the reduced set of covariates."))
    }
    if (is.null(cross_fitting_folds)) {
      stop("You must specify the folds that were used for cross-fitting.")
    }
    if (length(sample_splitting_folds) != length(unique(cross_fitting_folds))) {
      stop("The sample-splitting folds must have length equal to the number of cross-fitting folds.")
    }
  }
}
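
# Example usage (a minimal sketch, not run when sourcing): with cv = TRUE and
# pre-computed fitted values, supply one set of predictions per cross-fitting
# fold. The predictions below are random placeholders, for illustration only.
# set.seed(1234)
# y <- rnorm(100)
# cf_folds <- sample(rep(1:2, 50))
# cf_f1 <- lapply(1:2, function(v) rnorm(sum(cf_folds == v)))
# cf_f2 <- lapply(1:2, function(v) rnorm(sum(cf_folds == v)))
# check_fitted_values(Y = y, cross_fitted_f1 = cf_f1, cross_fitted_f2 = cf_f2,
#                     sample_splitting_folds = c(1, 2),
#                     cross_fitting_folds = cf_folds, V = 2, cv = TRUE)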

#' Create complete-case outcome, weights, and Z
#'
#' @param Y the outcome
#' @param C indicator of missing or observed
#' @param Z the covariates observed in phase 1 and 2 data
#' @param X all covariates
#' @param ipc_weights the weights
#'
#' @return a list, with the complete-case outcome, weights, and Z matrix
#' @export
create_z <- function(Y, C, Z, X, ipc_weights) {
  # set up internal data -- based on complete cases only
  Y_cc <- subset(Y, C == 1, drop = FALSE)
  weights_cc <- ipc_weights[C == 1]
  if (!all(C == 1) || !all(ipc_weights == 1)) {
    if (is.character(Z)) {
      tmp_Z <- Z[Z != "Y"]
      minus_X <- as.numeric(gsub("x", "", tmp_Z, ignore.case = TRUE))
      # check to see if it is only part of X matrix
      if (any(sapply(seq_along(minus_X), function(j) length(minus_X[j]) > 0))) {
        if (any(grepl("Y", Z))) {
          Z_in <- cbind.data.frame(as.data.frame(mget("Y")), X[, minus_X])
        } else {
          Z_in <- as.data.frame(X[, minus_X])
        }
      } else {
        Z_in <- as.data.frame(mget(Z))
      }
    } else {
      stop("Please enter a character vector corresponding to the names of the fully observed data.")
    }
  } else {
    Z_in <- NULL
  }
  list(Y = Y_cc, weights = weights_cc, Z = Z_in)
}
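
# Example usage (a minimal sketch, not run when sourcing): under two-phase
# sampling, Z names the fully observed variables ("Y" for the outcome and
# "X1", "X2", ... for columns of X). Simulated data, for illustration only.
# set.seed(1234)
# x <- data.frame(x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100))
# y <- as.matrix(1 + 0.5 * x$x1 + rnorm(100))
# cc <- rbinom(100, 1, 0.8)  # 1 = measured in phase 2
# z_lst <- create_z(Y = y, C = cc, Z = c("Y", "X1"), X = x,
#                   ipc_weights = rep(1 / 0.8, 100))
# str(z_lst)  # complete-case Y and weights, plus a Z data.frame with Y and the first column of X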

#' Process argument list for Super Learner estimation of the EIF
#'
#' @param arg_lst the list of arguments for Super Learner
#'
#' @return a list of modified arguments for EIF estimation
#' @export
process_arg_lst <- function(arg_lst) {
  if (!is.null(names(arg_lst)) && any(grepl("cvControl", names(arg_lst)))) {
    arg_lst$cvControl$stratifyCV <- FALSE
  }
  if (!is.null(names(arg_lst)) && any(grepl("method", names(arg_lst)))) {
    if (grepl("NNloglik", arg_lst$method)) {
      arg_lst$method <- "method.NNLS"
    } else {
      arg_lst$method <- "method.CC_LS"
    }
  }
  if (!is.null(names(arg_lst)) && any(grepl("family", names(arg_lst)))) {
    arg_lst$family <- stats::gaussian()
  }
  arg_lst
}
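
# Example usage (a minimal sketch): the EIF projection is a continuous-outcome
# regression, so binomial-type Super Learner options are swapped out.
# sl_args <- list(family = stats::binomial(), method = "method.CC_nloglik",
#                 cvControl = list(V = 5, stratifyCV = TRUE))
# process_arg_lst(sl_args)
# # stratifyCV becomes FALSE, method becomes "method.CC_LS",
# # and family becomes stats::gaussian()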

# ------------------------------------------------------------------------------

#' Obtain the type of VIM to estimate using partial matching
#'
#' @param type the partial string indicating the type of VIM
#'
#' @return the full string indicating the type of VIM
#' @export
get_full_type <- function(type) {
  types <- c("accuracy", "auc", "deviance", "r_squared", "anova", "average_value")
  full_type <- types[pmatch(type, types)]
  if (is.na(full_type)) {
    stop("We currently do not support the entered variable importance parameter.")
  }
  if (full_type == "anova") {
    message(paste0("Hypothesis testing is not available for type = 'anova'. ",
                   "If you want an R-squared-based hypothesis test, please enter ",
                   "type = 'r_squared'."))
  }
  full_type
}
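
# Example usage (a minimal sketch): partial matching expands abbreviations.
# get_full_type("r_sq")      # returns "r_squared"
# get_full_type("anova")     # returns "anova", with a message about testing
# try(get_full_type("mse"))  # errors: unsupported VIM type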

#' Return an estimator on a different scale
#'
#' @details It may be of interest to return an estimate (or confidence interval)
#'   on a different scale than originally measured. For example, computing a
#'   confidence interval (CI) for a VIM value that lies in (0,1) on the logit scale
#'   ensures that the CI also lies in (0, 1).
#'
#' @param obs_est the observed VIM estimate
#' @param grad the estimated efficient influence function
#' @param scale the scale to compute on
#'
#' @return the scaled estimate
#' @export
scale_est <- function(obs_est = NULL, grad = NULL, scale = "identity") {
  if (scale == "logit") {
    this_grad <- 1 / (obs_est - obs_est ^ 2)
    est <- stats::plogis(stats::qlogis(obs_est) + this_grad * mean(grad))
  } else if (scale == "log") {
    this_grad <- 1 / obs_est
    est <- exp(log(obs_est) + this_grad * mean(grad))
  } else {
    est <- obs_est + mean(grad)
  }
  est
}
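
# Example usage (a minimal sketch): a one-step update of a VIM estimate on the
# logit scale, using a simulated EIF purely for illustration. The result
# equals plogis(qlogis(0.3) + mean(eif) / (0.3 - 0.3^2)) and stays in (0, 1).
# set.seed(1234)
# eif <- rnorm(500, mean = 0.02, sd = 0.1)
# scale_est(obs_est = 0.3, grad = eif, scale = "logit")
# scale_est(obs_est = 0.3, grad = eif, scale = "identity")  # 0.3 + mean(eif)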

#' Estimate projection of EIF on fully-observed variables
#'
#' @param obs_grad the estimated (observed) EIF
#' @inheritParams measure_accuracy
#'
#' @return the projection of the EIF onto the fully-observed variables
#' @importFrom stats rnorm
estimate_eif_projection <- function(obs_grad = NULL, C = NULL, Z = NULL,
                                    ipc_fit_type = NULL, ipc_eif_preds = NULL, ...) {
  if (ipc_fit_type != "external") {
    if (length(unique(obs_grad)) == 1) {
      # a constant outcome can cause some learners to fail; add a small amount of noise
      noisy_obs_grad <- stats::rnorm(length(obs_grad), mean = obs_grad, sd = 1e-5)
      obs_grad <- noisy_obs_grad
    }
    ipc_eif_mod <- SuperLearner::SuperLearner(
      Y = obs_grad, X = Z[C == 1, , drop = FALSE], ...
    )
    ipc_eif_preds <- SuperLearner::predict.SuperLearner(
      ipc_eif_mod, newdata = Z, onlySL = TRUE
    )$pred
  }
  ipc_eif_preds
}
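
# Example usage (a minimal sketch; assumes the SuperLearner package is
# installed): project complete-case EIF values onto fully observed covariates.
# Any ipc_fit_type other than "external" triggers the Super Learner fit; the
# data below are simulated for illustration only.
# set.seed(1234)
# z <- data.frame(z1 = rnorm(200))
# cc <- rbinom(200, 1, 0.7)
# eif_cc <- rnorm(sum(cc == 1))
# proj <- estimate_eif_projection(obs_grad = eif_cc, C = cc, Z = z,
#                                 ipc_fit_type = "SL", ipc_eif_preds = NULL,
#                                 SL.library = "SL.glm",
#                                 family = stats::gaussian())
# length(proj)  # one projected value per observation, not just complete cases
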
# ------------------------------------------------------------------------------

#' Create Folds for Cross-Fitting
#'
#' @param y the outcome
#' @param V the number of folds
#' @param stratified should the folds be stratified based on the outcome?
#' @param C a vector indicating whether or not the observation is fully observed;
#'    1 denotes yes, 0 denotes no
#' @param probs vector of proportions for each fold number
#' @return a vector of folds
#' @export
make_folds <- function(y, V = 2, stratified = FALSE,
                       C = NULL,
                       probs = rep(1/V, V)) {
  folds <- vector("numeric", length(y))
  if (length(unique(probs)) == 1) {
    if (stratified) {
      if (length(unique(C)) <= 1) {
        folds_1 <- sample(rep(seq_len(V), length = sum(y == 1)))
        folds_0 <- sample(rep(seq_len(V), length = sum(y == 0)))
        folds[y == 1] <- folds_1
        folds[y == 0] <- folds_0
      } else {
        folds_11 <- sample(rep(seq_len(V), length = sum(y == 1 & C == 1)))
        folds_10 <- sample(rep(seq_len(V), length = sum(y == 0 & C == 1)))
        folds_01 <- sample(rep(seq_len(V), length = sum(y == 1 & C == 0)))
        folds_00 <- sample(rep(seq_len(V), length = sum(y == 0 & C == 0)))
        folds[y == 1 & C == 1] <- folds_11
        folds[y == 0 & C == 1] <- folds_10
        folds[y == 1 & C == 0] <- folds_01
        folds[y == 0 & C == 0] <- folds_00
      }
    } else {
      folds <- sample(rep(seq_len(V), length = length(y)))
    }
  } else {
    if (stratified) {
      if (length(unique(C)) <= 1) {
        folds_1 <- rep(seq_len(V), probs * sum(y == 1))
        folds_1 <- c(folds_1, sample(seq_len(V), size = sum(y == 1) - length(folds_1),
                                     replace = TRUE, prob = probs))
        folds_0 <- rep(seq_len(V), probs * sum(y == 0))
        folds_0 <- c(folds_0, sample(seq_len(V), size = sum(y == 0) - length(folds_0),
                                     replace = TRUE, prob = probs))
        folds_1 <- sample(folds_1)
        folds_0 <- sample(folds_0)
        folds[y == 1] <- folds_1
        folds[y == 0] <- folds_0
      } else {
        folds_11 <- rep(seq_len(V), probs * sum(y == 1 & C == 1))
        folds_10 <- rep(seq_len(V), probs * sum(y == 1 & C == 0))
        folds_01 <- rep(seq_len(V), probs * sum(y == 0 & C == 1))
        folds_00 <- rep(seq_len(V), probs * sum(y == 0 & C == 0))
        folds_11 <- c(folds_11, sample(seq_len(V), size = sum(y == 1 & C == 1) -
                                         length(folds_11),
                                       replace = TRUE, prob = probs))
        folds_01 <- c(folds_01, sample(seq_len(V), size = sum(y == 0 & C == 1) -
                                         length(folds_01),
                                       replace = TRUE, prob = probs))
        folds_10 <- c(folds_10, sample(seq_len(V), size = sum(y == 1 & C == 0) -
                                         length(folds_10),
                                       replace = TRUE, prob = probs))
        folds_00 <- c(folds_00, sample(seq_len(V), size = sum(y == 0 & C == 0) -
                                         length(folds_00),
                                       replace = TRUE, prob = probs))
        folds_11 <- sample(folds_11)
        folds_01 <- sample(folds_01)
        folds_10 <- sample(folds_10)
        folds_00 <- sample(folds_00)
        folds[y == 1 & C == 1] <- folds_11
        folds[y == 1 & C == 0] <- folds_10
        folds[y == 0 & C == 1] <- folds_01
        folds[y == 0 & C == 0] <- folds_00
      }
    } else {
      these_probs <- round(probs * length(y))
      # adjust for rounding so that the fold sizes sum to the total sample size
      size_diff <- sum(these_probs) - length(y)
      if (size_diff != 0) {
        these_probs[which.max(these_probs)] <- these_probs[which.max(these_probs)] - size_diff
      }
      folds <- sample(rep(seq_len(V), these_probs))
    }
  }
  return(folds)
}
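
# Example usage (a minimal sketch): stratified folds for a binary outcome.
# set.seed(1234)
# y <- rbinom(100, 1, 0.3)
# folds <- make_folds(y, V = 5, stratified = TRUE)
# table(folds, y)  # cases and controls are spread roughly evenly across folds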

#' Turn folds from 2K-fold cross-fitting into individual K-fold folds
#'
#' @param cross_fitting_folds the vector of cross-fitting folds
#' @param sample_splitting_folds the sample splitting folds
#' @param C vector of whether or not we measured the observation in phase 2
#'
#' @return a list containing the K-fold cross-fitting folds for the "full" and
#'   "reduced" halves of the data, along with an observation-level vector of
#'   sample-splitting fold assignments
#' @export
make_kfold <- function(cross_fitting_folds,
                       sample_splitting_folds = rep(1, length(unique(cross_fitting_folds))),
                       C = rep(1, length(cross_fitting_folds))) {
  # get the folds for the full and reduced nuisance functions
  full_folds <- which(sample_splitting_folds == 1)
  redu_folds <- which(sample_splitting_folds == 2)
  sample_splitting_vec <- vector("numeric", length = length(cross_fitting_folds))
  sample_splitting_vec[cross_fitting_folds %in% full_folds] <- 1
  sample_splitting_vec[cross_fitting_folds %in% redu_folds] <- 2
  # create K-fold folds, i.e., 1:K for each
  full_cf_folds <- cross_fitting_folds[cross_fitting_folds %in% full_folds]
  redu_cf_folds <- cross_fitting_folds[cross_fitting_folds %in% redu_folds]
  unique_full <- sort(unique(full_cf_folds))
  unique_redu <- sort(unique(redu_cf_folds))
  K <- length(unique_full)
  folds_k <- seq_len(K)
  k_fold_full <- full_cf_folds
  k_fold_redu <- redu_cf_folds
  for (v in seq_len(K)) {
    k_fold_full <- replace(k_fold_full, full_cf_folds == unique_full[v], folds_k[v])
    k_fold_redu <- replace(k_fold_redu, redu_cf_folds == unique_redu[v], folds_k[v])
  }
  # return a list; the first two elements are the relabeled cross-fitting folds
  # for the full and reduced halves, and the last element records each
  # observation's sample-splitting assignment
  list(full = k_fold_full, reduced = k_fold_redu,
       sample_splitting_folds = sample_splitting_vec)
}
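
# Example usage (a minimal sketch): relabel 2K = 4 cross-fitting folds as two
# separate K = 2-fold sets, one for the "full" and one for the "reduced" half.
# set.seed(1234)
# cf_folds <- make_folds(y = rnorm(100), V = 4)
# k_folds <- make_kfold(cf_folds, sample_splitting_folds = c(1, 2, 1, 2))
# table(k_folds$full)  # folds within the "full" half, relabeled 1:2
# table(k_folds$sample_splitting_folds)  # observation-level split assignment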

#' Return test-set only data
#'
#' @param arg_lst a list of estimates, data, etc.
#' @param k the index of interest
#'
#' @return the test-set only data
#' @export
get_test_set <- function(arg_lst, k) {
  folds <- arg_lst$cross_fitting_folds
  folds_Z <- arg_lst$folds_Z
  test_lst <- arg_lst
  test_lst$fitted_values <- arg_lst$fitted_values[folds == k]
  test_lst$y <- arg_lst$y[folds == k]
  test_lst$C <- arg_lst$C[folds_Z == k]
  test_lst$Z <- arg_lst$Z[folds_Z == k, , drop = FALSE]
  test_lst$ipc_weights <- arg_lst$ipc_weights[folds_Z == k]
  test_lst$ipc_eif_preds <- arg_lst$ipc_eif_preds[folds_Z == k]
  test_lst$nuisance_estimators <- lapply(arg_lst$nuisance_estimators, function(l) {
    l[folds == k]
  })
  test_lst$a <- arg_lst$a[folds == k]
  return(test_lst)
}
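
# Example usage (a minimal sketch): subset a list of estimates and data to
# cross-fitting fold k. All values below are placeholders, for illustration.
# set.seed(1234)
# arg_lst <- list(cross_fitting_folds = rep(1:2, 5), folds_Z = rep(1:2, 5),
#                 fitted_values = rnorm(10), y = rnorm(10), C = rep(1, 10),
#                 Z = data.frame(z1 = rnorm(10)), ipc_weights = rep(1, 10),
#                 ipc_eif_preds = rep(0, 10),
#                 nuisance_estimators = list(f_n = rbinom(10, 1, 0.5)),
#                 a = rbinom(10, 1, 0.5))
# test_lst <- get_test_set(arg_lst, k = 1)
# length(test_lst$y)  # 5 observations, those with fold label 1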

# For sp_vim -------------------------------------------------------------------
#' Run a Super Learner for the provided subset of features
#'
#' @param Y the outcome
#' @param X the covariates
#' @param V the number of folds
#' @param SL.library the library of candidate learners
#' @param univariate_SL.library the library of candidate learners for
#'          single-covariate regressions
#' @param s the subset of interest
#' @param cv_folds the CV folds
#' @param sample_splitting logical; should we use sample-splitting for
#'   predictiveness estimation?
#' @param ss_folds the sample-splitting folds; only used if
#'   \code{sample_splitting = TRUE}
#' @param split the split to use for sample-splitting; only used if
#'   \code{sample_splitting = TRUE}
#' @param verbose should we print progress? defaults to FALSE
#' @param progress_bar the progress bar to print to (only if verbose = TRUE)
#' @param indx the index to pass to progress bar (only if verbose = TRUE)
#' @param weights weights to pass to estimation procedure
#' @param cross_fitted_se if \code{TRUE}, uses a cross-fitted estimator of
#'   the standard error; otherwise, uses the entire dataset
#' @param full should this be considered a "full" or "reduced" regression?
#'   If \code{NULL} (the default), this is determined automatically; a full
#'   regression corresponds to \code{s} being equal to the full covariate vector.
#'   For SPVIMs, can be entered manually.
#' @param vector should we return a vector (\code{TRUE}) or a list (\code{FALSE})?
#' @param ... other arguments to Super Learner
#'
#' @return a list containing the fitted Super Learner (or single-learner) object,
#'   the cross-fitted predictions on the held-out data, the cross-fitting and
#'   sample-splitting folds, and (if \code{cross_fitted_se = FALSE}) the
#'   non-cross-fitted fit and predictions used for standard error estimation
#' @export
run_sl <- function(Y = NULL, X = NULL, V = 5, SL.library = "SL.glm",
                   univariate_SL.library = NULL, s = 1, cv_folds = NULL,
                   sample_splitting = TRUE, ss_folds = NULL, split = 1, verbose = FALSE,
                   progress_bar = NULL, indx = 1, weights = rep(1, nrow(X)),
                   cross_fitted_se = TRUE, full = NULL, vector = TRUE, ...) {
  # if verbose, print what we're doing and make sure that SL is verbose;
  # set up the argument list for the Super Learner / CV.SuperLearner
  arg_lst <- list(...)
  if (is.null(arg_lst$family)) {
    arg_lst$family <- switch(
      (length(unique(Y)) == 2) + 1, stats::gaussian(), stats::binomial()
    )
  }
  if (is.character(arg_lst$family)) {
    family <- get(arg_lst$family, mode = "function", envir = parent.frame())
    arg_lst$family <- family()
  }
  if ((arg_lst$family$family == "binomial") & (length(unique(Y)) > 2)) {
    arg_lst$family <- stats::gaussian()
  }
  if (verbose) {
    if (is.null(arg_lst$cvControl)) {
      arg_lst$cvControl <- list(verbose = TRUE)
    } else {
      arg_lst$cvControl$verbose <- TRUE
    }
  }
  arg_lst_bool <- is.null(arg_lst$cvControl) || is.null(arg_lst$cvControl$V) ||
    arg_lst$cvControl$V != V
  if (arg_lst_bool) {
    if (V > 1) {
      inner_V <- if (is.null(arg_lst$cvControl$V)) 5 else arg_lst$cvControl$V
      inner_stratify <- if (is.null(arg_lst$cvControl$stratifyCV)) FALSE else arg_lst$cvControl$stratifyCV
      arg_lst$innerCvControl <- list(list(V = inner_V, stratifyCV = inner_stratify))
    }
    arg_lst$cvControl$V <- ifelse(V == 1, 5, V)
  }
  if (is.null(arg_lst$obsWeights)) {
    arg_lst$obsWeights <- weights
  }
  arg_lst_cv <- arg_lst
  # fit the super learner for a given set of variables
  red_X <- as.data.frame(X[, s, drop = FALSE])
  if (is.null(cv_folds)) {
    cv_folds <- make_folds(Y, V = V, stratified = (length(unique(Y)) == 2))
  }
  cf_folds_lst <- lapply(as.list(seq_len(V)), function(v) {
    which(cv_folds == v)
  })
  if (V > 1) {
    arg_lst_cv$cvControl$validRows <- cf_folds_lst
  }
  this_sl_lib <- SL.library
  # if univariate regression (i.e., length(s) == 1) then check univariate_SL.library
  # if it exists, use it; otherwise, use the normal library
  if (length(s) == 1) {
    if (!is.null(univariate_SL.library)) {
      this_sl_lib <- univariate_SL.library
    }
    requires_2d <- c("glmnet", "polymars")
    for (i in 1:length(requires_2d)) {
      if (any(grepl(requires_2d[i], this_sl_lib)) & (ncol(red_X) == 1)) {
        red_X <- cbind.data.frame(V0 = 0, red_X)
      }
    }
  }
  full_arg_lst_cv <- c(arg_lst_cv, list(
    Y = Y, X = red_X, SL.library = this_sl_lib
  ))
  # if dim(red_X) == 0, then return the mean
  if (ncol(red_X) == 0) {
    this_sl_lib <- eval(parse(text = "SL.mean"))
  }
  # if a single learner, don't do inner CV
  if ((length(this_sl_lib) == 1) & !is.list(this_sl_lib)) {
    if (is.character(this_sl_lib)) {
      this_sl_lib <- eval(parse(text = this_sl_lib))
    }
    preds <- list()
    preds_vector <- rep(NA, length = length(Y))
    if (V == 1) {
      fit <- NA
      preds <- NA
      preds_vector <- NA
    } else {
      for (v in seq_len(V)) {
        train_v <- (cv_folds != v)
        test_v <- (cv_folds == v)
        fit <- this_sl_lib(Y = Y[train_v, , drop = FALSE],
                           X = red_X[train_v, , drop = FALSE],
                           newX = red_X[test_v, , drop = FALSE],
                           family = full_arg_lst_cv$family,
                           obsWeights = full_arg_lst_cv$obsWeights[train_v])
        preds[[v]] <- fit$pred
        preds_vector[test_v] <- fit$pred
      }
      preds_vector <- preds_vector[!is.na(preds_vector)]
    }
    if (vector) {
      preds <- preds_vector
    }
  } else if (V == 1) {
    # once again, don't do anything; will fit at end
    fit <- NA
    preds <- NA
    preds_vector <- NA
  } else {
    # fit a cross-validated Super Learner
    fit <- suppressWarnings(do.call(SuperLearner::CV.SuperLearner, full_arg_lst_cv))
    # extract predictions on the correct sample-split folds
    if (is.null(full)) {
      all_equal_s <- all.equal(s, 1:ncol(X))
      is_full <- switch((sample_splitting) + 1, TRUE,
                        !is.character(all_equal_s) & as.logical(all_equal_s))
    } else {
      is_full <- full
    }
    preds <- fit$SL.predict
  }
  # if cross_fitted_se, we're done; otherwise, re-fit to the entire dataset
  if (!cross_fitted_se) {
    # refit to the entire dataset
    bool <- switch(as.numeric(sample_splitting) + 1, rep(TRUE, length(Y)), ss_folds == split)
    if (!is.character(this_sl_lib)) {
      fit_se <- this_sl_lib(Y = Y[bool, ],
                            X = red_X[bool, , drop = FALSE],
                            newX = red_X[bool, , drop = FALSE],
                            family = arg_lst$family,
                            obsWeights = arg_lst$obsWeights[bool])
      preds_se <- fit_se$pred
      if (all(is.na(preds))) {
        preds <- preds_se
        fit <- fit_se
      }
    } else {
      arg_lst$obsWeights <- weights[bool]
      if (any(grepl("parallel", names(arg_lst)))) {
        # remove it for SL calls
        arg_lst$parallel <- NULL
      }
      # drop innerCvControl (a CV.SuperLearner-only argument) safely, even if absent
      arg_lst_se <- arg_lst[names(arg_lst) != "innerCvControl"]
      fit_se <- do.call(
        SuperLearner::SuperLearner,
        args = c(arg_lst_se, list(
          Y = Y[bool, ], X = red_X[bool, , drop = FALSE],
          SL.library = this_sl_lib
        ))
      )
      preds_se <- fit_se$SL.predict
      if (all(is.na(preds))) {
        fit <- fit_se
        preds <- preds_se
      }
    }
  } else {
    fit_se <- NA
    preds_se <- NA
  }
  if (verbose) {
    setTxtProgressBar(progress_bar, indx)
  }
  return(list(fit = fit, preds = preds, ss_folds = ss_folds,
              cv_folds = cv_folds, fit_non_cf_se = fit_se, preds_non_cf_se = preds_se))
}
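
# Example usage (a minimal sketch; assumes library(SuperLearner) is attached):
# cross-fitted predictions for the covariate subset s = 1:2, using a single
# learner so that no inner cross-validation is run. Simulated data only.
# library(SuperLearner)
# set.seed(1234)
# x <- data.frame(x1 = rnorm(100), x2 = rnorm(100))
# y <- 1 + 0.5 * x$x1 + rnorm(100)
# sl_fit <- run_sl(Y = as.matrix(y), X = x, V = 2, SL.library = "SL.glm",
#                  s = 1:2, sample_splitting = FALSE, cross_fitted_se = TRUE)
# head(sl_fit$preds)  # cross-fitted predictions, one per observation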

# estimate nuisance functions (for average value) ------------------------------
#' Estimate nuisance functions for average value-based VIMs
#'
#' @inheritParams cv_vim
#' @inheritParams run_sl
#' @param fit the fitted nuisance function estimator
#' @param split the sample split to use
#'
#' @return nuisance function estimators for use in the average value VIM:
#'  the treatment assignment based on the estimated optimal rule
#'  (based on the estimated outcome regression); the expected outcome under the
#'  estimated optimal rule; and the estimated propensity score.
#'
#' @export
estimate_nuisances <- function(fit, X, exposure_name, V = 1, SL.library, sample_splitting,
                               sample_splitting_folds, verbose, weights, cross_fitted_se,
                               split = 1, ...) {
  A <- X %>% dplyr::pull(!!exposure_name)
  W <- X %>% dplyr::select(-!!exposure_name)
  # estimate the optimal rule
  A_1 <- cbind.data.frame(A = 1, W)
  A_0 <- cbind.data.frame(A = 0, W)
  names(A_1)[1] <- exposure_name
  names(A_0)[1] <- exposure_name
  f_n <- as.numeric(stats::predict(fit, newdata = A_1)$pred > stats::predict(fit, newdata = A_0)$pred)
  # estimate the propensity score
  g_n <- run_sl(Y = f_n, X = W, V = V, SL.library = SL.library,
                s = 1:ncol(W), sample_splitting = sample_splitting,
                ss_folds = sample_splitting_folds, split = split, verbose = verbose,
                weights = weights, cross_fitted_se = cross_fitted_se, ...)
  # set up data based on f_n, f_n_reduced
  A_f <- cbind.data.frame(A = f_n, W)
  names(A_f)[1] <- exposure_name
  nuisance_estimators <- list(f_n = f_n, q_n = stats::predict(fit, newdata = A_f)$pred,
                              g_n = stats::predict(g_n, A_f)$pred)
  return(nuisance_estimators)
}

# -------------------------------------
# release questions
# -------------------------------------
# @keywords internal
release_questions <- function() {
  c(
    "Have you run cran_prep <- rhub::check_for_cran(env_vars = c(R_COMPILE_AND_INSTALL_PACKAGES = 'always'), show_status = FALSE)?",
    "Have you run devtools::check_win_devel() and devtools::check_win_release()?"
  )
}