corxplor: Correlation Exploration

Documented in cor_boot

#' Bootstrapped Confidence Intervals for Correlations
#'
#' Calculates the bias-corrected and accelerated (BCa) bootstrap confidence
#' interval for correlation coefficients.
#'
#' @seealso \code{\link[boot]{boot.ci}}, \code{\link[stats]{cor}},
#'  \code{\link{cor_list}},
#'
#' @param x a numeric matrix or data frame.
#'
#' @param y NULL (default) or a matrix or data frame with compatible dimensions
#'  to x. The default is equivalent to y = x (but more efficient).
#'
#' @param use an optional character string giving a method for handling missing
#'  values. This must be (an abbreviation of) one of the strings "everything",
#'  "all.obs", "complete.obs", or "pairwise.complete.obs"
#'  (default). See \code{\link[stats]{cor}} for explanation of these options.
#'
#' @param method a character string indicating which correlation coefficient is
#'  to be computed. One of "pearson" (default), "kendall", or "spearman": can be
#'  abbreviated.
#'
#' @param n_rep The number of bootstrap replicates.
#'
#' @param conf A scalar or vector containing the confidence level(s) of the
#' required interval(s). 95\% intervals used by default.
#'
#' @param seed Integer used to seed the random number generator.
#'
#' @param n_cores Integer indicating number of processes to use in parallel
#' processing. Set to 1 fewer than the number of available CPUs by default.
#'
#' @return Object of class "cor_boot", a \code{\link{cor_list}} object with the
#'  following additional vectors:
#' \describe{
#'  \item{lower}{lower endpoint of the confidence interval}
#'  \item{upper}{upper endpoint of the confidence interval}
#' }
#' and the attribute \code{"CI"} indicating the size of the confidence interval being
#' returned.
#' @examples
#' # Correlations with 95% CI for the numeric variables from the iris data set
#' cor_boot(iris[,-5], n_rep = 1000)
#'
#' # with 99% CI
#' cor_boot(iris[,-5], n_rep = 1000, conf = 0.99)
#'
#' @export

cor_boot <- function(x, y = NULL, use = "pairwise", method = "pearson",
                     n_rep = 10000, conf = 0.95, seed = 42, n_cores = NULL,
                     ...){
  # construct cor_list. do this first so that cor_list can check that `x` and
  # `y` have named columns
  output <- cor_list(x, y, use, method)
  # do not proceed if cor_list returned empty object
  if(length(output$x) == 0){
    warning("There are no complete cases. Bootstrapping not performed.")
    return(output)
  }
  # calculation of BCa confidence intervals requires n_rep >= sample size
  # in addition, require at least 1000 replicates
  min_rep <- nrow(x); if(min_rep < 1000) min_rep <- 1000
  if(n_rep < min_rep){
    if(min_rep > 1000){
      message(paste("Calculation of BCa intervals requires the number of",
                    "\nbootstrap replicates to equal or exceed sample size."))
    } else {
      message(paste("At least 1000 bootstrap replicates recommended for",
                    "\nthe accurate calculation of BCa confidence intervals."))
    }
    message(paste("\n`n_rep` has been automatically increased from ", n_rep,
                  " to ", min_rep, ".\n", sep = ""))
    n_rep <- min_rep
  }
  # identify which columns in `data` belong to `x`; this is needed to work
  # around the requirement by `boot` that data be passed as a single argument,
  # enabling `boot_cor` to restore `data` to separate `x` & `y` args.
  x_cols <- 1:ncol(x)
  # use tibble data structure to prevent possible coercion to a vector when
  # .boot_cor dereferences data back into x and y parameters
  data <- tibble::as_tibble(x)
  # if y exists, merge with x so that it can be passed to boot as single frame
  if(!is.null(y)) data <- dplyr::bind_cols(data, y)
  # `statistic` function passed to `boot` returns just the coefficient vector
  # from the cor_list object
  .boot_cor <- function(data, rows, ..., x_cols){
    x <- data[rows, x_cols]
    y <- data[rows, -x_cols]; if(ncol(y) == 0) y <- NULL
    suppressWarnings(cor_list(x, y, ...)$coef)
  }
  # ============================================================================
  # parallel bootstrap
  # ----------------------------------------------------------------------------
  if(is.null(n_cores)) n_cores <- parallel::detectCores(logical = FALSE) - 1L
  cl <- parallel::makeCluster(n_cores)
  parallel::clusterExport(cl, "cor_list")
  set.seed(seed)
  boot_out <- boot::boot(data = data,
                         statistic = .boot_cor,
                         R = n_rep,
                         x_cols = x_cols,
                         use = use,
                         method = method,
                         parallel = "snow",
                         ncpus = n_cores,
                         cl = cl)
  parallel::stopCluster(cl)
  # ============================================================================

  # ============================================================================
  # obtain confidence intervals
  # ----------------------------------------------------------------------------
  .get_ci <- function(i){
    boot_ci <-  boot::boot.ci(boot_out, conf = conf, type = "bca", index = i)
    boot_ci$bca[4:5]
  }
  ci <- lapply(seq_along(boot_out$t0), .get_ci)
  # ============================================================================

  # join confidence intervals to cor_list output
  output$lower <- sapply(ci, function(x) x[1])
  output$upper <- sapply(ci, function(x) x[2])
  structure(output, class = c("cor_boot", "cor_list"), CI = conf)
}