ggboot: Bootstrap univariate and multivariate plots

Documented in calcBCa calcBCaA calcBCaZ0 calcCITbl calcP calcPSub calcPTbl

#' Calculate BCa
#'
#' @param bootObj object of class "boot". Object generated by \code{boot::boot} function. Optional
#' if \code{bookPkgCI} is TRUE or if \code{bootStat} is \code{NULL}. If \code{bookPkgCI} is TRUE
#' and \code{bootStat} is not \code{NULL}, then \code{bootObj} is used to calculate the confidence
#' interval, rather than \code{bootStat}.
#' @param bootPkgCI logical. If TRUE, then only functions from the \code{boot} package are used
#' to compute the confidence interval.
#' @param calcStat function. Function of a single variable to calculate for each sample.
#' @param origData numeric vector. Original sample. Required if \code{origStat}, \code{bootStat} or
#' \code{bcaA} (if \code{type} == "bc" ) not given.
#' @param bootStat numeric vector. Bootstrap samples. Optional if \code{bookPkgCI} is TRUE. Will have no effect if
#' \code{bootPkgCI} is \code{TRUE} and  \code{bootObj} is not \code{NULL}.
#' @param seed numeric vector. If specified, then \code{seed(seed)} is called before
#' generating a bootstrap sample.
#' @param type 'bca' or 'bc'. Whether to return BCa or bias-corrected intervals. If type == "bc",
#' it sets \code{bcaA}, overriding \code{bcaA} as specified in the \code{calcBCa} arguments.
#' @param origStat numeric. Original sample estimate, i.e. value of \code{calcStat}
#' applied to \code{origData}.
#' @param bcaA numeric. Acceleration constant.
#' @param alpha numeric. Dependent on \code{ci}, either 1-\code{alpha} is the coverage of the
#' confidence interval or \code{alpha} specifies the quantile.
#' @param ci logical. If \code{TRUE}, then \code{alpha} specifies the width of the confidence interval,
#' and an upper and lower bound are returned. If \code{FALSE}, then \code{alpha} specifies the quantile,
#' and a single quantile is returned.
#' @param B numeric. Number of bootstrap samples to generate, if required.
#' @param ... arguments. Passed on to \code{boot::boot}, if run.
#' @return If \code{ci} is \code{TRUE}, then a numeric vector of length two is returned with the
#' upper and lower bound of the 1-\code{alpha} confidence interval. If \code{ci} is \code{FALSE}, then
#' a single numeric is returned of the corresponding quantile.
#' @examples calcBCa( origData = c( 0.096888, 0.115579, 0.13671, 0.086497, 0.5248, 1.676752, 0.46717, 0.360975, 0.118718 ),
#' calcStat = mean, bootPkgCI = TRUE )

calcBCa = function( bootObj = NULL, bootStat = NULL, origStat = NULL, origData = NULL,
                    bcaA = NULL, alpha = 0.05, ci = TRUE, type = 'bca',
                    calcStat = NULL, B = 2e3, bootPkgCI = FALSE, seed = NULL, ...){


  # FLOW

  # Input checks
  # - if bootPkgCI is TRUE and bootObj not given
  # -- origData and calcStat must be given
  # - if bootPkgCI is FALSE
  # -- If at least one of the following are not given:
  # --- bootStat, bcaA ( if type != "bc" ), origStat
  # --- then origData and calcStat must be given

  # Use boot package for CIs
  # - Only if bootPkgCI == TRUE
  # - if seed given
  # -- set seed.
  # -- If is.null( bootObj ) & is.null( bootStat )
  # --- Calculate bootObj
  # -- If !is.null( bootObj )
  # --- Calculate ci using boot.ci
  # --- Return ci
  # -- At this point, the function would have quit unless bootStat was supplied and
  # -- bootObj was not (bootObj therefore takes preference).
  # --- Calculate L
  # ---- This is the difference between the jackknife estimates and the jackknife mean
  # ---- for each jackknife estimate.
  # --- If origStat not provided
  # ---- Calculate it.
  # --- Calculate bca confidence interval
  # ---- Use boot:::bca.ci function, and
  # ---- L and origStat as calculated before.
  # --- Return bca confidence interval.

  # Use boot package for CIs only
  # - if bootPkgCI = TRUE and bootStat not NULL, then boot:::bca.ci is applied to bootStat to
  # calculate BCa confidence intervals.
  # origStat is first calculated if not initially supplied.

  # Remaing missing objects
  # - If bootStat is NULL, it is calculated.
  # - If type == "bca" and bcaA is NULL, then bcaA is calculated here.
  # - If type == "bc", then bcaA is set to 0.

  # Remaining objects
  # - Calculate z0
  # - Calculate za
  # -- ci
  # --- If TRUE, then
  # ---- the original alpha is taken to mean that a 1-alpha
  # ---- confidence interval is required.
  # --- If FALSE, then
  # ---- the original alpha is taken to mean that the alpha-th quantile
  # ---- is required.

  # Calculated adjusted alpha
  # - Calculate the adjusted alpha

  # Return bootstrap quantile(s)
  # - Return the quantile(s) in the bootstrap distribution
  # - corresponding to the (adjusted) alpha(s).

  if( bootPkgCI & is.null( bootObj ) & or( is.null( origData ), is.null( calcStat ) ) ){
    stop( "If bootPkgCI is TRUE and bootObj not given, then origData and calcStat are required." ) }


  if( !bootPkgCI &
      ( is.null( origStat ) %>%
        or( is.null( bootStat ) ) %>%
        or( and( is.null( bcaA ), type != "bc" ) ) ) &
      or( is.null( calcStat ), is.null( origData ) ) ){
    stop( "If bootPkgCI is FALSE, then if bcaA (if type is not 'bc' ), origStat or bootStat are not given, origData and calcStat are required." )
  }

  # Use boot package for confidence intervals
  if( bootPkgCI ){

    if( !is.null( seed ) ) set.seed( seed )
    if( is.null( bootObj ) & is.null( bootStat ) ) bootObj = boot::boot( origData,
                                                                         function(x,w) calcStat(x[w]),
                                                                         R = B,... )

    if( !is.null( bootObj ) ) return( boot::boot.ci( bootObj, type = "bca" )$bca[4:5] )

    L = boot::empinf( data = origData, statistic = function(x,w) calcStat(x[w]),
                      type = "jack", stype = "i" )
    if( is.null( origStat ) ) origStat = calcStat( origData )
    return( boot:::bca.ci( boot.out = NULL, conf = 1-alpha, t0 = origStat,
                           t = bootStat, L = L )[4:5] )
  }

  # Intermediate objects
  if( is.null( origStat ) ) origStat = calcStat( origData )
  if( !is.null( seed ) ) set.seed( seed )
  if( is.null( bootStat ) ) bootStat = boot::boot( origData, function(x,w) calcStat(x[w]), R = B, ...)$t
  if( is.null( bcaA ) & type == "bca" ) bcaA = calcBCaA( origData, calcStat)
  if( type == "bc" ) bcaA = 0
  if( !is.finite( bcaA ) ){
    message( "Estimated variance adjustment, a, is infinite. Setting a to 0.")
    bcaA = 0
  }
  z0 = calcBCaZ0( bootStat, origStat )
  if( !is.finite( z0 ) ){
    message( "Estimated bias adjustment, z0, is infinite. Setting z0 and a to 0." )
    bcaA = 0
    z0 = 0
  }
  if( ci ) za = qnorm( c( alpha/2, 1-alpha/2 ) )
  if( !ci ) za = qnorm( alpha )

  # Adjusted alphas
  num = z0 + za
  den = 1 - bcaA * num
  term2 = num/den
  adjAlpha = pnorm ( z0 + term2 )
  setNames( boot:::norm.inter( bootStat, adjAlpha )[,2], NULL)

  # Return bootstrap quantile(s)
  #setNames( matrixStats::colQuantiles( matrix( bootStat, ncol = 1 ),
  #  probs = adjAlpha ), NULL )
}

#' Calculate BCa acceleration factor
#'
#' Returns the estimated acceleration factor for the BCa confidence interval
#' in the one-sample non-parametric case, using the standard jack-knife.
#'
#' @param origData numeric vector. Original sample.
#' @param calcStat function. A numeric function that takes a single numeric vector as
#' an argument and returns a numeric vector of length 1 as output.
#' @return A numeric vector of length one that is the estimate of the acceleration factor.
#' @examples calcBCaA( c( 0.096888, 0.115579, 0.13671, 0.086497, 0.5248,
#' 1.676752, 0.46717, 0.360975, 0.118718 ), calcStat = mean)
#' @export
calcBCaA = function( origData, calcStat ){
  L = boot::empinf( data = origData, statistic = function(x,w) calcStat(x[w]),
                    type = "jack", stype = "i" )
  setNames( sum(L^3)/(6 * sum(L^2)^1.5), NULL )
}

#' Calculate BCa bias quantile
#'
#' Returns the estimated bias quantile for the BCa confidence interval.
#'
#' @param bootStat numeric vector. Bootstrap sample statistics.
#' @param origStat numeric. Original sample statistic.
#' @return A numeric vector of length one that is the estimate of the bias quantile.
#' @export
#' @examples
#' respVec = c( 0.096888, 0.115579, 0.13671, 0.086497, 0.5248,
#' 1.676752, 0.46717, 0.360975, 0.118718 )
#' origStat = mean(respVec)
#' bootVec = boot::boot( respVec, function(x,w) mean(x[w]), R = 2e3 )$t
#' calcBCaZ0( bootVec, origStat )
calcBCaZ0 = function( bootStat, origStat ){
  qnorm( sum( bootStat < origStat ) / length( bootStat ), lower.tail = TRUE )
}


#' calculate confidence intervals for multiple groups
#'
#' Calculates confidence intervals using one of three methods (BCa, BC or percentile),
#' as well as the bias factor ($z_0$) and the acceleration factor ($a$).
#'
#' @param bootData dataframe. Must have character column split, defining
#' groups, and bootStat, giving \code{B} bootstrap sample estimates for each
#' group in \code{split} of statistic for which confidence interval is required. If
#' \code{method=='percT'}, then
#' @param alpha numeric \eqn{\;\in(0,1)}. (1-alpha) is the desired coverage
#' for the confidence interval.
#' @param method 'bca', 'bc' or 'perc'. The confidence interval method.
#' @param diff logical. If \code{TRUE}, then the acceleration factor, $a$, is
#' forced to zero, even when \code{method=='bca'}. This is because the acceleration factor
#' calculated is invalid in the unpaired two-sample case. The reported acceleration factor
#' is set to \code{NA}, for the same reason. If \code{diff==TRUE}, then the acceleration factor
#' is only zero if \code{method=="bc"}, and the acceleration factor is reported as is.
#' @param bcaAVec numeric vector. Numeric vector of . Required only if bcaAVec
#' @param createCluster logical. If \code{TRUE}, then the code to start a cluster is run.
#' @return A tibble with columns named split, lb and ub.
#' @examples
#' # data prep
#' respVec = c( 0.060426, 0.066152, 0.07093, 0.056687, 0.18716, 0.790952, 0.20803, 0.245396, 0.087918,
#' 0.02499226, 0.2234444308, 0.0148025445, 0.2425358694, 0.025742255, 0.011875387, 0.0148772707,
#' 0.0086363158, 0.014349707, 0.0087206469, 0.055159961, 0.0366530097, 0.000260970000000001, 0.0111716045, 0.011851586,
#' 0.0109080046, 0.002077461, 0.011693718, 0.0182342151, 0.0031248769, 0.0067057876, 0.0172261736,
#' 0.0491776258, 0.026822441, 0.0062511869, 0.0135163775, 0.003760361, 0.0196274421, 0.004280863, 0.0143105389,
#' 0.0150681214, 0.0063319547, 0.0087206469, 0.000260970000000001, 0.0111716045, 0.002077461, 0.011693718, 0.0148772707,
#' 0.055159961, 0.0366530097, 0.011851586, 0.0109080046, 0.0182342151, 0.0172261736, 0.026822441, 0.0135163775 )
#' splitVec = c( rep( "1", 9), rep( "2", 46 ) )
#' calcStat = function(x) mean( x, trim = 0.2 )
#'calcStatVec = calcStat
#' calcBootStatVec = function(x,w) calcStatVec(x[w])
#'
# table of statistics on original sample
#' origStatTbl = plyr::ldply( split( respVec, splitVec ),
#'                        calcStatVec ) %>%
#'  rename( split = .id, origStat = V1 )
#'
#' origStatVec = setNames( origStatTbl$origStat, origStatTbl$split )
#'
# acceleration factor
#' bcaATbl = plyr::ldply( split( respVec, splitVec ), function(x) calcBCaA( x, calcStat ) ) %>%
#'  rename( split = .id, bcaA = V1 )
#'
#' bcaAVec = setNames( bcaATbl$bcaA, bcaATbl$split )
#'
# bootstrap sample statistics
#' set.seed(1)
#' bootTbl = plyr::ldply( split( respVec, splitVec ),
#'                       function(x) boot::boot( x, calcBootStatVec, 1e2 )$t ) %>%
#'  rename( split = .id, bootStat = `1` ) %>%
#'  mutate( origStat = origStatVec[ split ] ) %>%
#'  as_tibble()
#'
#'  # actual function
#'  calcCITbl( bootData = bootTbl, alpha = 0.05, method = "bca",
#' diff = FALSE, bcaAVec = bcaAVec )

calcCITbl = function( bootData, alpha, method, diff,
                      bcaAVec, createCluster = FALSE ){

  # FLOW
  #
  # Input checks
  # - Purpose:
  # -- Ensures that bcaVec is supplied if BCa calculation is requested.
  # -- Sets bcaAvec to named NA vec if bcaVec was NULL and BC
  # -- or perc calculation requested.
  #
  # Parallel I
  # - Purpose:
  # -- This is useful, as sometimes you have already initiated a cluster,
  # -- and so don't want another initialised to save time/avoid errors
  # - Code:
  # -- If createCluster is TRUE
  # --- initates a cluster.
  #
  # BCa vec
  # - Purpose:
  # -- Creates two vectors, reportingBCaVec and calculationBCaVec, which
  # -- are reported in the final outputted tibble or used in the
  # -- BCa or BC calculation, respectively.
  # -- This is done because if diff is TRUE, then the acceleration factor
  # -- should be NA, but if diff is FALSE, then the acceleration factor
  # -- is correct and can be reported.
  # -- It is also useful because if diff is TRUE or the calculation method
  # -- is BC, then the acceleration factor used in the BCa calculation should
  # -- be zero.
  # - Code:
  # -- if diff
  # --- set reportingBCaVec and calculationBCaVec to NA and 0, respectively,
  # --- with element names set to the unique names in the split.
  # -- if not diff
  # --- reportingBCaAvec is as given
  # --- calculationBCaVec is as given if method is BCa, and 0 if BC. Not needed
  # --- for perc method.
  #
  #
  # BC(a) calculation
  # - Purpose:
  # -- Performs a BCa calculation for each split, using the relevant provided
  # -- bootstrap samples (bootStat), acceleleration factors (calculationBCaAVec)
  # -- alpha and original sample statistics (origStat).
  # - Code:
  # -- ciVec
  # --- Uses calcBCa to calculate confidence interval.
  # -- Calculates z0CI using calcBCaZ0
  # -- Sets aCI to reportingBCaAVec.
  #
  # Perc calculation
  # - Purpose:
  # -- For each level in split vector, calculates percentile confidence interval
  # -- and bias factor, and returns confidence interval, bias factor and
  # -- reportingBCaAVec accleration factor.
  # - Code:
  # -- Uses matrixStats::colQuantiles function for quantiles, and
  # -- calcBCaZ0 for bias factor.
  #
  # Parallel II
  # - Purpose
  # -- If a cluster was initiated by this function, it is stopped now.
  #
  # Return
  # - Purpose
  # -- Returns outTbl

  # Input checks
  if( is.null( bcaAVec ) & diff!= FALSE ){
    if( method == "bca" ) stop( "Method == 'bca' required non-null bcaVec.")
    if( method %in% c( "perc", "bc" ) ) bcaAVec = setNames( rep( NA, calcLU( bootData$split ) ),
                                                            unique( bootData$split ) )
  }
  # Parallel
  if( createCluster ){
    cl1 = parallel::makeCluster(parallel::detectCores()-1)
    doParallel::registerDoParallel(cl1)
  }

  # BCa vec
  if( diff ){
    reportingBCaAVec =  setNames( rep( NA, calcLU( bootData$split ) ),
                                  unique( bootData$split ) )
    calculationBCaAVec = setNames( rep( 0, calcLU( bootData$split ) ),
                                   unique( bootData$split ) )
  } else{
    reportingBCaAVec = bcaAVec
    if( method == "bc" ) calculationBCaAVec = setNames( rep( 0, calcLU( bootData$split ) ),
                                                        unique( bootData$split ) )
    if( method == "bca" ) calculationBCaAVec = bcaAVec
  }

  # BC(a) calculation
  if( str_detect( method, "bc" ) ){

    outTbl = ldply( split( bootData, bootData$split ), function(x){
      ciVec = calcBCa( bootStat = x$bootStat,
                       origStat = x$origStat[1],
                       bcaA = calculationBCaAVec[x$split[1]],
                       alpha = alpha )
      tibble( lb = ciVec[1], ub = ciVec[2],
              z0CI = calcBCaZ0( x$bootStat, x$origStat[1]),
              aCI = reportingBCaAVec[x$split[1]])

    }) %>%
      rename( split = .id ) %>%
      as_tibble()
  }

  # Perc calculation
  if( method == "perc" ){
    outTbl = ldply( split( bootData, bootData$split ), function(x){
      tibble( lb = matrixStats::colQuantiles( matrix( x$bootStat,
                                                      ncol = 1),
                                              probs = alpha/2 ),
              ub = matrixStats::colQuantiles( matrix( x$bootStat,
                                                      ncol = 1),
                                              probs = 1-alpha/2 ),
              z0CI = calcBCaZ0( x$bootStat, x$origStat[1]),
              aCI = reportingBCaAVec[x$split[1]])

    }) %>%
      rename( split = .id ) %>%
      as_tibble()
  }

  # Parallel II
  if( createCluster ) parallel::stopCluster( cl1 )

  # Return
  outTbl
}

#' Calculate p-value
#'
#' Calculate the p-value for a specific subgroup.
#'
#' @param bootData dataframe. Must have column named bootStat with bootstrap sample statistics.
#' If \code{method=='percT'}, it must also have the column bootSE, with the bootstrap estimate
#' standard deviation.
#' @param origStat numeric. Original sample estimate. Required.
#' @param origSE numeric. Original sample estimate standard error. Required if \code{method=='percT'}.
#' @param nullValue numeric. Borderline value under the null hypothesis.
#' @param altSide both', 'high' or 'low'. The side of the null hypothesis the alternate hypothesis
#' is on.
#' @return The p-value as a numeric vector.
calcP = function( bootData, origStat, origSE = NULL, nullValue,
                  altSide = NULL, method = 'perc'){

  # Input checks
  if( !method %in% c( 'perc', 'percT' ) ){
    stop( "method must be one of 'perc' or 'percT") }
  if( method == "percT" & ( suppressWarnings( is.null( bootData$bootSE ) %>%
                                              or( is.null( origSE ) ) ) ) ){
    stop( "percT requires bootSE and origSE." ) }
  if( !exists( "bootData", inherits = FALSE ) ) stop( "bootData required." )
  if( !exists( "origStat", inherits = FALSE ) ) stop( "origStat required." )
  if( !exists( "nullValue", inherits = FALSE ) ) stop( "nullValue required." )

  if( is.null( altSide ) ){
    altSide = 'both'
    message( "Setting altSide to 'both'.")
  }

  # perc
  nullBootStat = bootData$bootStat - origStat + nullValue # centred at null
  nullOrigStat = origStat - nullValue # difference from null
  prop = sum( nullBootStat < nullOrigStat ) / nrow( bootData )
  if( method == "perc" ) return( calcPSub( prop, altSide ) )
  # percT
  nullBootTStat = (bootData$bootStat - origStat + nullValue )/bootData$bootSE
  nullOrigTStat = ( origStat - nullValue ) / origSE
  propT = sum( nullBootTStat < nullOrigTStat ) / nrow( bootData )

  c( calcPSub( prop, altSide ), calcPSub( propT, altSide ) )
}
#' Calculate p-value
#'
#'

#' Calculate p-value based on a proportion
#'
#' @param prop numeric. Proportion of the null distribution less than the sample statistic.
#' @param altSide 'both', 'high' or 'low'. The side of the null hypothesis the alternate hypothesis
#' is on.
#' @return A single numeric, the p-value.
calcPSub = function( prop, altSide ){
  if( altSide == "high" ) return( c( 1 - prop ) )
  if( altSide == "low" ) return( prop )
  if( altSide == "both" ) return( 2 * min( prop, 1 - prop ) )
}

#' Calculate p value for a table
#'
#' Calculate p value for each subgroup in a table
#'
#' @param bootData dataframe. Must have character column split, defining
#' groups, and bootStat/bootTStat, giving \code{B} bootstrap sample estimates for each
#' group in \code{split} of statistic for which confidence interval is required.
#' @param origData dataframe. Must have character column split, defining
#' groups, and origStat/origTStat, giving original sample estimate of
#' statistic for which p-value is required.
#' @param calcStat function. Function that calculates the statistic for
#' which a confidence interval is required.
#' @param method character. The method specified by the argument \code{pMethod}
#' in the \code{ggbootUV} function.
#' @param altSide "both", "low" or "high". Specifies side(s) of null hypothesis alternative
#' hypothesis lies.
#'
#' @return A dataframe with columns split and p.
# added trueBootData now - check this out.
calcPTbl = function( bootData, origData, method, altSide, nullValue ){

  outTbl = ldply( split( bootData, bootData$split ),
                  function(x){
                    origRow = which( origData$split == x$split[1] )
                    origStat = origData$origStat[origRow]
                    origSE = origData$origSE[origRow]
                    pVec = calcP( bootData = x, origStat = origStat,
                                  origSE = ifelse( method == "percT", origSE, 0 ),
                                  nullValue = nullValue, altSide = altSide,
                                  method = method )
                    z0P = calcBCaZ0( x$bootStat, origStat ) %>% signif(3)
                    if( method == "perc" ) return( c( pVec, z0P ) )

                    z0PT = calcBCaZ0( ( x$bootStat - nullValue )/ x$bootSE,
                                      ( origStat - nullValue )/ origSE ) %>%
                      signif(3)

                    c( pVec, z0P, z0PT )

                  } ) %>%
    rename( split = .id, percP = V1 ) %>%
    as_tibble
  if( method == "perc" ) outTbl %<>% rename( z0P = V2 )
  if( method == 'percT' ) outTbl %<>% rename( percTP = V2,
                                              z0P = V3,
                                              z0PT = V4 )

  outTbl
}