#' Calculate BCa
#'
#' @param bootObj object of class "boot". Object generated by \code{boot::boot} function. Optional
#' if \code{bookPkgCI} is TRUE or if \code{bootStat} is \code{NULL}. If \code{bookPkgCI} is TRUE
#' and \code{bootStat} is not \code{NULL}, then \code{bootObj} is used to calculate the confidence
#' interval, rather than \code{bootStat}.
#' @param bootPkgCI logical. If TRUE, then only functions from the \code{boot} package are used
#' to compute the confidence interval.
#' @param calcStat function. Function of a single variable to calculate for each sample.
#' @param origData numeric vector. Original sample. Required if \code{origStat}, \code{bootStat} or
#' \code{bcaA} (if \code{type} == "bc" ) not given.
#' @param bootStat numeric vector. Bootstrap samples. Optional if \code{bookPkgCI} is TRUE. Will have no effect if
#' \code{bootPkgCI} is \code{TRUE} and \code{bootObj} is not \code{NULL}.
#' @param seed numeric vector. If specified, then \code{seed(seed)} is called before
#' generating a bootstrap sample.
#' @param type 'bca' or 'bc'. Whether to return BCa or bias-corrected intervals. If type == "bc",
#' it sets \code{bcaA}, overriding \code{bcaA} as specified in the \code{calcBCa} arguments.
#' @param origStat numeric. Original sample estimate, i.e. value of \code{calcStat}
#' applied to \code{origData}.
#' @param bcaA numeric. Acceleration constant.
#' @param alpha numeric. Dependent on \code{ci}, either 1-\code{alpha} is the coverage of the
#' confidence interval or \code{alpha} specifies the quantile.
#' @param ci logical. If \code{TRUE}, then \code{alpha} specifies the width of the confidence interval,
#' and an upper and lower bound are returned. If \code{FALSE}, then \code{alpha} specifies the quantile,
#' and a single quantile is returned.
#' @param B numeric. Number of bootstrap samples to generate, if required.
#' @param ... arguments. Passed on to \code{boot::boot}, if run.
#' @return If \code{ci} is \code{TRUE}, then a numeric vector of length two is returned with the
#' upper and lower bound of the 1-\code{alpha} confidence interval. If \code{ci} is \code{FALSE}, then
#' a single numeric is returned of the corresponding quantile.
#' @examples calcBCa( origData = c( 0.096888, 0.115579, 0.13671, 0.086497, 0.5248, 1.676752, 0.46717, 0.360975, 0.118718 ),
#' calcStat = mean, bootPkgCI = TRUE )
calcBCa = function( bootObj = NULL, bootStat = NULL, origStat = NULL, origData = NULL,
bcaA = NULL, alpha = 0.05, ci = TRUE, type = 'bca',
calcStat = NULL, B = 2e3, bootPkgCI = FALSE, seed = NULL, ...){
# FLOW
# Input checks
# - if bootPkgCI is TRUE and bootObj not given
# -- origData and calcStat must be given
# - if bootPkgCI is FALSE
# -- If at least one of the following are not given:
# --- bootStat, bcaA ( if type != "bc" ), origStat
# --- then origData and calcStat must be given
# Use boot package for CIs
# - Only if bootPkgCI == TRUE
# - if seed given
# -- set seed.
# -- If is.null( bootObj ) & is.null( bootStat )
# --- Calculate bootObj
# -- If !is.null( bootObj )
# --- Calculate ci using boot.ci
# --- Return ci
# -- At this point, the function would have quit unless bootStat was supplied and
# -- bootObj was not (bootObj therefore takes preference).
# --- Calculate L
# ---- This is the difference between the jackknife estimates and the jackknife mean
# ---- for each jackknife estimate.
# --- If origStat not provided
# ---- Calculate it.
# --- Calculate bca confidence interval
# ---- Use boot:::bca.ci function, and
# ---- L and origStat as calculated before.
# --- Return bca confidence interval.
# Use boot package for CIs only
# - if bootPkgCI = TRUE and bootStat not NULL, then boot:::bca.ci is applied to bootStat to
# calculate BCa confidence intervals.
# origStat is first calculated if not initially supplied.
# Remaing missing objects
# - If bootStat is NULL, it is calculated.
# - If type == "bca" and bcaA is NULL, then bcaA is calculated here.
# - If type == "bc", then bcaA is set to 0.
# Remaining objects
# - Calculate z0
# - Calculate za
# -- ci
# --- If TRUE, then
# ---- the original alpha is taken to mean that a 1-alpha
# ---- confidence interval is required.
# --- If FALSE, then
# ---- the original alpha is taken to mean that the alpha-th quantile
# ---- is required.
# Calculated adjusted alpha
# - Calculate the adjusted alpha
# Return bootstrap quantile(s)
# - Return the quantile(s) in the bootstrap distribution
# - corresponding to the (adjusted) alpha(s).
if( bootPkgCI & is.null( bootObj ) & or( is.null( origData ), is.null( calcStat ) ) ){
stop( "If bootPkgCI is TRUE and bootObj not given, then origData and calcStat are required." ) }
if( !bootPkgCI &
( is.null( origStat ) %>%
or( is.null( bootStat ) ) %>%
or( and( is.null( bcaA ), type != "bc" ) ) ) &
or( is.null( calcStat ), is.null( origData ) ) ){
stop( "If bootPkgCI is FALSE, then if bcaA (if type is not 'bc' ), origStat or bootStat are not given, origData and calcStat are required." )
}
# Use boot package for confidence intervals
if( bootPkgCI ){
if( !is.null( seed ) ) set.seed( seed )
if( is.null( bootObj ) & is.null( bootStat ) ) bootObj = boot::boot( origData,
function(x,w) calcStat(x[w]),
R = B,... )
if( !is.null( bootObj ) ) return( boot::boot.ci( bootObj, type = "bca" )$bca[4:5] )
L = boot::empinf( data = origData, statistic = function(x,w) calcStat(x[w]),
type = "jack", stype = "i" )
if( is.null( origStat ) ) origStat = calcStat( origData )
return( boot:::bca.ci( boot.out = NULL, conf = 1-alpha, t0 = origStat,
t = bootStat, L = L )[4:5] )
}
# Intermediate objects
if( is.null( origStat ) ) origStat = calcStat( origData )
if( !is.null( seed ) ) set.seed( seed )
if( is.null( bootStat ) ) bootStat = boot::boot( origData, function(x,w) calcStat(x[w]), R = B, ...)$t
if( is.null( bcaA ) & type == "bca" ) bcaA = calcBCaA( origData, calcStat)
if( type == "bc" ) bcaA = 0
if( !is.finite( bcaA ) ){
message( "Estimated variance adjustment, a, is infinite. Setting a to 0.")
bcaA = 0
}
z0 = calcBCaZ0( bootStat, origStat )
if( !is.finite( z0 ) ){
message( "Estimated bias adjustment, z0, is infinite. Setting z0 and a to 0." )
bcaA = 0
z0 = 0
}
if( ci ) za = qnorm( c( alpha/2, 1-alpha/2 ) )
if( !ci ) za = qnorm( alpha )
# Adjusted alphas
num = z0 + za
den = 1 - bcaA * num
term2 = num/den
adjAlpha = pnorm ( z0 + term2 )
setNames( boot:::norm.inter( bootStat, adjAlpha )[,2], NULL)
# Return bootstrap quantile(s)
#setNames( matrixStats::colQuantiles( matrix( bootStat, ncol = 1 ),
# probs = adjAlpha ), NULL )
}
#' Calculate BCa acceleration factor
#'
#' Returns the estimated acceleration factor for the BCa confidence interval
#' in the one-sample non-parametric case, using the standard jack-knife.
#'
#' @param origData numeric vector. Original sample.
#' @param calcStat function. A numeric function that takes a single numeric vector as
#' an argument and returns a numeric vector of length 1 as output.
#' @return A numeric vector of length one that is the estimate of the acceleration factor.
#' @examples calcBCaA( c( 0.096888, 0.115579, 0.13671, 0.086497, 0.5248,
#' 1.676752, 0.46717, 0.360975, 0.118718 ), calcStat = mean)
#' @export
calcBCaA = function( origData, calcStat ){
L = boot::empinf( data = origData, statistic = function(x,w) calcStat(x[w]),
type = "jack", stype = "i" )
setNames( sum(L^3)/(6 * sum(L^2)^1.5), NULL )
}
#' Calculate BCa bias quantile
#'
#' Returns the estimated bias quantile for the BCa confidence interval.
#'
#' @param bootStat numeric vector. Bootstrap sample statistics.
#' @param origStat numeric. Original sample statistic.
#' @return A numeric vector of length one that is the estimate of the bias quantile.
#' @export
#' @examples
#' respVec = c( 0.096888, 0.115579, 0.13671, 0.086497, 0.5248,
#' 1.676752, 0.46717, 0.360975, 0.118718 )
#' origStat = mean(respVec)
#' bootVec = boot::boot( respVec, function(x,w) mean(x[w]), R = 2e3 )$t
#' calcBCaZ0( bootVec, origStat )
calcBCaZ0 = function( bootStat, origStat ){
qnorm( sum( bootStat < origStat ) / length( bootStat ), lower.tail = TRUE )
}
#' calculate confidence intervals for multiple groups
#'
#' Calculates confidence intervals using one of three methods (BCa, BC or percentile),
#' as well as the bias factor ($z_0$) and the acceleration factor ($a$).
#'
#' @param bootData dataframe. Must have character column split, defining
#' groups, and bootStat, giving \code{B} bootstrap sample estimates for each
#' group in \code{split} of statistic for which confidence interval is required. If
#' \code{method=='percT'}, then
#' @param alpha numeric \eqn{\;\in(0,1)}. (1-alpha) is the desired coverage
#' for the confidence interval.
#' @param method 'bca', 'bc' or 'perc'. The confidence interval method.
#' @param diff logical. If \code{TRUE}, then the acceleration factor, $a$, is
#' forced to zero, even when \code{method=='bca'}. This is because the acceleration factor
#' calculated is invalid in the unpaired two-sample case. The reported acceleration factor
#' is set to \code{NA}, for the same reason. If \code{diff==TRUE}, then the acceleration factor
#' is only zero if \code{method=="bc"}, and the acceleration factor is reported as is.
#' @param bcaAVec numeric vector. Numeric vector of . Required only if bcaAVec
#' @param createCluster logical. If \code{TRUE}, then the code to start a cluster is run.
#' @return A tibble with columns named split, lb and ub.
#' @examples
#' # data prep
#' respVec = c( 0.060426, 0.066152, 0.07093, 0.056687, 0.18716, 0.790952, 0.20803, 0.245396, 0.087918,
#' 0.02499226, 0.2234444308, 0.0148025445, 0.2425358694, 0.025742255, 0.011875387, 0.0148772707,
#' 0.0086363158, 0.014349707, 0.0087206469, 0.055159961, 0.0366530097, 0.000260970000000001, 0.0111716045, 0.011851586,
#' 0.0109080046, 0.002077461, 0.011693718, 0.0182342151, 0.0031248769, 0.0067057876, 0.0172261736,
#' 0.0491776258, 0.026822441, 0.0062511869, 0.0135163775, 0.003760361, 0.0196274421, 0.004280863, 0.0143105389,
#' 0.0150681214, 0.0063319547, 0.0087206469, 0.000260970000000001, 0.0111716045, 0.002077461, 0.011693718, 0.0148772707,
#' 0.055159961, 0.0366530097, 0.011851586, 0.0109080046, 0.0182342151, 0.0172261736, 0.026822441, 0.0135163775 )
#' splitVec = c( rep( "1", 9), rep( "2", 46 ) )
#' calcStat = function(x) mean( x, trim = 0.2 )
#'calcStatVec = calcStat
#' calcBootStatVec = function(x,w) calcStatVec(x[w])
#'
# table of statistics on original sample
#' origStatTbl = plyr::ldply( split( respVec, splitVec ),
#' calcStatVec ) %>%
#' rename( split = .id, origStat = V1 )
#'
#' origStatVec = setNames( origStatTbl$origStat, origStatTbl$split )
#'
# acceleration factor
#' bcaATbl = plyr::ldply( split( respVec, splitVec ), function(x) calcBCaA( x, calcStat ) ) %>%
#' rename( split = .id, bcaA = V1 )
#'
#' bcaAVec = setNames( bcaATbl$bcaA, bcaATbl$split )
#'
# bootstrap sample statistics
#' set.seed(1)
#' bootTbl = plyr::ldply( split( respVec, splitVec ),
#' function(x) boot::boot( x, calcBootStatVec, 1e2 )$t ) %>%
#' rename( split = .id, bootStat = `1` ) %>%
#' mutate( origStat = origStatVec[ split ] ) %>%
#' as_tibble()
#'
#' # actual function
#' calcCITbl( bootData = bootTbl, alpha = 0.05, method = "bca",
#' diff = FALSE, bcaAVec = bcaAVec )
calcCITbl = function( bootData, alpha, method, diff,
bcaAVec, createCluster = FALSE ){
# FLOW
#
# Input checks
# - Purpose:
# -- Ensures that bcaVec is supplied if BCa calculation is requested.
# -- Sets bcaAvec to named NA vec if bcaVec was NULL and BC
# -- or perc calculation requested.
#
# Parallel I
# - Purpose:
# -- This is useful, as sometimes you have already initiated a cluster,
# -- and so don't want another initialised to save time/avoid errors
# - Code:
# -- If createCluster is TRUE
# --- initates a cluster.
#
# BCa vec
# - Purpose:
# -- Creates two vectors, reportingBCaVec and calculationBCaVec, which
# -- are reported in the final outputted tibble or used in the
# -- BCa or BC calculation, respectively.
# -- This is done because if diff is TRUE, then the acceleration factor
# -- should be NA, but if diff is FALSE, then the acceleration factor
# -- is correct and can be reported.
# -- It is also useful because if diff is TRUE or the calculation method
# -- is BC, then the acceleration factor used in the BCa calculation should
# -- be zero.
# - Code:
# -- if diff
# --- set reportingBCaVec and calculationBCaVec to NA and 0, respectively,
# --- with element names set to the unique names in the split.
# -- if not diff
# --- reportingBCaAvec is as given
# --- calculationBCaVec is as given if method is BCa, and 0 if BC. Not needed
# --- for perc method.
#
#
# BC(a) calculation
# - Purpose:
# -- Performs a BCa calculation for each split, using the relevant provided
# -- bootstrap samples (bootStat), acceleleration factors (calculationBCaAVec)
# -- alpha and original sample statistics (origStat).
# - Code:
# -- ciVec
# --- Uses calcBCa to calculate confidence interval.
# -- Calculates z0CI using calcBCaZ0
# -- Sets aCI to reportingBCaAVec.
#
# Perc calculation
# - Purpose:
# -- For each level in split vector, calculates percentile confidence interval
# -- and bias factor, and returns confidence interval, bias factor and
# -- reportingBCaAVec accleration factor.
# - Code:
# -- Uses matrixStats::colQuantiles function for quantiles, and
# -- calcBCaZ0 for bias factor.
#
# Parallel II
# - Purpose
# -- If a cluster was initiated by this function, it is stopped now.
#
# Return
# - Purpose
# -- Returns outTbl
# Input checks
if( is.null( bcaAVec ) & diff!= FALSE ){
if( method == "bca" ) stop( "Method == 'bca' required non-null bcaVec.")
if( method %in% c( "perc", "bc" ) ) bcaAVec = setNames( rep( NA, calcLU( bootData$split ) ),
unique( bootData$split ) )
}
# Parallel
if( createCluster ){
cl1 = parallel::makeCluster(parallel::detectCores()-1)
doParallel::registerDoParallel(cl1)
}
# BCa vec
if( diff ){
reportingBCaAVec = setNames( rep( NA, calcLU( bootData$split ) ),
unique( bootData$split ) )
calculationBCaAVec = setNames( rep( 0, calcLU( bootData$split ) ),
unique( bootData$split ) )
} else{
reportingBCaAVec = bcaAVec
if( method == "bc" ) calculationBCaAVec = setNames( rep( 0, calcLU( bootData$split ) ),
unique( bootData$split ) )
if( method == "bca" ) calculationBCaAVec = bcaAVec
}
# BC(a) calculation
if( str_detect( method, "bc" ) ){
outTbl = ldply( split( bootData, bootData$split ), function(x){
ciVec = calcBCa( bootStat = x$bootStat,
origStat = x$origStat[1],
bcaA = calculationBCaAVec[x$split[1]],
alpha = alpha )
tibble( lb = ciVec[1], ub = ciVec[2],
z0CI = calcBCaZ0( x$bootStat, x$origStat[1]),
aCI = reportingBCaAVec[x$split[1]])
}) %>%
rename( split = .id ) %>%
as_tibble()
}
# Perc calculation
if( method == "perc" ){
outTbl = ldply( split( bootData, bootData$split ), function(x){
tibble( lb = matrixStats::colQuantiles( matrix( x$bootStat,
ncol = 1),
probs = alpha/2 ),
ub = matrixStats::colQuantiles( matrix( x$bootStat,
ncol = 1),
probs = 1-alpha/2 ),
z0CI = calcBCaZ0( x$bootStat, x$origStat[1]),
aCI = reportingBCaAVec[x$split[1]])
}) %>%
rename( split = .id ) %>%
as_tibble()
}
# Parallel II
if( createCluster ) parallel::stopCluster( cl1 )
# Return
outTbl
}
#' Calculate p-value
#'
#' Calculate the p-value for a specific subgroup.
#'
#' @param bootData dataframe. Must have column named bootStat with bootstrap sample statistics.
#' If \code{method=='percT'}, it must also have the column bootSE, with the bootstrap estimate
#' standard deviation.
#' @param origStat numeric. Original sample estimate. Required.
#' @param origSE numeric. Original sample estimate standard error. Required if \code{method=='percT'}.
#' @param nullValue numeric. Borderline value under the null hypothesis.
#' @param altSide both', 'high' or 'low'. The side of the null hypothesis the alternate hypothesis
#' is on.
#' @return The p-value as a numeric vector.
calcP = function( bootData, origStat, origSE = NULL, nullValue,
altSide = NULL, method = 'perc'){
# Input checks
if( !method %in% c( 'perc', 'percT' ) ){
stop( "method must be one of 'perc' or 'percT") }
if( method == "percT" & ( suppressWarnings( is.null( bootData$bootSE ) %>%
or( is.null( origSE ) ) ) ) ){
stop( "percT requires bootSE and origSE." ) }
if( !exists( "bootData", inherits = FALSE ) ) stop( "bootData required." )
if( !exists( "origStat", inherits = FALSE ) ) stop( "origStat required." )
if( !exists( "nullValue", inherits = FALSE ) ) stop( "nullValue required." )
if( is.null( altSide ) ){
altSide = 'both'
message( "Setting altSide to 'both'.")
}
# perc
nullBootStat = bootData$bootStat - origStat + nullValue # centred at null
nullOrigStat = origStat - nullValue # difference from null
prop = sum( nullBootStat < nullOrigStat ) / nrow( bootData )
if( method == "perc" ) return( calcPSub( prop, altSide ) )
# percT
nullBootTStat = (bootData$bootStat - origStat + nullValue )/bootData$bootSE
nullOrigTStat = ( origStat - nullValue ) / origSE
propT = sum( nullBootTStat < nullOrigTStat ) / nrow( bootData )
c( calcPSub( prop, altSide ), calcPSub( propT, altSide ) )
}
#' Calculate p-value
#'
#'
#' Calculate p-value based on a proportion
#'
#' @param prop numeric. Proportion of the null distribution less than the sample statistic.
#' @param altSide 'both', 'high' or 'low'. The side of the null hypothesis the alternate hypothesis
#' is on.
#' @return A single numeric, the p-value.
calcPSub = function( prop, altSide ){
if( altSide == "high" ) return( c( 1 - prop ) )
if( altSide == "low" ) return( prop )
if( altSide == "both" ) return( 2 * min( prop, 1 - prop ) )
}
#' Calculate p value for a table
#'
#' Calculate p value for each subgroup in a table
#'
#' @param bootData dataframe. Must have character column split, defining
#' groups, and bootStat/bootTStat, giving \code{B} bootstrap sample estimates for each
#' group in \code{split} of statistic for which confidence interval is required.
#' @param origData dataframe. Must have character column split, defining
#' groups, and origStat/origTStat, giving original sample estimate of
#' statistic for which p-value is required.
#' @param calcStat function. Function that calculates the statistic for
#' which a confidence interval is required.
#' @param method character. The method specified by the argument \code{pMethod}
#' in the \code{ggbootUV} function.
#' @param altSide "both", "low" or "high". Specifies side(s) of null hypothesis alternative
#' hypothesis lies.
#'
#' @return A dataframe with columns split and p.
# added trueBootData now - check this out.
calcPTbl = function( bootData, origData, method, altSide, nullValue ){
outTbl = ldply( split( bootData, bootData$split ),
function(x){
origRow = which( origData$split == x$split[1] )
origStat = origData$origStat[origRow]
origSE = origData$origSE[origRow]
pVec = calcP( bootData = x, origStat = origStat,
origSE = ifelse( method == "percT", origSE, 0 ),
nullValue = nullValue, altSide = altSide,
method = method )
z0P = calcBCaZ0( x$bootStat, origStat ) %>% signif(3)
if( method == "perc" ) return( c( pVec, z0P ) )
z0PT = calcBCaZ0( ( x$bootStat - nullValue )/ x$bootSE,
( origStat - nullValue )/ origSE ) %>%
signif(3)
c( pVec, z0P, z0PT )
} ) %>%
rename( split = .id, percP = V1 ) %>%
as_tibble
if( method == "perc" ) outTbl %<>% rename( z0P = V2 )
if( method == 'percT' ) outTbl %<>% rename( percTP = V2,
z0P = V3,
z0PT = V4 )
outTbl
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.