#' Univariate bootstrap plot
#'
#' Plots a sample statistic, such as the mean or median, and associated bootstrapped 95\% confidence intervals for specified subsets of observations. Facilitates hypothesis testing by allowing a) application of the Benjamini-Hochberg procedure to control the alse discovery rate and b) specification of a minimum effect size.
#'
#' @param data dataframe. Has one row per observation and the column names specified below.
#' @param resp unquoted expression. Response variable name.
#' @param xAxis unquoted expression. X-axis variable name.
#' @param col unquoted expression. Colour variable name.
#' @param facet unquoted expression. Facet variable name.
#' @param diff unquoted expression. Difference variable name. Must be a binary variable, preferrably ('0','1') as then it is clear that the difference is the amount by which the observations labelled '1' are greater than the observations labelled '0'.
#' @param calcStat function olr character. Bootstrap statistic function. Functions other than mean or median used at own risk.
#' @param B integer. Number of bootstrap samples per subgroup.
#' @param alpha numeric. 1-\code{alpha} is the coverage of the confidence intervals.
#' @param nSide 1 or 2. Number of sides of the hypothesis tests.
#' @param altSde 'high', 'low' or 'both'. Character specifying the alternative hypothesis relative to the null hypothesis. Note that you must still specify nSide, even if altSide is specified.
#' @param fdr numeric. False discovery rate.
#' @param minEff numeric. Minimum effect size. Default is 0.
#' @param hLineVec numeric vector. Heights of dashed horizontal lines to plot. Useful for indicating minimum effect size.
#' @param nullValue numeric. Null hypothesis value for statistic.
#' @param xLab,yLab characeter. X- and y-axis titles. Optional.
#' @param xAxisLabVec named character vector. Labels for x-axis variable. Optional.
#' @param rotXText logical. If TRUE, then the x-axis text is rotated 90 degrees clockwise.
#' @param remXAxisMarks logical. If TRUE, then the x-axis ticks and text are removed.
#' @param pointSize numeric. Size of plot points
#' @param errBarSize numeric. Size of error bar
#' @param errBarSizeRatio numeric. How many times larger the size is of the statistically significant error bars than the
#' non-statistically significant bars are.
#' @param colLabName character. Name for colour legend. Optional.
#' @param colourVec character vector. Colours for colour scale. Optional. Label with factor levels to match levels to colours manually.
#' @param colLabVec named character vector. Labels for colour variable. Optional.
#' @param facetlabeVec named character vector. Labels for facet fariable. Optional.
#' @param facetScale 'free', 'free_x' or 'free_y'. Facet scale.
#' @param nCol numeric. Number of columns to use.
#' @param plotTblName character. If not NULL, then the dataframe used to plot is saved to global environment with name \code{plotTblName}.
#' @param eqErrBarWidth logical. If \code{TRUE}, then the error bars are made the same width in each facet. Otherwise the error bars automatically
#' span the entire facet, however many x-axis entries there are.
#' @param errBarAlpha numeric. Error bar alpha
#' @param errBarAlphaRatio numeric. How many times greater the alpha is of the statistically significant error bars than the
#' non-statistically significant bars are.
#' @param sigIndType character vector. Must contain at least one of \code{"lineType"}, \code{"alpha"} and \code{"size"}. The
#' statistically significant and non-statistically significant error bars will then differ by this aesthetic.
#' @param fontScaleFactor numeric. Factor by which to scale the text size from the default.
#' @param lineScaleFactor numeric. Factor by which to scale the line size from the default.
#' @param hLineFactor numeric. Factor by which to scale the horizontal line size from the default.
#' @param pMethod "perc" or "percT". Method to use for calculating p-values. Both uses the bootstrap percentile method, but "perc" does this on the mean or median, whilst "percT" does this on the t-statistic based on the mean or median.
#' @param ciMethod "perc" or "bca". method to use to calculate confidence intervals. Both use bootstrap samples, but "perc" uses the percentile method and "bca" the bias-corrected and adjusted method.
#' @return a ggplot2 object
#' @export
ggbootUV = function( data, resp, xAxis, col = NULL, facet = NULL, nullBFactor = 10,
diff = NULL, calcStat = "mean", B = 10, seB = 200,
pMethod = "percT", fdr = 0.01, altSide, nSide = 1,
ciMethod = "bca", alpha = 0.05, minEff = 0, hLineVec = NULL, nullValue = 0,
xLab = NULL, yLab = 'Response', xAxisLabVec = NULL, rotXText = FALSE,
remXAxisMarks = FALSE, pointSize = 2, errBarSize = 1, errBarSizeRatio = 2,
errBarAlpha = 0.7, errBarAlphaRatio = 2, eqErrBarWidth = TRUE, errBarLineType = "dotted",
colLabName = NULL, colourVec = c( 'darkorchid1', 'springgreen2', 'maroon1', 'dodgerblue', 'red', 'yellow3', 'cyan2', "orange2" ), colLabVec = NULL,
facetLabVec = NULL, facetScale = 'free', nCol = 2,
plotTblName = NULL,
fontScaleFactor = 1, lineScaleFactor = 1, hLineFactor = 1,
bootT = TRUE, trim = 0.2, bootSECorr = FALSE ){
### PRELIMINARIES ###
on.exit( if( exists( "cl1" ) ) parallel::stopCluster( cl1 ) )
# variables
## variable names
xAxisVar = substitute( xAxis )
colVar = substitute( col )
facet = substitute( facet )
diff = substitute( diff )
resp = substitute( resp )
## vectors of grouping variables
xAxisVec = data[[ deparse( xAxisVar ) ]]
colVec = data[[ deparse( colVar ) ]]
facetVec = data[[ deparse( facet ) ]]
diffVec = data[[ deparse( diff ) ]]
respVec = data[[ deparse( resp ) ]]
## grouping variables name vector
vNameVec = c( "V1" )
if( !is.null( colVar ) ) vNameVec = c( vNameVec, "V2" )
if( !is.null( facet ) ) vNameVec = c( vNameVec, "V3" )
## vector to split into groups based on
splitVec = str_c( xAxisVec, colVec, facetVec, diffVec, sep = "." )
noDiffSplitVec = str_c( xAxisVec, colVec, facetVec, sep = "." )
# x axis label
if( is.null( xLab ) ) xLab = deparse( xAxisVar )
sdFunc = sd
if( identical( calcStat, "median" ) ){
calcStat = function(x) matrixStats::colMedians( as.matrix( x, ncol = 1 ) )
calcSE = function(x) { boot::boot( x, R = seB,
function(x,w) matrix( x[w], ncol = 1 ) %>% matrixStats::colMedians() )$t %>% sdFunc() }
} else if( identical( calcStat, "mean" ) ){
calcStat = function( x, .trim = trim ) mean( x, .trim )
calcSE = function(x) { boot::boot( x, R = seB,
function(x,w) mean( x[w], trim ) )$t %>% sdFunc() }
} else{
stop( "calcStat must be either 'mean' or 'median'")
}
if( pMethod == "perc" ) {
calcStatVec = calcStat
calcBootStatVec = function( x, w ) calcStat( x[w] )
}
if( pMethod == "percT" ){
calcStatVec = function( x ) c( calcStat( x ), calcSE( x ) )
calcBootStatVec = function( x, w ) c( calcStat( x[w] ), calcSE( x[w] ) )
} # only calculate calcStat to save time when not using t-stat for hyp testing
### BOOTSTRAP STATISTICS ###
cl1 = parallel::makeCluster(parallel::detectCores()-1)
doParallel::registerDoParallel(cl1)
if( is.null( diff ) ){ # if no difference taken
## table of statistics on original sample
origStatTbl = plyr::ldply( split( respVec, splitVec ),
calcStatVec, .parallel = TRUE ) %>%
rename( split = .id, origStat = V1 )
origStatVec = setNames( origStatTbl$origStat, origStatTbl$split )
bcaATbl = plyr::ldply( split( respVec, splitVec ), function(x) calcBCaA( x, calcStat ) ) %>%
rename( split = .id, bcaA = V1 )
bcaAVec = setNames( bcaATbl$bcaA, bcaATbl$split )
if( pMethod == "percT" ){
origStatTbl %<>%
rename( origSE = V2 ) %>%
mutate( origTStat = ( origStat - nullValue ) / origSE )
origSEVec = setNames( origStatTbl$origSE, origStatTbl$split )
}
# table of bootstrapped sample estimates
bootTbl = plyr::ldply( split( respVec, splitVec ),
function(x) boot::boot( x, calcBootStatVec, B )$t, .parallel = TRUE ) %>%
rename( split = .id, bootStat = `1` ) %>%
mutate( origStat = origStatVec[ split ] )
if( pMethod == "percT" ){
bootTbl %<>%
rename( bootSE = `2` ) %>%
mutate( origSE = origSEVec[ split ],
bootTStat = ( bootStat - origStat + nullValue ) / bootSE )
}
} else{ # difference taken
bcaAVec = NULL
if( ciMethod == "bca" ) ciMethod = "bc"
## table of statistics on original sample
if( pMethod == "perc" ){
origStatTbl = plyr::ldply( split( respVec, splitVec ), calcStat )
mutate( diff = str_sub( .id, -1 ),
split = str_sub( .id, end = -3 ) ) %>%
arrange( split, diff ) %>%
group_by( split ) %>%
summarise( origStat = V1[2] - V1[1] )
origStatVec = setNames( origStatTbl$origStat, origStatTbl$split )
# table of bootstrapped sample estimates
bootTbl = plyr::ldply( split( respVec, splitVec ), # a group for each unique value in splitVec
function(x) boot::boot( x, calcBootStatVec, B )$t, .parallel = TRUE ) %>% # sample from each group B times, calculating calcBootStatVec
rename( split = .id, bootStat = `1` ) %>%
mutate( diff = str_sub( split, -1 ), split = str_sub( split, end = -3 ) )
bootTbl = plyr::ldply( split( bootTbl, bootTbl$split ), # a group for each unique value in splitVec
function(x) x %>%
mutate( tempID = c( 1:(nrow(x)/2),1:(nrow(x)/2) ) ) %>%
spread( key = diff, value = bootStat ), .parallel = TRUE ) %>% # sample from each group B times, calculating calcBootStatVec
mutate( bootStat = `1` - `0` ) %>%
select( -c( .id, tempID, `0`, `1` ) ) %>%
mutate( origStat = origStatVec[ split ] )
} else if( pMethod == "percT" ){
origDataTbl = tibble( split = splitVec, resp = respVec ) %>%
mutate( origSplit = split, diff = str_sub( split, -1 ),
split = str_sub( split, end = -3 ) )
origStatTbl = ldply( split( origDataTbl, origDataTbl$split ),
function(x){
resp0Vec = x %>% filter( diff == 0 ) %>% extract2( "resp" )
resp1Vec = x %>% filter( diff == 1 ) %>% extract2( "resp" )
se0 = calcSE( resp0Vec )
se1 = calcSE( resp1Vec )
diffSE = sqrt( se0^2 + se1^2 )
statDiff = calcStat( resp1Vec ) - calcStat( resp0Vec )
tibble( origStat = statDiff, origSE = diffSE,
origTStat = ( statDiff - nullValue )/diffSE )
} ) %>%
rename( split = .id )
# get bootstrap sample estimates of mean and estimates of sd of mean
bootTbl = ldply( split( origDataTbl, origDataTbl$origSplit ),
function(x){
boot::boot( x$resp, R = B,
function( y, w ) c( bootStat = calcStat( y[w] ),
bootSE = calcSE( y[w] ) ) )$t %>%
as_tibble() %>%
rename( bootStat = V1, bootSE = V2 )
}, .parallel = TRUE ) %>%
rename( origSplit = .id ) %>%
mutate( split = str_sub( origSplit, end = -3 ),
diff = str_sub( origSplit, - 1 ) )
# calculate bootStat and bootTStat
bootTbl = plyr::ldply( split( bootTbl %>% select( -origSplit ), bootTbl$split ), # a group for each unique value in splitVec
function(x){
currSplit = x$split[1]
currOrigStat = origStatTbl$origStat[which(origStatTbl$split==currSplit)]
currOrigSE = origStatTbl$origSE[which(origStatTbl$split==currSplit)]
x %<>% mutate(tempID = c( 1:(nrow(x)/2),1:(nrow(x)/2) ) ) %>% select( -split )
statTbl = x %>%
select( - bootSE ) %>%
spread( key = diff, value = bootStat ) %>%
rename( stat0 = `0`, stat1 = `1` )
sdTbl = x %>%
select( - bootStat ) %>%
spread( key = diff, value = bootSE ) %>%
rename( se0 = `0`, se1 = `1` )
fullTbl = merge( statTbl, sdTbl, by = "tempID" )
fullTbl %>%
mutate( bootStat = stat1 - stat0,
bootSE = sqrt( se1^2 + se0^2 ),
bootTStat = ( bootStat - currOrigStat + nullValue ) / bootSE,
origStat = currOrigStat,
origSE = currOrigSE) %>%
as_tibble()
} , .parallel = TRUE ) %>% # sample from each group B times, calculating calcBootStatVec
rename( split = .id ) # %>%
# select( split, bootStat, bootTStat, trueBootTStat, nullBootTStat )
} else { stop( "pMethod must be one of perc or percT" ) }
}
if( pMethod == "percT" & bootSECorr )
bootTbl = ldply( as.list( unique( bootTbl$split ) ), function(i){
currOrigStat = origStatTbl %>%
filter( split == i ) %>%
extract2( "origStat" )
currOrigSE = origStatTbl %>%
filter( split == i ) %>%
extract2( "origSE" )
currBootTbl = bootTbl %>%
filter( split == i )
gamMod = gam::gam( formula = bootSE ~ gam::s(bootStat, df = 4),
data = currBootTbl )
bootFit = gamMod$fitted.values
origSEFit = gam::predict.Gam( gamMod, tibble( bootStat = currOrigStat ) )
adjFactor = currOrigSE / origSEFit
bootFit = bootFit * adjFactor # adjust by a multiplicative factor
origMinBootSE = min( currBootTbl$bootSE ) # min raw boot SE
bootFit = Vectorize(ifelse)( bootFit <origMinBootSE, origMinBootSE, bootFit ) # ensure positive
currBootTbl %>% mutate( bootSE = bootSE * adjFactor, adj = adjFactor )
}, .parallel = TRUE )
bootTbl %<>% as_tibble()
### BOOTSTRAP CONFIDENCE INTERVALS AND P-VALUES ###
ciTbl = calcCITbl( bootData = bootTbl, alpha = alpha, method = ciMethod,
diff = !is.null( diff ), bcaAVec = bcaAVec, createCluster = FALSE )
pTbl = calcPTbl( bootData = bootTbl, origData = origStatTbl, method = pMethod,
altSide = altSide, nullValue = nullValue )
bootStatTbl = merge( ciTbl, pTbl, by = "split" )
### CORRECTIONS FOR MULTIPLE TESTING ###
# identify names of split levels for which the p-value was statistically significant
# after controlling fdr
if( pMethod == "perc" ) pVec = bootStatTbl$percP
if( pMethod == "percT" ) pVec = bootStatTbl$percTP
fdrSigNameVec = correctBH( setNames(pVec, bootStatTbl$split ), fdr = fdr, nSide = 1 )
plotTbl = bootStatTbl %>%
full_join( origStatTbl, by = 'split' ) %>%
mutate( fdrSig = ( split %in% fdrSigNameVec ) * 1 ) # add in p-value sig levels
# check which meet min effect size
if( altSide == "high" ) plotTbl %<>% mutate( mesSig = ( ( origStat >= abs( minEff ) ) * 1 ) )
if( altSide == "low" ) plotTbl %<>% mutate( mesSig = ( ( origStat <= -abs( minEff ) ) * 1 ) )
if( altSide == "both" ) plotTbl %<>% mutate( mesSig = pmax( ( origStat <= -abs( minEff ) ) * 1,
( origStat >= abs( minEff ) ) * 1 ) )
# get which are both fdr and min eff size significant
plotTbl %<>% mutate( sig = pmin( fdrSig, mesSig ) %>% as.character() )
# assign plotTbl to plotTblName in global environment
if( !is.null( plotTblName ) ) assign( plotTblName, as_tibble( plotTbl ), envir = globalenv() )
### PLOT ###
plotTbl %<>% separate( split, into = vNameVec )
# width
if( eqErrBarWidth & !is.null( facet ) ){
widthPerTbl = plotTbl %>%
group_by( V3 ) %>%
summarise( count = n() ) %>%
mutate( maxCount = max( count ),
widthRatio = count/maxCount * 0.95 )
widthLabVec = setNames( widthPerTbl$widthRatio, widthPerTbl$V3 )
plotTbl %<>% mutate( width = widthLabVec[ plotTbl$V3 ] )
} else{
plotTbl %<>% mutate( width = 1 )
}
# give values for lb and ub equal to origStat if lb and ub are NAN
plotTbl %<>%
mutate( lb = Vectorize(ifelse)( is.nan( lb ), origStat, lb ),
ub = Vectorize(ifelse)( is.nan( ub ), origStat, ub ) )
# base plot
p = ggplot( plotTbl, aes( x = V1 ) ) + # base settings
theme_cowplot( font_size = 14 * fontScaleFactor,
line_size = 0.5 * lineScaleFactor ) +
labs( x = xLab, y = yLab ) + # axes labels
geom_hline( yintercept = nullValue, linetype = 2, size = 1 * hLineFactor ) # add a horizontal line at null value
if( rotXText ) p = p + theme( axis.text.x = element_text( angle = 90 ) ) # rotate x-axis labels
if( !is.null( xAxisLabVec ) ) p = p + scale_x_discrete( labels = xAxisLabVec ) # give x-axis better labels
if( remXAxisMarks ) p = p + theme( axis.text.x = element_blank(),
axis.ticks.x = element_blank() ) # remove x axis text
# add in extra horizontal lines
k = ifelse( is.null( hLineVec ), 0, length( hLineVec ) )
while ( k > 0 ){
p = p + geom_hline( yintercept = hLineVec[k], linetype = 2, alpha = 0.4, linetype = 'dotted' )
k = k - 1
}
# colour variable
if( is.null( colVar ) ){ # if there is no colour variables
if( !fdr == 1 ){ p = p + geom_errorbar( aes( ymin = lb, ymax = ub, width = width,
linetype = sig ), size = errBarSize, alpha = errBarAlpha )
p = p +
scale_linetype_manual( labels = c( "0" = "Not significant", "1" = "Significant" ),
name = "",
values = c( "0" = errBarLineType, '1' = 'solid' ),
drop = FALSE,
limits = c( "0", "1" ) )
}
if( fdr == 1 ) p = p + geom_errorbar( aes( ymin = lb, ymax = ub, width = width ),
size = errBarSize, alpha = errBarAlpha )
p = p + geom_point( aes( y = origStat ), size = pointSize ) # add in points
} else if( !is.null( colVar ) ){ # if there are colour variables
if( !fdr == 1 ){ p = p + geom_errorbar( aes( ymin = lb, ymax = ub, width = width, col = V2,
linetype = sig ), size = errBarSize, alpha = errBarAlpha )
p = p +
scale_linetype_manual( labels = c( "0" = "Not significant", "1" = "Significant" ),
name = "",
values = c( "0" = errBarLineType, '1' = 'solid' ),
drop = FALSE,
limits = c( "0", "1" ) )
}
if( fdr == 1 ) p = p + geom_errorbar( aes( ymin = lb, ymax = ub, col = V2, width = width ),
size = errBarSize, position = position_dodge( 1 ), linetype = errBarLineType,
alpha = errBarAlpha )
p = p +
geom_point( aes( y = origStat, col = V2 ), size = pointSize,
position = position_dodge( 1 ) ) + # add in points
scale_colour_manual( values = colourVec,
labels = colLabVec,
name = colLabName) # modify points
}
# facet variable
if( !is.null( facet ) ){
if( is.null( facetLabVec ) ){
p = p +
facet_wrap( ~ V3, ncol = nCol, scales = facetScale )
}
else{
p = p +
facet_wrap( ~ V3, ncol = nCol, scales = facetScale,
labeller = labeller( V3 = facetLabVec ) )
}
}
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.