#' @title Bootstrap two gene sets and compare their mean values
#'
#' @description A function for comparing two sets of genes without relying on any distributional assumptions.
#'
#' @param valCol A singular column containing the value to be bootstrapped
#' @param data A data frame containing all the data required
#' @param testIds A \code{character vector} with test set of Ids
#' @param refIds A \code{character vector} with the reference setof Ids
#' @param idCol The column in \code{data} containing the Ids in the vectors \code{testIds} and \code{refIds}. Can be specified as an integer position or as a character (regular expression).
#' @param binCol The column in \code{data} containing the bin allocations for each gene. Can also be specified as an integer or by name.
#' @param filt A text expression passed to the NSE capabilities of the \code{filter_} function.
#' @param nGenes \code{integer}. The number of genes to sample at each iteration. Values greater than the number of testIds will automatically be capped at the number of testIds
#' @param nBoot \code{integer}. The number of bootstrap iterations to be performed
#' @param minGenes \code{integer}. The minimum number of IDs required to conduct a bootstrap procedure with any meaning.
#' @param ... Passed to the function \code{mean} internally
#' @param na.rm \code{logical}. Also passed internally to the function \code{mean}
#' @param replace \code{logical}. Should the bootstrap use sampling with replacement (\code{replace = TRUE}) or without
#' @param maxP The maximum probability (weight) allowed for an individual gene in the reference set.
#' Defaults to \code{1/nGenes}
#'
#' @details
#' This is a modification of the \code{bootMedians} function,
#' but is written to only work with a single column of values to be bootstrapped.
#' To apply across multiple value columns, please use \code{lapply} or \code{sapply}.
#'
#'This function breaks the supplied \code{data.frame} into two sets of test IDs & reference IDs.
#'The \code{data.frame} must contain a column (\code{binCol}) which classifies each ID into a bin.
#'The probabilities of bin membership in the test IDs are then used for sampling during the bootstrap procedure.
#'
#'The values to be bootstrapped must be specified in the argument \code{valCol},
#'and this can be a regular expression or integer, but must specify only a single column
#' in the supplied \code{data.frame}.
#'
#' The function will automatically filter the data to remove any values ouside
#' the specified criterion.
#'
#'The function itself will sample the same number of IDs (\code{nGenes}) from each dataset,
#'based on the probabilities of bin membership in the test dataset.
#'At each bootstrap iteration, the mean values for each column specified will be returned from both datasets,
#'with the reference values then subtracted from the tested values.
#'This allows direct comparison of these values as they will be drawn from similar
#'distributions based on the binning variable used.
#'
#'If any genes have a probability of being resampled > \code{maxP} they may exert undue influence on the results.
#'If any are found the process will stop to allow removal of this grouping.
#'Alternatively, the value for \code{maxP} can be reset up to a maximum of 1,
#'which would represent maximum permissability.
#'
#' @author Steve Pederson
#'
#' @return
#' A \code{list} with components:
#' \itemize{
#' \item \code{$samples} The sampled differences in the median values
#' \item \code{$p} The proportion of sampled differences which are > 0
#' \item \code{$nGenes} The number of genes sampled at each bootstrap iteration
#' \item \code{$nBoot} The number of bootstrap iterations
#' \item \code{$sampleSizes} A \code{data_frame} with the sample sizes for each dataset,
#' broken down into Expressed and Not Expressed genes.
#' \item \code{$testBins} The distributions of genes amongst the binning variable in the set of test IDs
#' \item \code{$refBins} The distributions of genes amongst the binning variable in the set of reference IDs.
#' The final column represents the sampling probability for each individual gene in the corresponding bin
#' \item \code{$missingBins} These are the bins not commonly represented in the dataset.
#' If any are found a non-fatal warning message will be printed during running of the process.
#'}
#' @import dplyr
#'
#' @export
bootMeans <- function(valCol, data, testIds, refIds,
idCol = 1L, binCol = "lengthBin", filt = "> -Inf",
nGenes = 1000L, nBoot = 100L,
minGenes = 200L, ..., na.rm=TRUE, replace = TRUE, maxP){
stopifnot(is.data.frame(data))
stopifnot(is.character(testIds))
stopifnot(is.character(refIds))
stopifnot(is.numeric(nGenes))
stopifnot(is.numeric(nBoot))
stopifnot(is.logical(na.rm), is.logical(replace))
nGenes <- as.integer(nGenes)
nBoot <-as.integer(nBoot)
if (length(nBoot) > 1){
nBoot <- nBoot[1]
message("length(nBoot) > 1. Only the first value will be used.")
}
if (nBoot > 1e6) stop("You cannot bootstrap more than a million times. Sorry...")
stopifnot(any(is.numeric(idCol), is.character(idCol)))
if (length(idCol) > 1) {
message("length(idCol) > 1. Only the first value will be used.")
idCol <- idCol[1]
}
if (is.numeric(idCol)) {
stopifnot(idCol <= ncol(data))
idCol <- as.integer(idCol)
}
if (is.character(idCol)){
idCol <- grep(idCol, colnames(data), value = TRUE)
if (length(idCol) != 1) stop("Unable to correctly identify idCol.\n Does it match multiple or no columns?")
}
data <- mutate(data, ID = data[[idCol]])
stopifnot(any(is.numeric(binCol), is.character(binCol)))
if (length(binCol) > 1) {
message("length(binCol) > 1. Only the first arguments will be used")
binCol <- binCol[1]
}
if (is.numeric(binCol)) {
stopifnot(binCol <= ncol(data))
binCol <- as.integer(binCol)
}
if (is.character(binCol)){
binCol <- grep(binCol, colnames(data), value = TRUE)
if (length(binCol) != 1) stop("Unable to correctly identify binCol.\n Does it match multiple or no columns?")
}
data <- mutate(data, bins = as.factor(data[[binCol]]))
stopifnot(any(is.character(valCol), is.numeric(valCol)))
if (is.numeric(valCol)){
valCol <- abs(as.integer(valCol))
stopifnot(valCol < ncol(data))
valCol <- colnames(data)[valCol]
}
if (is.character(valCol)){
#valCol <- grep(valCol, colnames(data), value = TRUE)
valCol <- match.arg(valCol, colnames(data))
}
if (length(valCol) != 1) stop("Invalid specification of the column name for the value column.\n",
"Either it doesn't exist, or it matches multiple columns.")
data <- filter(data, ID %in% c(testIds, refIds))
data <- filter_(data, paste(valCol, filt))
idMatch <- c(test = sum(data$ID %in% testIds),
ref = sum(data$ID%in% refIds))
if (nGenes > min(idMatch)) {
# If the number of genes to be sampled from (n) is less than that requested (m),
# reset the value to m = n - 3
nGenes <- min(idMatch) - 3
message("NB: The maximum number of genes able to be bootstrapped is ", nGenes, ".\n")
if (nGenes < minGenes) stop("Your test dataset is < ", minGenes, " IDs.\n",
"If this is expected behaviour, ",
"rerun the function changing the parameter minGenes to be below this value.\n",
"The number of matching IDs in your test & reference sets are ", paste(idMatch, collapse = " & "),
" respectively")
}
if(missing(maxP)) {
maxP <- 1 / nGenes
}
else {
stopifnot(is.numeric(maxP), maxP <= 1) # Ensure a numeric value <= 1
}
# Now just get the key information from the supplied data
data <- select(data, ID, bins, one_of(valCol))
data$test <- data$ID %in% testIds
allBins <- dcast(count_(data, vars=c("test", "bins")), bins~test, fill=0, value.var="n")
colnames(allBins)[2:3] <- c("ref", "test")
# Check the compatability of this dataset
missingBins <- filter(allBins, ref==0 | test == 0)
if (nrow(missingBins) > 0 ) {
# If some bins are missing from either dataset,
# print a message and set the sampling probability to be zero
missMess <- paste0("Some bins are not present in both datasets. This is not a fatal error, ",
"however, please be aware that\n",
sum(missingBins$test),
" genes (",
round(100*sum(missingBins$test)/length(testIds), 2),
"%) will be given a zero sampling probability in the test dataset, and\n",
sum(missingBins$ref),
" genes (",
round(100*sum(missingBins$ref)/length(refIds), 2),
"%) will be given a zero sampling probability in the reference dataset.\n",
"Please use your discretion to decide if this is acceptable.")
message(missMess)
allBins$test[allBins$bins %in% missingBins$bins] <- 0
allBins$ref[allBins$bins %in% missingBins$bins] <- 0
}
# Set up the test data frame
testData <- filter(data, test)
testBins <- select(allBins, bins, count = test)
testBins <- mutate(ungroup(testBins), p = count / sum(count))
# Set up the reference data frame
refData <- filter(data, !test)
refBins <- select(allBins, bins, count = ref)
refBins <- mutate(ungroup(refBins), p = count / sum(count))
refBins$effP <- testBins$p / refBins$count # This is the probability/gene
refBins$effP[is.nan(refBins$effP)] <- 0
# Assign bin weights for the reference dataset to ensure the matching sampling distributions
wRef <- refBins$effP[match(refData$bins, refBins$bin)]
refBins <- mutate(refBins, effP = effP / sum(wRef)) # Rescale effP to ensure a sum to 1
# Check for any points with potentially excessive influence.
# This defaults to checking for points with a probability of inclusion > 1 for each iteration
if (max(refBins$effP) > maxP) stop("Genes from one or more groups have probabilities of being resampled >",
maxP, "\n", refBins)
# The sampling function
sampleBoth <- function(testData, refData, nGenes, testBins, replace, wRef){
# Take the samples
testSamp <- sample_n(testData, nGenes, replace)
refSamp <- sample_n(refData, nGenes, replace, weight = wRef)
# Find the mean expression values & take the difference
mean(testSamp[[valCol]]) - mean(refSamp[[valCol]])
# testMeans <- summarise_each(testSamp, funs(mean), one_of(valCol))
# refMeans <- summarise_each(refSamp, funs(mean), one_of(valCol))
# means <- testMeans - refMeans
# means
}
# Now run the tilted bootstrap
out <- replicate(nBoot,
sampleBoth(testData, refData, nGenes, testBins, replace, wRef),
simplify = FALSE)
out <- unlist(out)
p <- mean(out > 0, na.rm =na.rm, ...)
# And the results
return(list(samples = out,
p = p,
nGenes = nGenes,
nBoot = nBoot,
sampleSizes = data_frame(dataset= c("test", "ref"),
Expressed = c(test = nrow(testData),
ref = nrow(refData)),
NotExpressed = c(length(testIds), length(refIds)) - Expressed),
testBins = testBins,
refBins = refBins,
missingBins = missingBins))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.