#'Generate bootstrapped windowed statistics
#'
#'\code{do_bootstraps} creates a distribution of bootstrapped smoothed values
#'for requested statistics.
#'
#'Bootstraps are conducted as described by Hohenlohe et al. (2010). For each
#'bootstrap, this function draws random window position, then draws random
#'statistics from all provided SNPs to fill each observed position on that
#'window and calculates a smoothed statistic for that window using
#'Gaussian-smoothing. Note that a "position" column \emph{must} be present in
#'the snp metadata of the snpRdata object to do any window calculations.
#'
#'Bootstraps for multiple statistics can be calculated at once. If the
#'statistics argument is set to "all", all calculated stats will be run. If it
#'is set to "single", then all non-pairwise statistics will be bootstrapped, if
#'it is set to "pairwise", then all pairwise statistics will be bootstrapped.
#'Individual statistics can also be requested by name ("pi", "ho", etc.). All
#'statistics bootstrapped at the same time will be calculated using \emph{the
#'same randomly filled windows}, and thus do not represent independent
#'observations between statistics. This is still computationally more efficient,
#'however.
#'
#'The data can be broken up categorically by snp or sample metadata, as
#'described in \code{\link{Facets_in_snpR}}.
#'
#'As described in Hohenlohe et al. (2010), the contribution of individual SNPs
#'to window averages can be weighted by the number of observations per SNP by
#'setting the nk argument to TRUE, as is the default. For bootstraps, nk values
#'are randomly drawn for each SNP in each window.
#'
#'Possible centers for windows can either SNPs (if no step size is provided), or
#'every step kilobases from the 0 position of each snp level facet category
#'(chromosome, etc.).
#'
#'
#'If do.p is TRUE, calculates p-values for smoothed values of a statistic based
#'upon the bootstrapped null distribution of that statistic using an empirical
#'continuous distribution function. Note that in this case, the minimum possible
#'p-value for a window depends upon the number of bootstraps calculated (if only
#'a 1000 bootstraps were performed, the minimum possible p-value is about .001,
#'or one in a thousand.)
#'
#'Note that this function will return an error if equivalent windowed statistics
#'have not first been calculated for the designated facets (if the "chromosome"
#'facet is requested with a sigma of 200 and a step of 50, do_bootstraps will
#'error unless \code{\link{calc_smoothed_averages}} has not yet been run with
#'the same facet, sigma, and step values).
#'
#'@param x snpRdata object.
#'@param facets character or NULL, default NULL. Categories by which to break up
#' bootstraps.
#'@param boots numeric. Number of bootstraps to generate.
#'@param sigma numeric. Designates the width of windows in kilobases. Full
#' window size is 6*sigma.
#'@param step numeric or NULL, default default \code{2*sigma} (non-overlapping
#' windows). Designates the number of kilobases between each window centroid.
#' If NULL, windows are centered on each SNP.
#'@param statistics character. Designates the statistic(s) to smooth, typically
#' "all", "single", or "pairwise". See details for options.
#'@param nk logical, default TRUE. If TRUE, weights SNP contribution to window
#' averages by the number of observations at those SNPs.
#'@param par numeric or FALSE, default FALSE. If numeric, the number of cores to
#' use for parallel processing.
#'@param triple_sigma Logical, default TRUE. If TRUE, sigma will be tripled to
#' create windows of 6*sigma total.
#'@param gaussian Logical, default TRUE. If TRUE, windows will be gaussian
#' smoothed. If not, windows will be raw averages.
#'@param do.p logical, default TRUE. Determines if p-values should be calculated
#' for sliding windows.
#'@param p.alt character, default "two-sided". Specifies the alternative
#' hypothesis to be used. Options: \itemize{ \item "less": probability that a
#' bootstrapped value is as small or smaller than observed. \item "greater":
#' probability that a bootstrapped value is as large or larger than observed.
#' \item "two-sided": probability that a bootstrapped value is as or more
#' extreme than observed. }
#'@return A snpRdata object with bootstrapped windows merged in to the
#' window.bootstraps slot. If do.p is TRUE, it will also merge p values in for
#' bootstrapped statistics into the stats or window.stats sockets.
#'
#'@references Hohenlohe et al. (2010). \emph{PLOS Genetics}
#'@author William Hemstrom
#'@export
#'
#' @examples
#' # add statistics
#' dat <- calc_basic_snp_stats(stickSNPs, "chr", sigma = 200, step = 150)
#'
#' # do bootstraps
#' dat <- do_bootstraps(dat, facets = "chr", boots = 100,
#' sigma = 200, step = 150)
#'
#' # fetch results, bootstraps and then p-values on original stats
#' get.snpR.stats(dat, "chr", "bootstraps")
#' get.snpR.stats(dat, "chr", "single.window")
#'
do_bootstraps <- function(x, facets = NULL, boots, sigma, step = 2*sigma, statistics = "all",
nk = TRUE, par = FALSE, triple_sigma = TRUE, gaussian = TRUE,
do.p = TRUE, p.alt = "two-sided"){
#note: it is possible to run all sample level facets at once, so something like c("pop.fam.chr", "pop.chr") can
# be run simultaneously, with no need to loop across facets.
# However, SNP level facets create different windows, and so need to be run separately. Essentially,
# we need to do everything once for each unique snp level facet defined in the data.
#================sanity checks================
msg <- character(0)
o.facets <- facets
facets <- .check.snpR.facet.request(x, facets, "none")
# figure out which stats we are using!
pairwise.types <- c("fst")
single.types <- c("pi", "ho", "pa", "pHWE", "fis", "he")
all.types <- c(pairwise.types, single.types)
o.stats <- statistics
if("all" %in% statistics){
statistics <- all.types[which(all.types %in% c(colnames(x@stats), colnames(x@pairwise.stats)))]
}
else if("single" %in% statistics){
statistics <- single.types
}
else if("pairwise" %in% statistics){
statisitcs <- pairwise.types
}
stats.type <- character()
if(any(statistics %in% pairwise.types)){
stats.type <- "pairwise"
}
if(any(statistics %in% single.types)){
stats.type <- c(stats.type, "single")
}
# get mafs if doing any normal stats. Can make this more efficient by adding only where missing in the future.
if("single" %in% stats.type & nk){
what.stats <- .check_calced_stats(x, .check.snpR.facet.request(x, facets), "maf")
if(any(!unlist(what.stats))){
x <- calc_maf(x, .check.snpR.facet.request(x, facets)[which(!unlist(what.stats))])
}
x@stats$nk <- x@stats$maj.count + x@stats$min.count
}
# check that we've actually calculated windowed stats for the facet we are doing!
facet_details <- .check.snpR.facet.request(x, facets, "none", "TRUE")
for(i in 1:length(facets)){
split.facet <- .check.snpR.facet.request(x, unlist(.split.facet(facets[i])), "none", TRUE)
snp.part <- which(split.facet[[2]] == "snp")
snp.part <- split.facet[[1]][snp.part]
if(length(snp.part) == 0){
snp.part <- ".base"
}
samp.part <- which(split.facet[[2]] == "sample")
samp.part <- split.facet[[1]][samp.part]
if(length(samp.part) == 0){
samp.part <- ".base"
}
# check pairwise
if("pairwise" %in% stats.type){
has.samp.part.pw <- which(x@pairwise.window.stats$facet == paste0(samp.part, collapse = "."))
has.snp.part.pw <- which(x@pairwise.window.stats$snp.facet == paste0(snp.part, collapse = "."))
has.step.matches.pw <- which(x@pairwise.window.stats$step == step)
has.sigma.matches.pw <- which(x@pairwise.window.stats$sigma == sigma)
has.gaus.matches.pw <- which(x@pairwise.window.stats$gaussian == gaussian)
has.triple.matches.pw <- which(x@pairwise.window.stats$triple_sigma == triple_sigma)
has.both.parts.pw <- intersect(intersect(intersect(has.snp.part.pw, has.samp.part.pw),
intersect(has.step.matches.pw, has.sigma.matches.pw)),
intersect(has.gaus.matches.pw, has.triple.matches.pw))
}
if("single" %in% stats.type){
has.samp.part.s <- which(x@window.stats$facet == paste0(samp.part, collapse = "."))
has.snp.part.s <- which(x@window.stats$snp.facet == paste0(snp.part, collapse = "."))
has.step.matches.s <- which(x@window.stats$step == step)
has.sigma.matches.s <- which(x@window.stats$sigma == sigma)
has.gaus.matches.s <- which(x@window.stats$gaussian == gaussian)
has.triple.matches.s <- which(x@window.stats$triple_sigma == triple_sigma)
has.both.parts.s <- intersect(intersect(intersect(has.snp.part.s, has.samp.part.s),
intersect(has.step.matches.s, has.sigma.matches.s)),
intersect(has.gaus.matches.s, has.triple.matches.s))
}
# check single
if("pairwise" %in% stats.type){
if(length(has.both.parts.pw) == 0){
msg <- c(msg, paste0("Windowed stats with the same step and sigma not calculated for any pairwise statstistics for facet: ", facets[i], "."))
}
}
if("single" %in% stats.type){
if(length(has.both.parts.s) == 0){
msg <- c(msg, paste0("Windowed stats with the same step and sigma not calculated for any single statstistics for facet: ", facets[i], "."))
}
}
}
if(length(msg) > 0){
stop(paste0(msg, "\n"))
}
# run basic sanity checks
.sanity_check_window(x, sigma, 200, stats.type, nk, facets, statistics, good.types = all.types)
#===========subfunctions=======
get.grps <- function(snps, facet){
if(facet == ".base"){
grps <- rep(".base", length(snps))
return(grps)
}
grps <- x@snp.meta[snps, facet]
grps <- as.data.frame(grps)
colnames(grps) <- facet
grps <- do.call(paste0, grps)
return(grps)
}
# gaussian weights on data
do.gaus <- function(fws, stats, nk, tnk, sig){
if(gaussian){
if(nk){
gws <- matrix(gaussian_weight(fws[[1]], as.numeric(names(fws)), sig), nrow(stats), ncol(stats))* (tnk - 1)
}
else{
gws <- matrix(gaussian_weight(fws[[1]], as.numeric(names(fws)), sig), nrow(stats), ncol(stats))
}
}
else{
if(nk){
gws <- matrix(1, nrow(stats), ncol(stats))* (tnk - 1) # equal weights
}
else{
gws <- matrix(1, nrow(stats), ncol(stats))
}
}
gwp <- gws * stats
return((colSums(gwp, na.rm = T)/colSums(gws, na.rm = T)))
}
# runs the bootstrapping on a set of data.
# this function should be given all of the stats in wide form, so multiple sample level facets can be done at once!
run.bootstrap.set <- function(fws, nrands, stats = NULL, stats.type, n_snps, nk, snk, pnk, part.cols){
# zero fixer subfunction
fix.zeros <- function(x){
zeros <- x == 0
if(any(zeros)){
x[zeros] <- 1
}
return(x)
}
#==========run the bootstrap=======
cpos <- as.numeric(names(fws))
spos <- as.numeric(unlist(fws))
# fix zeros and make a simple nk for when using just one stat.
if(length(part.cols) == 1){
if(!is.null(pnk[1,1])){
use.nk <- pnk
}
else{
use.nk <- snk
}
use.nk <- fix.zeros(use.nk)
}
else{
snk <- fix.zeros(snk)
pnk <- fix.zeros(pnk)
}
tracker <- 1
percentage <- 0
stats.out <- matrix(0, boots, ncol(stats))
colnames(stats.out) <- colnames(stats)
cat("Bootstrap Progress:\n0%\n")
for (j in 1:boots){
npercentage <- j/boots
if(npercentage - percentage >= 0.05){
cat(paste0(round(npercentage*100), "%"), "\n")
percentage <- npercentage
}
trows <- tracker:(tracker + n_snps[j] - 1)
# figure out nk
if(!is.null(part.cols$single) & !is.null(part.cols$pairwise)){
tnk <- cbind(snk, pnk)[trows,]
}
else{
tnk <- use.nk[trows,]
}
# get smoothed stats and save output
stats.out[j,] <- do.gaus(fws[j], stats[trows,], nk, tnk, sig)
tracker <- trows[length(trows)] + 1
}
# prepare and return output
return(stats.out)
}
# gets a list containing the position of snps within each window. Note that names are window centroids
get.window.info <- function(cs, sig, position, cgrps = NULL, all.grps = NULL){
ends <- cs + ifelse(triple_sigma, sig*3, sig)
starts <- cs - ifelse(triple_sigma, sig*3, sig)
lmat <- outer(position, starts, function(pos, starts) pos >= starts)
lmat <- lmat + outer(position, ends, function(pos, ends) pos <= ends)
if(!is.null(cgrps)){
lmat <- lmat + outer(all.grps, cgrps, function(all.grps, cgrps) all.grps == cgrps)
}
colnames(lmat) <- cs
rownames(lmat) <- position
if(!is.null(cgrps)){
lmat <- ifelse(lmat == 3, TRUE, FALSE)
}
else{
lmat <- ifelse(lmat == 2, TRUE, FALSE)
}
n_snps <- colSums(lmat)
# map positions to TRUEs
lmat[lmat == FALSE] <- NA
pos.mat <- matrix(position, nrow = nrow(lmat), ncol = ncol(lmat))
lmat <- lmat * pos.mat
lmat <- as.list(as.data.frame(lmat))
return(list(lmat = lmat, n_snps = n_snps))
}
# prepares a single snp facet to run
prep.one.snp.facet <- function(x, facet.info, stats = NULL, pairwise.stats = NULL){
snk <- NULL
pnk <- NULL
#============prepare a centroid list=========
# elements named for all of the random centroids, contain a vector of the positions of snps in those windows
# make sure to save a vector of the number of snps per window (n_snps)
if(is.null(step)){
csnps <- sample(1:nrow(x), boots, replace = T) #get random centroids
cs <- position[csnps] #get centroid positions
# get centroid and all snp groups for this facet
if(!is.null(snp.facets[1])){
cgrps <- get.grps(csnps, names(facet.info))
all.grps <- get.grps(1:nrow(x), names(facet.info))
}
# draw all of the random snps to fill in around centroids on windows now. First need to figure out snp count in each window.
## to do this, figure out which snps are in the csnp windows.
fws <- get.window.info(cs, sig, position, cgrps, all.grps)
n_snps <- fws$n_snps
# remove any empty windows
empties <- n_snps == 0
if(any(empties)){
fws <- fws[-which(empties)]
n_snps <- n_snps[-which(empties)]
}
fws <- lapply(fws$lmat, stats::na.omit)
## draw the random snps
nrands <- sample(1:nrow(x), length(unlist(fws)), replace = T)
}
else{ #same deal, but for fixed slide windows from provided window information
# go through each group and get the windows
grps <- get.grps(1:nrow(x), names(facet.info))
u.grps <- unique(grps)
fws <- vector("list")
n_snps <- numeric()
for(i in 1:length(u.grps)){
t.pos <- x@snp.meta$position[grps == u.grps[i]]
cs <- seq(0, max(t.pos), by = step)
tfws <- get.window.info(cs, sig, t.pos)
n_snps <- c(n_snps, tfws$n_snps)
fws <- c(fws, tfws$lmat)
}
# remove any empty windows
empties <- n_snps == 0
if(any(empties)){
fws <- fws[-which(empties)]
n_snps <- n_snps[-which(empties)]
}
# clean up the window info list
fws <- lapply(fws, stats::na.omit)
# get whatever info we can before passing to a looping function
cwin <- sample(1:length(fws), boots, replace = T)
fws <- fws[cwin]
n_snps <- n_snps[cwin]
cs <- as.numeric(names(fws))
}
#============draw random snps to assign to each position in each window========
nrands <- sample(1:nrow(x), sum(n_snps), replace = T)
#============prepare the stats and/or pairwise stats=========
# put them into long form across sample level facets
# cbind stats and pairwise stats, creating an nk vector to pass and saving facet and subfacet/comparison data
facet.meta <- character()
subfacet.meta <- facet.meta
stattype.meta <- character()
# get nk info and cast each type
if("single" %in% stats.type){
if(!data.table::is.data.table(stats)){
stats <- data.table::as.data.table(stats)
}
# get only data for the correct sample facets
stats <- stats[stats$facet %in% unlist(facet.info),]
# melt to put different stat/facet+subfacets in one row via snp id
stat.cols <- statistics[which(statistics %in% single.types)]
meta.cols <- c("facet", "subfacet", ".snp.id")
keep.cols <- c(meta.cols, stat.cols)
stattype.meta <- rep("single", length(facet.meta))
if(nk){ # grab a cast nk as well...
nk.meta <- c(meta.cols, "nk")
snk <- data.table::dcast(stats[,nk.meta, with = FALSE], .snp.id~facet + subfacet, value.var = "nk")
snk <- snk[nrands,-".snp.id"]
## need to duplicate columns, since each nk value will be used for multiple stats!
col.seq <- rep(1:ncol(snk), sum(statistics %in% single.types))
snk <- snk[,col.seq, with = FALSE]
}
stats <- stats[,keep.cols, with = FALSE]
stats <- data.table::dcast(stats, .snp.id~facet + subfacet, value.var = stat.cols, sep = "....")
# subset according to the snps we are using
stats <- stats[nrands, -".snp.id"]
# fix column names if needed.
if(sum(statistics %in% single.types) == 1){
colnames(stats) <- paste0(statistics[which(statistics %in% single.types)], "....", colnames(stats))
}
}
if("pairwise" %in% stats.type){
if(!data.table::is.data.table(pairwise.stats)){
pairwise.stats <- data.table::as.data.table(pairwise.stats)
}
# get only data for the correct sample facets
pairwise.stats <- pairwise.stats[pairwise.stats$facet %in% unlist(facet.info),]
# melt to put different stat/facet+subfacets in one row via snp id
stat.cols <- statistics[which(statistics %in% pairwise.types)]
meta.cols <- c("facet", "comparison", ".snp.id")
keep.cols <- c(meta.cols, stat.cols)
if(nk){ # grab a cast nk as well...
nk.meta <- c(meta.cols, "nk")
pnk <- data.table::dcast(pairwise.stats[,nk.meta, with = FALSE], .snp.id~facet + comparison, value.var = "nk")
pnk <- pnk[nrands,-".snp.id"]
col.seq <- rep(1:ncol(pnk), sum(statistics %in% pairwise.types))
pnk <- pnk[,col.seq, with = FALSE]
}
pairwise.stats <- pairwise.stats[,keep.cols, with = FALSE]
pairwise.stats <- data.table::dcast(pairwise.stats, .snp.id~facet + comparison, value.var = stat.cols, sep = "....")
# subset according to the snps we are using
pairwise.stats <- pairwise.stats[nrands, -".snp.id"]
# fix column names if needed.
if(sum(statistics %in% pairwise.types) == 1){
colnames(pairwise.stats) <- paste0(statistics[which(statistics %in% pairwise.types)], "....", colnames(pairwise.stats))
}
}
#============bind and prepare column type info================
part.cols <- vector("list", length(stats.type))
names(part.cols) <- stats.type
if(any(statistics %in% single.types) & any(statistics %in% pairwise.types)){
bound.stats <- cbind(stats, pairwise.stats)
part.cols$single <- ncol(stats)
part.cols$pairwise <- ncol(pairwise.stats)
}
else if(any(statistics %in% single.types)){
bound.stats <- stats
part.cols$single <- ncol(stats)
}
else{
bound.stats <- pairwise.stats
part.cols$pairwise <- ncol(pairwise.stats)
}
return(list(fws = fws, n_snps = n_snps, nrands = nrands, stats = bound.stats, snk = snk, pnk = pnk, part.cols = part.cols))
}
# melt a bootstrap output back into a long form and add some extra metadata
melt.bootstrap <- function(boot.set, sigma, nk, step){
# melt data
boot.set <- data.table::as.data.table(boot.set)
suppressWarnings(boot.set <- data.table::melt(boot.set))
# get metadata
facet <- unlist(strsplit(as.character(boot.set$variable), "\\.\\.\\.\\."))
stats <- facet[seq(1, length(facet), by = 3)]
subfacet <- facet[seq(3, length(facet), by = 3)]
facet <- facet[seq(2, length(facet), by = 3)]
# set metadata
set(boot.set, j = "facet", value = facet)
set(boot.set, j = "subfacet", value = subfacet)
set(boot.set, j = "stat", value = stats)
set(boot.set, j = "sigma", value = sigma)
set(boot.set, j = "nk", value = nk)
if(is.null(step)){
set(boot.set, j = "step", value = NA)
}
else{
set(boot.set, j = "step", value = step/1000)
}
# rearrange
ord <- c(3, 4, 5, 6, 7, 8, 2)
boot.set <- boot.set[,ord, with = FALSE]
return(boot.set)
}
# wrapper function to run a single facet for a given number of boots
boot.wrapper <- function(x, nk, stats.type, sigma, step){
boot.out <- run.bootstrap.set(fws = x$fws,
nrands = x$nrands,
stats = x$stats,
nk = nk,
snk = x$snk, pnk = x$pnk,
stats.type = stats.type,
n_snps = x$n_snps,
part.cols = x$part.cols)
boot.out <- melt.bootstrap(boot.out, sigma, nk, step)
return(boot.out)
}
#===========initialize=========
# figure out what different snp level facets we have, and figure out which facets have which snp levels
snp.facets <- .check.snpR.facet.request(x, facets, remove.type = "sample")
snp.facet.matches <- lapply(snp.facets, grep, x = facets)
names(snp.facet.matches) <- snp.facets
snp.facet.matches <- lapply(snp.facet.matches, function(y){
lapply(facets[y], function(z) .check.snpR.facet.request(x, z))
})
snp.facet.matches <- lapply(snp.facet.matches, unlist) # this is now a list with an entry for each snp level facet containing the pop level facets to run.
# remove empty facets (sample level only facets)
snp.facet.matches <- Filter(Negate(is.null), snp.facet.matches)
# add in .base if needed, for sample level only facets
samp.facets <- .check.snpR.facet.request(x, facets, "none", T)
samp.facets <- samp.facets[[1]][which(samp.facets[[2]] == "sample")]
if(length(samp.facets) > 0){
for(i in 1:length(samp.facets)){
snp.facet.matches <- c(snp.facet.matches, .base = samp.facets[i])
}
}
# for snp level only facets, change the NULL to ".base"
nulls <- which(unlist(lapply(snp.facet.matches, function(x) any(is.null(x)))))
if(length(nulls) > 0){
snp.facet.matches[[nulls]] <- ".base"
}
#report a few things, intialize others
sig <- 1000*sigma #correct sigma
if(!is.null(step)){
step <- 1000*step
}
num_snps <- nrow(x)
cat("Sigma:", sigma, "\nTotal number of input snps: ", num_snps, "\nNumber of boots:", boots, "\n")
position <- x@snp.meta$position
#===========run a loop to do bootstraps for each snp level facet==============
out <- vector("list", length(snp.facet.matches))
names(out) <- names(snp.facet.matches)
cat("Unique snp level facets:", paste0(names(snp.facet.matches), collapse = ", "), "\n")
for(i in 1:length(snp.facet.matches)){
cat("Bootstrap facet:\n", names(snp.facet.matches[i]), "\n")
# prepare data
prepped.facet <- prep.one.snp.facet(x, snp.facet.matches[i], x@stats, x@pairwise.stats)
# run in serial
if(par == FALSE){
out[[i]] <- boot.wrapper(x = prepped.facet,
nk = nk,
stats.type = stats.type,
sigma = sigma,
step = step)
set(out[[i]], j = "snp.facet", value = names(snp.facet.matches[i]))
}
# run in parallel
else{
cl <- parallel::makePSOCKcluster(par)
doParallel::registerDoParallel(cl)
#prepare reporting function
task_list <- split(1:boots, sort((1:boots)%%par))
ntasks <- par
# progress <- function(n) cat(sprintf("\tPart %d out of", n), ntasks, "is complete.\n")
# opts <- list(progress=progress)
tout <- foreach::foreach(q = 1:ntasks, .inorder = FALSE,
.export = "data.table") %dopar% {
# extract the correct parts of the prepped.facet
tpf <- prepped.facet
tpf$fws <- tpf$fws[task_list[[q]]]
tpf$n_snps <- tpf$n_snps[task_list[[q]]]
# for nrands, need to figure out where to start and end
if(q != 1){
prior.sum <- sum(prepped.facet$n_snps[1:(task_list[[q]][1] - 1)])
}
else{
prior.sum <- 1
}
post.sum <- sum(prepped.facet$n_snps[1:(task_list[[q]][length(task_list[[q]])])])
tpf$nrands <- tpf$nrands[prior.sum:post.sum]
# do the bootstrapp
boots <- length(task_list[[q]])
boot.wrapper(x = tpf,
nk = nk,
stats.type = stats.type,
sigma = sigma,
step = step)
}
#release cores
parallel::stopCluster(cl)
# bind
out[[i]] <- data.table::rbindlist(tout)
set(out[[i]], j = "snp.facet", value = names(snp.facet.matches[i]))
}
}
#=============bind and return====================
out <- data.table::rbindlist(out)
out$triple_sigma <- triple_sigma
out$gaussian <- gaussian
col.ord <- c(1, 2, (ncol(out) - 2):ncol(out), 3:(ncol(out) - 3))
out <- out[,col.ord, with = FALSE]
x@window.bootstraps <- rbind(x@window.bootstraps, out)
if(do.p){
x <- calc_p_from_bootstraps(x, facets, o.stats, alt = p.alt, par = par)
}
x <- .update_citations(x, "Hohenlohe2010", "window_bootstraps", "Bootstrapping of statistics across windows for p-value generation")
return(x)
}
#calculates a p-value for a stat based on a null distribution calculated via bootstraps of that stat.
#inputs: x: vector of observed statistics
# dist: vector of statistics bootstrapped from same stat.
# alt: Alternative hypothesis for p-value. Can be less, greater, or two-sided (default).
#'Calculate p-values from bootstrapped distributions.
#'
#'\code{calc_p_from_bootstraps} finds p-values for observed \emph{smoothed
#'window} statistics from bootstrapped distributions, such as produced by
#'\code{\link{do_bootstraps}}.
#'
#'Calculates p-values for smoothed values of a statistic based upon a
#'bootstrapped null distribution of that statistic using an empirical continuous
#'distribution function.
#'
#'p-values can be generated for specific snp or sample metadata categories
#'using the facets argument, as described in \code{\link{Facets_in_snpR}}. Only
#'facets for which bootstrap data and raw statistical data have both been
#'calculated will be run. "all" and NULL follow the typical facet rules.
#'
#'Likewise, p-values can be generated for specific statistics using the
#'statistics argument. Only statistics for which bootstrap data and raw
#'statistical data have both been calculated will be run. By default, all stats
#'for which a bootstrap null distribution has been generated will be run.
#'
#'@param x snpRdata object.
#'@param facets character, default "all". Facets to use.
#'@param statistics character, default "all". Vector naming the statistics to
#' calculate p-values for. By default calculates p-values for all possible
#' stats.
#'@param alt character, default "two-sided". Specifies the alternative
#' hypothesis to be used. Options: \itemize{ \item "less": probability that a
#' bootstrapped value is as small or smaller than observed. \item "greater":
#' probability that a bootstrapped value is as large or larger than observed.
#' \item "two-sided": probability that a bootstrapped value is as or more
#' extreme than observed. }
#'@param par numeric or FALSE, default FALSE. If numeric, the number of cores to
#' use for parallel processing.
#'@param fwe_method character, default c("bonferroni", "holm", "BH", "BY"). Type
#' of Family-Wise Error correction (multiple testing correction) to use. For
#' details and options, see \code{\link[stats]{p.adjust}}.
#'@param fwe_case character, default c("by_facet", "by_subfacet", "overall").
#' How should Family-Wise Error correction (multiple testing correction) be
#' applied? \itemize{\item{"by_facet":} Each facet supplied (such as pop or
#' pop.fam) is treated as a set of tests. \item{"by_subfacet":} Each level of
#' each subfacet is treated as a separate set of tests. \item{"overall":} All
#' tests are treated as a set.}
#'
#'
#'@return snpRdata object, with p-values merged into the stats or pairwise.stats
#' sockets.
#'
#'@seealso ecdf
#'
#'@export
#'@author William Hemstrom
#'
#' @examples
#' \dontrun{
#' # add statistics and generate bootstraps
#' x <- calc_basic_snp_stats(stickSNPs, c("chr.pop"), sigma = 200, step = 150)
#' x <- do_bootstraps(x, facets = c("chr.pop"), boots = 1000, sigma = 200, step = 150)
#' x <- calc_p_from_bootstraps(x)
#' get.snpR.stats(x, "chr.pop", "single.window") # pi, ho, etc
#' get.snpR.stats(x, "chr.pop", "pairwise.window") # fst
#' }
calc_p_from_bootstraps <- function(x, facets = "all", statistics = "all", alt = "two-sided", par = FALSE,
fwe_method = "BY",
fwe_case = c("by_facet", "overall")){
#==========sanity checks==========
if(!is.snpRdata(x)){
stop("x is not a snpRdata object.\n")
}
msg <- character()
# check statistics
if(!"all" %in% statistics){
miss.stats <- which(!statistics %in% unique(x@window.bootstraps$stat))
if(length(miss.stats) > 0){
msg <- c(msg,
paste0("Some statistics not found in bootstraps: ", paste0(miss.stats, collapse = ", "), "\n"))
}
}
if(length(msg) > 0){
stop(msg)
}
#===========subfunctions=======
# finds matching rows for given facet, etc
get.matches <- function(y, facet, subfacet, snp.facet, sigma, nk, step, triple_sigma, gaussian, statistic){
matches <- y$facet == facet &
y$subfacet == subfacet &
y$snp.facet == snp.facet &
y$sigma == sigma &
y$nk == nk &
y$step == step &
y$gaussian == gaussian &
y$triple_sigma == triple_sigma
if(any(colnames(y) == "stat")){
matches <- (y$stat == statistic) & matches
}
return(which(matches))
}
# extracts an edf with the matching bootstrap info and calculates p-values
get.one.pvalue <- function(x, facet, subfacet, snp.facet, statistic, nk, step, sigma, triple_sigma, gaussian, alt){
# grab the matching raw statistical data
matches <- get.matches(x@window.bootstraps, facet, subfacet, snp.facet, sigma, nk, step,
triple_sigma, gaussian, statistic)
# create a cumulative distibution function of the distribution
edist <- stats::ecdf(x@window.bootstraps[matches,]$value)
# get p-values
## get the matching statistic data
type <- ifelse(statistic %in% colnames(x@window.stats), "single", "pairwise")
if(type == "single"){
scol <- which(colnames(x@window.stats) == statistic)
meta.cols <- 1:which(colnames(x@window.stats) == "triple_sigma")
matches <- get.matches(x@window.stats, facet, subfacet, snp.facet, sigma, nk, step, triple_sigma, gaussian, statistic)
meta <- x@window.stats[matches, meta.cols, with = FALSE]
matches <- x@window.stats[matches, scol, with = FALSE]
}
else{
scol <- which(colnames(x@pairwise.window.stats) == statistic)
meta.cols <- 1:which(colnames(x@pairwise.window.stats) == "triple_sigma")
matches <- get.matches(x@pairwise.window.stats, facet, subfacet, snp.facet, sigma, nk, step, triple_sigma, gaussian, statistic)
meta <- x@pairwise.window.stats[matches, meta.cols, with = FALSE]
matches <- x@pairwise.window.stats[matches, scol, with = FALSE]
}
if(nrow(meta) == 0){
return(cbind(meta, stat = character(), p = numeric()))
} # this may occur if there were no stats in this level in the original data
## get the proportion of the bootstrapped distribution less than or equal to the observed point.
xp <- edist(matches[[1]])
if(alt == "two-sided"){
#Take abs(.5 - xp), which is the proportion of the distribution between the point and the distribution mean.
#Multiply that by 2 to get the proportion of the distribution between that point and the distirbution mean
#as well as between the mean and a point equally extreme in the other direction.
#take 1 minus that to get the proportion of the as or more extreme
xp <- 1 - (abs(xp - .5)*2)
}
else if (alt == "greater"){
#get the proportion of data greater than observed data points
xp <- 1 - xp
}
# bind and return
out <- cbind(meta, stat = statistic, p = xp)
return(out)
}
#===========loop through all of the requested facets/statistics=========
# generate a matrix containing all of the possible tasks
u.rows <- unique(x@window.bootstraps[,1:(which(colnames(x@window.bootstraps) == "value") - 1)])
# figure out which parts of the unique tasks are part of the requested facet
if(facets[1] != "all"){
keep.rows <- logical(nrow(u.rows))
facets <- .check.snpR.facet.request(x, facets, "none", T)
# loop through each facet
for(i in 1:length(facets[[1]])){
# for sample facets
if(facets[[2]][[i]] == "sample"){
keep.rows[which(u.rows$facet == facets[[1]][[i]])] <- T
}
# for complex facets
else if(facets[[2]][[i]] == "complex"){
u.facet <- unlist(.split.facet(facets[[1]][[i]]))
split.parts <- .check.snpR.facet.request(x, u.facet, remove.type = "none", return.type = T)
samp.part <- split.parts[[1]][which(split.parts[[2]] == "sample")]
snp.part <- split.parts[[1]][which(split.parts[[2]] == "snp")]
keep.samp <- which(u.rows$facet == samp.part)
keep.snp <- which(u.rows$snp.facet == snp.part)
keep.rows[intersect(keep.samp, keep.snp)] <- T
}
else if(facets[[2]][[i]] == "snp"){
keep.rows[which(u.rows$snp.facet == facets[[1]][[i]])] <- T
}
else{
keep.rows[which(u.rows$facet == ".base" & u.rows$snp.facet == ".base")] <- T
}
}
u.rows <- u.rows[keep.rows,]
}
# initialize
out <- vector("list", nrow(u.rows))
# run the loop
if(par == FALSE){
for(q in 1:nrow(u.rows)){
out[[q]] <- get.one.pvalue(x, facet = u.rows$facet[q],
subfacet = u.rows$subfacet[q],
snp.facet = u.rows$snp.facet[q],
statistic = u.rows$stat[q],
nk = u.rows$nk[q],
step = u.rows$step[q],
sigma = u.rows$sigma[q],
triple_sigma = u.rows$triple_sigma[q],
gaussian = u.rows$gaussian[q],
alt = alt)
}
}
else{
cl <- parallel::makePSOCKcluster(par)
doParallel::registerDoParallel(cl)
#prepare reporting function
ntasks <- nrow(u.rows)
# progress <- function(n) cat(sprintf("\tPart %d out of", n), ntasks, "is complete.\n")
# opts <- list(progress=progress)
out <- foreach::foreach(q = 1:ntasks, .inorder = FALSE,
.export = "data.table") %dopar% {
get.one.pvalue(x, facet = u.rows$facet[q],
subfacet = u.rows$subfacet[q],
snp.facet = u.rows$snp.facet[q],
statistic = u.rows$stat[q],
nk = u.rows$nk[q],
step = u.rows$step[q],
sigma = u.rows$sigma[q],
alt = alt,
gaussian = u.rows$gaussian[q],
triple_sigma = u.rows$triple_sigma[q])
}
#release cores
parallel::stopCluster(cl)
}
#===========bind, cast, and merge===========
# bind the result
out <- data.table::rbindlist(out)
# melt
meta.cols <- colnames(out)[-ncol(out)]
meta.cols <- meta.cols[-which(meta.cols == "stat")]
# merge
if(any(out$stat == "fst")){
cout <- data.table::dcast(out[which(out$stat == "fst"),], facet + subfacet + snp.facet + snp.subfacet + position + sigma + n_snps + step + nk.status + gaussian + triple_sigma ~ stat, value.var = "p")
colnames(cout)[(which(colnames(cout) == "triple_sigma") + 1):ncol(cout)] <- paste0("p_", colnames(cout)[(which(colnames(cout) == "triple_sigma") + 1):ncol(cout)])
# do fwe
cout <- .fwe_correction(cout, levs = c("facet", "subfacet"), pcol = "p_fst", methods = fwe_method, case = fwe_case)
x <- .merge.snpR.stats(x, cout, type = "pairwise.window.stats")
}
if(any(out$stat != "fst")){
s.stats <- which(out$stat != "fst")
cout <- data.table::dcast(out[s.stats,], facet + subfacet + snp.facet + snp.subfacet + position + sigma + n_snps + step + nk.status + gaussian + triple_sigma ~ stat, value.var = "p")
colnames(cout)[(which(colnames(cout) == "triple_sigma") + 1):ncol(cout)] <- paste0("p_", colnames(cout)[(which(colnames(cout) == "triple_sigma") + 1):ncol(cout)])
# do fwe
p_cols <- colnames(cout)[grep("p_", colnames(cout))]
cout_comb <- vector("list", length(p_cols))
for(i in 1:length(p_cols)){
cout_comb[[i]] <- .fwe_correction(cout, levs = c("facet", "subfacet"), pcol = p_cols[i], methods = fwe_method, case = fwe_case)
}
invisible(suppressMessages(cout <- purrr::reduce(cout_comb, dplyr::left_join)))
x <- .merge.snpR.stats(x, cout, type = "window.stats")
}
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.