#' Generate simulated GWAS results
#'
#' Generate simulated GWAS results from the data given an effect size distribution
#' and distributions of the parameters for that distribution. Can accept ABC results
#' as produced by \code{\link{ABC_on_hyperparameters}} to generate a joint effect size distribution
#' using 2d kernal smoothing.
#'
#' GWAS results are generated by first drawing hyperparameter values from either the provided distributions
#' or from a joint distribution produced by kernal smoothing the results of \code{\link{ABC_on_hyperparameters}}
#' using \code{\link[GenKern]{KernSur}}. These values are then passed to the provided effect size distribution
#' in order to draw allele effect sizes. Phenotypes are then calculated for all individuals based on a heritiablity
#' randomly drawn from the provided heritability distribution. A population and family structure corrected GWAS is
#' then conducted on the phenotypes and genotypes using the \code{\link[GMMAT]{glmmkin}} and
#' \code{\link[GMMAT]{glmm.score}} functions using a genetic relationship matrix (GRM) between individuals as a covariate.
#' The GRM is calculated using the method introduced in Yang et al 2010 via \code{\link[AGHmatrix]{Gmatrix}}. Note that
#' this matrix should be identical to that produced by GCTA or other programs that use the same method.
#'
#' The resulting p-value distributions are then summarized by a wide range of statistics, which are returned for
#' comaparison to GWAS result from the real phenotypes.
#'
#' Note that very large datasets can result in huge memory and time requirements during this step. As such,
#' it is possible to pass genotypes as a \code{\link[bigstatsr]{FBM}} instead of a standard matrix/data.table/data.frame.
#' This will result in quicker phenotype calculations. In addition, it is possible to pre-subset the genotypic data and
#' produce GRMs, input data for \code{\link[GMMAT]{glmm.score}}, window identity data for the subset data, and snp
#' metadata for the subset and pass
#' this data alongside the full genotypic data. These will be used for the GWAS instead of the full data, which can result
#' in much faster run-times and much lower memory requirements. As long as the same input files are used for both
#' \code{\link{sim_gen}} and for calculating the real GWAS during downstream application, the overall method should
#' still be valid.
#'
#' @param x object coercible to a matrix or a \code{\link[bigstatsr]{FBM}}. Input genotypic data.
#' @param meta data.frame. Metadata for the snps \emph{included in the GWAS}, where the first column
#' is chromosome/scaffold information and the second is position in base pairs. Note that if a subset
#' of the SNPs are used for the GWAS via the pass_windows, pass_G, and GMMAT_infile options, this metadata
#' should correspond to those SNPs, not those in x.
#' @param iters numeric. The number of simulations to perform.
#' @param center logical, default TRUE. Determines if the phenotypes should be centered prior to the GWAS.
#' @param scheme character, default "gwas". The method to use for p-value/effect size estimation. Currenly
#' only supports gwas.
#' @param effect_distribution function, default \code{\link{rbayesB}}. The effect size distribution to use.
#' @param parameter_distributions list containing named functions, default
#' list(pi = function(x) rbeta(x, 25, 1), d.f = function(x) runif(x, 1, 100), scale = function(x) rbeta(x, 1, 3)*100).
#' Named functions giving the distributions from which to draw effect_size distribution hyperparameters.
#' @param save_effects character or FALSE, default FALSE. If true, simulated effects will be saved to filepath provided here.
#' @param provided_effects data.frame or NULL. If provided, a data.frame containing the effects to use for each iteration instead of
#' sampling from a distribution. Useful if data is pre-drawn from a distribution and screened for phenotypic match (via, for example, \code{\link{ABC_on_hyperparameters}}).
#' @param provided_effect_hyperparameters data.frame or NULL. If provided_effects are given, a data frame of the hyperparamters used to generate
#' those effects. Needed to generate consistent reporting of results.
#'
#' @export
sim_gen <- function(x, meta, iters, center = T, scheme = "gwas",
effect_distribution = rbayesB,
parameter_distributions = list(pi = function(x) rbeta(x, 25, 1),
d.f = function(x) runif(x, 1, 100),
scale = function(x) rbeta(x, 1, 3)*100),
h_dist = function(x) rep(.5, x),
# burnin = burnin, thin = thin, chain_length = chain_length, method = "BayesB",
par = 1, joint_res = NULL, joint_acceptance = NULL, joint_res_dist = "ks",
peak_delta = .5, peak_pcut = 0.0005, window_sigma = 50, phased = FALSE, maf = 0.05,
pass_windows = NULL, pass_G = NULL, GMMAT_infile = NULL, reg_res = NULL,
find_similar_effects = F, real_effects = NULL,
provided_effects = NULL, provided_effect_hyperparameters = NULL){
#============schem functions for one simulation=============
# gp <- function(x, pi, df, scale, method, t_iter, h, windows, center = center){
# pseudo <- generate_pseudo_effects(x, effect_distribution, parameters, h, center = center)
#
# cat("Beginning pseudo data", method, "run.\n")
# pseudo.pred <-pred(x, phenotypes = pseudo$p,
# burnin = burnin, thin = thin, chain_length = chain_length,
# prediction.program = "BGLR", prediction.model = method,
# runID = paste0(t_iter, "_pseudo"), verbose = F)
#
# stats <- dist_desc(pseudo.pred, meta, windows, peak_delta, peak_pcut, chr, pvals = F)
#
# return(list(stats = stats, e = pseudo$e))
# }
gwas <- function(x, effect_distribution, parameters = NULL, h, center = center,
t_iter, G, windows, phased = FALSE, GMMAT_infile = NULL, effects = NULL){
if(is.null(effects)){
pseudo <- generate_pseudo_effects(x, effect_distribution, parameters, h, center = center, phased = phased)
}
else{
pseudo <- vector("list", 2)
names(pseudo) <- c("e", "p")
pseudo$p <- get.pheno.vals(x = x, effect.sizes = effects, h = h, phased = phased)$p
pseudo$e <- effects
if(center){
pseudo$p <- pseudo$p - mean(pseudo$p)
}
}
stats <- calc_distribution_stats(x = x, meta = meta, phenos = pseudo$p, center = center,
scheme = "gwas", chr = colnames(meta)[1],
peak_delta = peak_delta, peak_pcut = peak_pcut, window_sigma = window_sigma,
burnin = NULL, thin = NULL, chain_length = NULL, maf = maf, phased = phased,
pass_windows = windows, pass_G = G, GMMAT_infile = GMMAT_infile, find_similar_effects = find_similar_effects, real_effects = real_effects)
return(stats)
}
loop_func <- function(x, effect_distribution, parameters, scheme,
t_iter, G = NULL, h, windows, center = center, phased = FALSE, effects = NULL){
# if(scheme == "gp"){
# dist <- gp(x, pi, df, scale, method, t_iter, h, windows, center = center)
# }
if(scheme == "gwas"){
dist <- gwas(x, effect_distribution, parameters = parameters, h = h, center = center,
t_iter = t_iter, windows = windows, G = G, phased = phased, GMMAT_infile = GMMAT_infile, effects = effects)
}
return(dist)
}
#============prep for simulations======================================
# if any joint parameter priors, calculate and disambiguate
# joint_parms <- names(parameter_distributions)[which(parameter_distributions == "joint")]
# parms <- as.data.frame(matrix(NA, iters, 0))
# if(length(joint_parms) > 0){
# parms <- cbind(parms, gen_parms(iters, joint_res, joint_acceptance, joint_parms, dist.var = joint_res_dist))
# }
# if(!is.null(reg_res)){
# parms <- cbind(parms, sample_joint_quantile(iters, reg_res))
# }
# if(ncol(parms) < length(parameter_distributions)){
# other_parms <- names(parameter_distributions)[which(!names(parameter_distributions) %in% colnames(parms))]
# run_parameters <- vector("list", length = length(other_parms))
# names(run_parameters) <- other_parms
# for(i in 1:length(run_parameters)){
# run_parameters[[i]] <- parameter_distributions[[other_parms[i]]](iters)
# }
# parms <- cbind(parms, as.data.frame(run_parameters))
# }
# run_parameters <- parms
# rm(parms)
if(is.null(provided_effects)){
run_parameters <- sample_parameters_from_distributions(parameter_distributions = parameter_distributions,
iters = iters,
joint_res = joint_res,
joint_acceptance = joint_acceptance,
joint_res_dist = joint_res_dist,
reg_res = reg_res)
h <- h_dist(iters)
}
else{
run_parameters <- NULL
if(is.function(h_dist)){
h <- h_dist(iters)
}
else{
h <- h_dist
}
}
# can pass a g matrix forward once if doing gwas
if(scheme == "gwas" & is.null(pass_G)){
G <- make_G(smart_transpose_and_unphase(x, phased), maf, par)
}
else if(scheme == "gwas" & !is.null(pass_G)){
G <- pass_G
rm(pass_G)
}
else{
G <- NULL
}
# pre-run the window function unless passed
if(is.null(pass_windows)){
windows <- mark_windows(meta, window_sigma, colnames(meta)[1])
}
else{
windows <- pass_windows
rm(pass_windows)
}
#============run the simulations===========================
# initialize storage
dist_output <- matrix(0, iters, ncol = number_descriptive_stats)
colnames(dist_output) <- names_descriptive_stats
# run the simulations
## serial
if(isFALSE(par) | par == 1){
for(i in 1:iters){
cat("Iter: ", i, ".\n")
dist_output[i,] <- loop_func(x, effect_distribution,
parameters = as.list(run_parameters[i,,drop = F]),
scheme = scheme,
t_iter = i, G = G, h = h[i], windows = windows, center = center, phased = phased,
effects = unlist(provided_effects[i,]))
}
if(is.null(run_parameters)){
ret <- cbind(provided_effect_hyperparameters, h = h, as.data.table(dist_output))
}
else{
ret <- cbind(as.data.table(run_parameters), h = h, as.data.table(dist_output))
}
if(!is.list(ret)){
ret <- as.data.frame(t(ret))
}
return(list(stats = ret, errors = list(parms = ret[-c(1:nrow(ret)),], msgs = character())))
}
# parallel
else{
stop("Parallel is not currently working. Problem is likely due to how files are saved during GMMAT in parallel. Fix with pre-generated temp file names?")
cl <- snow::makeSOCKcluster(par)
doSNOW::registerDoSNOW(cl)
# divide up into ncore chunks
it_set <- (1:iters)%%par
chunks <- list(parms = .smart_split(run_parameters, it_set),
dist_output = .smart_split(as.data.frame(dist_output), it_set),
h = .smart_split(as.data.frame(h), it_set),
effects = .smart_split(as.data.frame(provided_effects), it_set),
effect_hyper = .smart_split(as.data.frame(provided_effect_hyperparameters), it_set))
# prepare reporting function
progress <- function(n) cat(sprintf("Chunk %d out of", n), par, "is complete.\n")
opts <- list(progress=progress)
output <- foreach::foreach(q = 1:par, .inorder = FALSE,
.options.snow = opts, .packages = c("data.table", "GeneArchEst")
) %dopar% {
parm_chunk <- chunks$parm[[q]]
h_chunk <- unlist(chunks$h[[q]])
dist_chunk <- chunks$dist_output[[q]]
effect_chunk <- chunks$effects[[q]]
is.err <- numeric(0)
errs <- character(0)
# run once per iter in this chunk
for(i in 1:nrow(dist_chunk)){
b <- try(loop_func(x, effect_distribution,
parameters = as.list(parm_chunk[i,,drop = F]),
scheme = scheme,
t_iter = paste0(q, "_", i), G = G, h = h_chunk[i],
effects = unlist(effect_chunk[i,]),
windows = windows, center = center, phased = phased), silent = T)
if(class(b) == "try-error"){
is.err <- c(is.err, i)
errs <- c(errs, b)
}
else{
dist_chunk[i,] <- b
}
}
if(is.null(run_parameters)){
parm_chunk <- chunks$effect_hyper[[q]]
}
out <- vector("list", 2)
names(out) <- c("successes", "fails")
if(length(is.err) > 0){
out[[1]] <- cbind(parm_chunk[-is.err,], h = h_chunk[-is.err], dist_chunk[-is.err,])
out[[2]] <- list(error_msg = errs,
error_parms = cbind(parm_chunk[is.err,,drop = F], h = h_chunk[is.err]))
}
else{
out[[1]] <- cbind(parm_chunk, h = h_chunk, dist_chunk)
out[[2]] <- list(error_msg = character(0),
error_parms = as.data.frame(matrix(NA, ncol = ncol(parm_chunk) + 1, nrow = 1)))
colnames(out[[2]]$error_parms) <- c(colnames(parm_chunk), "h")
}
out
}
parallel::stopCluster(cl)
doSNOW::registerDoSNOW()
gc();gc()
dat <- dplyr::bind_rows(purrr::map(output, 1))
errs <- purrr::map(output, 2)
err_parms <- dplyr::bind_rows(purrr::map(errs, 2))
err_msgs <- unlist(purrr::map(errs, 1))
err_parms <- na.omit(err_parms)
errs <- list(parms = err_parms, msgs = err_msgs)
if(length(errs$msgs) > 0){
warning("Errors occured on some simulations. See named element 'errors' in returned list for details.\n")
}
else{
cat("Complete, no errors on any iterations.\n")
}
return(list(stats = dat, errors = errs))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.