R/sim_gen.R

Defines functions sim_gen

Documented in sim_gen

#' Generate simulated GWAS results
#'
#' Generate simulated GWAS results from the data given an effect size distribution
#' and distributions of the parameters for that distribution. Can accept ABC results
#' as produced by \code{\link{ABC_on_hyperparameters}} to generate a joint effect size distribution
#' using 2d kernal smoothing.
#'
#' GWAS results are generated by first drawing hyperparameter values from either the provided distributions
#' or from a joint distribution produced by kernal smoothing the results of \code{\link{ABC_on_hyperparameters}}
#' using \code{\link[GenKern]{KernSur}}. These values are then passed to the provided effect size distribution
#' in order to draw allele effect sizes. Phenotypes are then calculated for all individuals based on a heritiablity
#' randomly drawn from the provided heritability distribution. A population and family structure corrected GWAS is
#' then conducted on the phenotypes and genotypes using the \code{\link[GMMAT]{glmmkin}} and
#' \code{\link[GMMAT]{glmm.score}} functions using a genetic relationship matrix (GRM) between individuals as a covariate.
#' The GRM is calculated using the method introduced in Yang et al 2010 via \code{\link[AGHmatrix]{Gmatrix}}. Note that
#' this matrix should be identical to that produced by GCTA or other programs that use the same method.
#'
#' The resulting p-value distributions are then summarized by a wide range of statistics, which are returned for
#' comaparison to GWAS result from the real phenotypes.
#'
#' Note that very large datasets can result in huge memory and time requirements during this step. As such,
#' it is possible to pass genotypes as a \code{\link[bigstatsr]{FBM}} instead of a standard matrix/data.table/data.frame.
#' This will result in quicker phenotype calculations. In addition, it is possible to pre-subset the genotypic data and
#' produce GRMs, input data for \code{\link[GMMAT]{glmm.score}},  window identity data for the subset data, and snp
#' metadata for the subset and pass
#' this data alongside the full genotypic data. These will be used for the GWAS instead of the full data, which can result
#' in much faster run-times and much lower memory requirements. As long as the same input files are used for both
#' \code{\link{sim_gen}} and for calculating the real GWAS during downstream application, the overall method should
#' still be valid.
#'
#' @param x object coercible to a matrix or a \code{\link[bigstatsr]{FBM}}. Input genotypic data.
#' @param meta data.frame. Metadata for the snps \emph{included in the GWAS}, where the first column
#'   is chromosome/scaffold information and the second is position in base pairs. Note that if a subset
#'   of the SNPs are used for the GWAS via the pass_windows, pass_G, and GMMAT_infile options, this metadata
#'   should correspond to those SNPs, not those in x.
#' @param iters numeric. The number of simulations to perform.
#' @param center logical, default TRUE. Determines if the phenotypes should be centered prior to the GWAS.
#' @param scheme character, default "gwas". The method to use for p-value/effect size estimation. Currenly
#'   only supports gwas.
#' @param effect_distribution function, default \code{\link{rbayesB}}. The effect size distribution to use.
#' @param parameter_distributions list containing named functions, default
#'   list(pi = function(x) rbeta(x, 25, 1), d.f = function(x) runif(x, 1, 100), scale = function(x) rbeta(x, 1, 3)*100).
#'   Named functions giving the distributions from which to draw effect_size distribution hyperparameters.
#' @param save_effects character or FALSE, default FALSE. If true, simulated effects will be saved to filepath provided here.
#' @param provided_effects data.frame or NULL. If provided, a data.frame containing the effects to use for each iteration instead of
#'   sampling from a distribution. Useful if data is pre-drawn from a distribution and screened for phenotypic match (via, for example, \code{\link{ABC_on_hyperparameters}}).
#' @param provided_effect_hyperparameters data.frame or NULL. If provided_effects are given, a data frame of the hyperparamters used to generate
#'   those effects. Needed to generate consistent reporting of results.
#'
#' @export
sim_gen <- function(x, meta, iters, center = T, scheme = "gwas",
                    effect_distribution = rbayesB,
                    parameter_distributions = list(pi = function(x) rbeta(x, 25, 1),
                                                   d.f = function(x) runif(x, 1, 100),
                                                   scale = function(x) rbeta(x, 1, 3)*100),
                    h_dist = function(x) rep(.5, x),
                    # burnin = burnin, thin = thin, chain_length = chain_length, method = "BayesB",
                    par = 1, joint_res = NULL, joint_acceptance = NULL, joint_res_dist = "ks",
                    peak_delta = .5, peak_pcut = 0.0005, window_sigma = 50, phased = FALSE, maf = 0.05,
                    pass_windows = NULL, pass_G = NULL, GMMAT_infile = NULL, reg_res = NULL,
                    find_similar_effects = F, real_effects = NULL,
                    provided_effects = NULL, provided_effect_hyperparameters = NULL){

  #============schem functions for one simulation=============
  # gp <- function(x, pi, df, scale, method, t_iter, h, windows, center = center){
  #   pseudo <- generate_pseudo_effects(x, effect_distribution, parameters, h, center = center)
  #
  #   cat("Beginning pseudo data", method, "run.\n")
  #   pseudo.pred <-pred(x, phenotypes = pseudo$p,
  #                      burnin = burnin, thin = thin, chain_length = chain_length,
  #                      prediction.program = "BGLR", prediction.model = method,
  #                      runID = paste0(t_iter, "_pseudo"), verbose = F)
  #
  #   stats <- dist_desc(pseudo.pred, meta, windows, peak_delta, peak_pcut, chr, pvals = F)
  #
  #   return(list(stats = stats, e = pseudo$e))
  # }
  gwas <- function(x, effect_distribution, parameters = NULL, h, center = center,
                   t_iter, G, windows, phased = FALSE, GMMAT_infile = NULL, effects = NULL){
    if(is.null(effects)){
      pseudo <- generate_pseudo_effects(x, effect_distribution, parameters, h, center = center, phased = phased)
    }
    else{
      pseudo <- vector("list", 2)
      names(pseudo) <- c("e", "p")
      pseudo$p <- get.pheno.vals(x = x, effect.sizes = effects, h = h, phased = phased)$p
      pseudo$e <- effects
      if(center){
        pseudo$p <- pseudo$p - mean(pseudo$p)
      }
    }



    stats <- calc_distribution_stats(x = x, meta = meta, phenos = pseudo$p, center = center,
                                         scheme = "gwas", chr = colnames(meta)[1],
                                         peak_delta = peak_delta, peak_pcut = peak_pcut, window_sigma = window_sigma,
                                         burnin = NULL, thin = NULL, chain_length = NULL, maf = maf, phased = phased,
                                         pass_windows = windows, pass_G = G, GMMAT_infile = GMMAT_infile, find_similar_effects = find_similar_effects, real_effects = real_effects)

    return(stats)
  }

  loop_func <- function(x, effect_distribution, parameters, scheme,
                        t_iter, G = NULL, h, windows, center = center, phased = FALSE, effects = NULL){
    # if(scheme == "gp"){
    #   dist <- gp(x, pi, df, scale, method, t_iter, h, windows, center = center)
    # }
    if(scheme == "gwas"){
      dist <- gwas(x, effect_distribution, parameters = parameters, h = h, center = center,
                   t_iter = t_iter, windows = windows, G = G, phased = phased, GMMAT_infile = GMMAT_infile, effects = effects)
    }
    return(dist)
  }

  #============prep for simulations======================================
  # if any joint parameter priors, calculate and disambiguate
  # joint_parms <- names(parameter_distributions)[which(parameter_distributions == "joint")]
  # parms <- as.data.frame(matrix(NA, iters, 0))
  # if(length(joint_parms) > 0){
  #   parms <- cbind(parms, gen_parms(iters, joint_res, joint_acceptance, joint_parms, dist.var = joint_res_dist))
  # }
  # if(!is.null(reg_res)){
  #   parms <- cbind(parms, sample_joint_quantile(iters, reg_res))
  # }
  # if(ncol(parms) < length(parameter_distributions)){
  #   other_parms <- names(parameter_distributions)[which(!names(parameter_distributions) %in% colnames(parms))]
  #   run_parameters <- vector("list", length = length(other_parms))
  #   names(run_parameters) <- other_parms
  #   for(i in 1:length(run_parameters)){
  #     run_parameters[[i]] <- parameter_distributions[[other_parms[i]]](iters)
  #   }
  #   parms <- cbind(parms, as.data.frame(run_parameters))
  # }
  # run_parameters <- parms
  # rm(parms)

  if(is.null(provided_effects)){
    run_parameters <- sample_parameters_from_distributions(parameter_distributions = parameter_distributions,
                                                           iters = iters,
                                                           joint_res = joint_res,
                                                           joint_acceptance = joint_acceptance,
                                                           joint_res_dist = joint_res_dist,
                                                           reg_res = reg_res)
    h <- h_dist(iters)
  }
  else{
    run_parameters <- NULL
    if(is.function(h_dist)){
      h <- h_dist(iters)
    }
    else{
      h <- h_dist
    }
  }


  # can pass a g matrix forward once if doing gwas
  if(scheme == "gwas" & is.null(pass_G)){

    G <- make_G(smart_transpose_and_unphase(x, phased), maf, par)
  }
  else if(scheme == "gwas" & !is.null(pass_G)){
    G <- pass_G
    rm(pass_G)
  }
  else{
    G <- NULL
  }

  # pre-run the window function unless passed
  if(is.null(pass_windows)){
    windows <- mark_windows(meta, window_sigma, colnames(meta)[1])
  }
  else{
    windows <- pass_windows
    rm(pass_windows)
  }


  #============run the simulations===========================
  # initialize storage
  dist_output <-  matrix(0, iters, ncol = number_descriptive_stats)
  colnames(dist_output) <- names_descriptive_stats

  # run the simulations
  ## serial
  if(isFALSE(par) | par == 1){
    for(i in 1:iters){
      cat("Iter: ", i, ".\n")
      dist_output[i,] <- loop_func(x, effect_distribution,
                                   parameters = as.list(run_parameters[i,,drop = F]),
                                   scheme = scheme,
                                   t_iter = i, G = G, h = h[i], windows = windows, center = center, phased = phased,
                                   effects = unlist(provided_effects[i,]))
    }
    if(is.null(run_parameters)){
      ret <- cbind(provided_effect_hyperparameters, h = h, as.data.table(dist_output))
    }
    else{
      ret <- cbind(as.data.table(run_parameters), h = h, as.data.table(dist_output))
    }
    if(!is.list(ret)){
      ret <- as.data.frame(t(ret))
    }
    return(list(stats = ret, errors = list(parms = ret[-c(1:nrow(ret)),], msgs = character())))
  }


  # parallel
  else{
    stop("Parallel is not currently working. Problem is likely due to how files are saved during GMMAT in parallel. Fix with pre-generated temp file names?")
    cl <- snow::makeSOCKcluster(par)
    doSNOW::registerDoSNOW(cl)

    # divide up into ncore chunks
    it_set <- (1:iters)%%par
    chunks <- list(parms = .smart_split(run_parameters, it_set),
                   dist_output = .smart_split(as.data.frame(dist_output), it_set),
                   h = .smart_split(as.data.frame(h), it_set),
                   effects = .smart_split(as.data.frame(provided_effects), it_set),
                   effect_hyper = .smart_split(as.data.frame(provided_effect_hyperparameters), it_set))


    # prepare reporting function
    progress <- function(n) cat(sprintf("Chunk %d out of", n), par, "is complete.\n")
    opts <- list(progress=progress)


    output <- foreach::foreach(q = 1:par, .inorder = FALSE,
                               .options.snow = opts, .packages = c("data.table", "GeneArchEst")
                               ) %dopar% {

                                 parm_chunk <- chunks$parm[[q]]
                                 h_chunk <- unlist(chunks$h[[q]])
                                 dist_chunk <- chunks$dist_output[[q]]
                                 effect_chunk <- chunks$effects[[q]]
                                 is.err <- numeric(0)
                                 errs <- character(0)


                                 # run once per iter in this chunk
                                 for(i in 1:nrow(dist_chunk)){
                                   b <- try(loop_func(x, effect_distribution,
                                                      parameters = as.list(parm_chunk[i,,drop = F]),
                                                      scheme = scheme,
                                                      t_iter = paste0(q, "_", i), G = G, h = h_chunk[i],
                                                      effects = unlist(effect_chunk[i,]),
                                                      windows = windows, center = center, phased = phased), silent = T)
                                   if(class(b) == "try-error"){
                                     is.err <- c(is.err, i)
                                     errs <- c(errs, b)
                                   }
                                   else{
                                     dist_chunk[i,] <- b
                                   }
                                 }

                                 if(is.null(run_parameters)){
                                   parm_chunk <- chunks$effect_hyper[[q]]
                                 }


                                 out <- vector("list", 2)
                                 names(out) <- c("successes", "fails")
                                 if(length(is.err) > 0){
                                   out[[1]] <- cbind(parm_chunk[-is.err,], h = h_chunk[-is.err], dist_chunk[-is.err,])
                                   out[[2]] <- list(error_msg = errs,
                                                    error_parms = cbind(parm_chunk[is.err,,drop = F], h = h_chunk[is.err]))
                                 }
                                 else{
                                   out[[1]] <- cbind(parm_chunk, h = h_chunk, dist_chunk)
                                   out[[2]] <- list(error_msg = character(0),
                                                    error_parms = as.data.frame(matrix(NA, ncol = ncol(parm_chunk) + 1, nrow = 1)))
                                   colnames(out[[2]]$error_parms) <- c(colnames(parm_chunk), "h")
                                 }
                                 out
                               }


    parallel::stopCluster(cl)
    doSNOW::registerDoSNOW()
    gc();gc()
    dat <- dplyr::bind_rows(purrr::map(output, 1))
    errs <- purrr::map(output, 2)
    err_parms <- dplyr::bind_rows(purrr::map(errs, 2))
    err_msgs <- unlist(purrr::map(errs, 1))
    err_parms <- na.omit(err_parms)
    errs <- list(parms = err_parms, msgs = err_msgs)

    if(length(errs$msgs) > 0){
      warning("Errors occured on some simulations. See named element 'errors' in returned list for details.\n")
    }
    else{
      cat("Complete, no errors on any iterations.\n")
    }
    return(list(stats = dat, errors = errs))
  }
}
hemstrow/GeneArchEst documentation built on June 10, 2025, 5:06 a.m.