#' Run an automated MTMC-SKAT job
#'
#' This function provides a one-liner to run MTMC-SKAT over a scaffold.
#'
#' For whole-genome analysis, this function (accesible by command line) is to be parallelized over jobs submitted in a batch query system on a high-performance cluster, or looped over scaffolds if running on a single machine.
#'
#' @param phenodata string for filepath for phenotype file, with labeled columns for FID, IID, and trait.
#' @param covariates string for filepath for covariate file, with ordered covariate values in each column and no header
#' @param raw_file_path string for filepath to .traw file (see [PLINK file format reference](https://www.cog-genomics.org/plink/2.0/formats))
#' @param output_dir string for directory where results will be saved
#' @param n_thread numeric, maximum number of cores over which parallelization is allowed
#' @param job_id string, identifier to label the job; this identifier will go into output filename
#' @param desired_sig_figs Integer, minimum number of significant figures
#' desired for empirical p-values
#' @param min_accuracy numeric, threshold x to begin resampling for SNP windows with p-values below 10^-x; default value is 2
#' @param max_accuracy numeric, limit x to end resampling for SNP windows with p-values below 10^-x, usually due to computational cost
#' @param switch_point numeric, limit x at which SNP windows with p-values below 10^-x must be testing by parallelizing over null models rather than SNP windows (may be set by user due to limitations on RAM preventing production of large null models)
#' @param plot if TRUE, produce Manhattan plot of results
#' @inheritParams pre_allocate
#' @inheritParams calculate_max_perm_per_core
#' @inheritParams select_windows_range_p
#'
#' @return None; outputs are saved to the user-specified output directory, with the user-specified ```job_id```
#' @export
#'
#' @examples
#' \dontrun{
#' phenodata <- system.file("extdata",
#' "TDZ_shoot_area.plink.pheno",
#' package = "SKATMCMT")
#'
#' covariates <- system.file("extdata",
#' "poplar_PCs_covariates.tbt",
#' package = "SKATMCMT")
#'
#' raw_file_path <- system.file("extdata",
#' "poplar_SNPs_Chr10_14460to14550kb.traw",
#' package = "SKATMCMT")
#'
#' MTMCSKAT_workflow(phenodata = phenodata,
#' covariates = covariates,
#' raw_file_path = raw_file_path,
#' window_size = 3000,
#' window_shift = 1000,
#' output_dir = "Results/",
#' pre_allocated_dir = "pre_allocated_dir/",
#' n_thread = "AllCores",
#' job_id = "my_sample_analysis",
#' desired_sig_figs = 2,
#' min_accuracy = 2,
#' max_accuracy = 5,
#' plot = TRUE)
#' }
MTMCSKAT_workflow_pre_allocate_only <- function(phenodata, covariates, raw_file_path, window_size,
window_shift, output_dir, pre_allocated_dir,
job_id, desired_sig_figs = 2,
min_accuracy = 2,
max_accuracy = 5,
switch_point = 4, plot = TRUE,
RAM="AllRAM",
n_thread="AllCores",
missing_cutoff = 0.15,
top_N = Inf){ # get rid of switchpoint parameter once making function determine_switch_point?
this_phenotype <- unlist(utils::read.csv(phenodata,
sep = "\t",
colClasses = c("character",
"character",
"numeric"))[, 3])
# fread is robust, automatically detects if there is a header and type of
# separation, which may vary across covariate files (lack a standard format).
covariates <- data.table::fread(covariates)
output_dir <- paste0(output_dir, "/", job_id)
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
output_basename <- paste0(output_dir, "/pheno-",
basename(phenodata),
"-scaff-",
basename(raw_file_path))
sink(file = paste0(output_basename,
".log"))
whole_genome_start <- proc.time()
future::plan("multisession")
# Initial SKAT round w/o resampling, parallelized over SNP windows --------
pre_allocated_SNP_windows <-
pre_allocate(
raw_file_path = raw_file_path,
window_size = window_size,
window_shift = window_shift,
pre_allocated_dir = pre_allocated_dir)
# master_output <-
# mtskat(
# this_phenotype = this_phenotype,
# covariates = covariates,
# raw_file_path = pre_allocated_SNP_windows[[1]][[3]],
# pre_allocated_SNP_windows = pre_allocated_SNP_windows,
# n_thread = n_thread,
# missing_cutoff = missing_cutoff)
#
# # Secondary SKAT round w/ resampling, multithread over SNP windows or NMs-----
#
# max_accuracy <-
# set_accuracy(
# master_output = master_output,
# max_accuracy = max_accuracy)
#
# ## A null model with HOW MUCH resampling can fit into the RAM allotted
# ## for a single thread?
#
# RAM_per_permutation <-
# benchmark_nm(
# phenotype = this_phenotype,
# covariates = covariates,
# benchmark_size = 1000)
#
# # This parameter only used if multithreading over null models
# max_permutations_per_job <-
# calculate_max_perm_per_core(
# nm_RAM_per_perm = RAM_per_permutation,
# RAM = size_RAM_wiggle(RAM = as.numeric(RAM), wiggle_factor = 10),
# n_thread = n_thread)
#
# # This should be less than (option 'future.globals.maxSize')
# # which is 500MB by default.
# #RAM_per_thread <- RAM / n_thread
# RAM_per_thread <- 500*1e6
#
# for(leading_0s in min_accuracy:max_accuracy){
#
# terminal_resampling <-
# determine_whether_final_resampling(
# leading_0s = leading_0s,
# max_accuracy = max_accuracy)
#
# boundaries <-
# determine_subset_pval_boundaries(
# leading_0s = leading_0s,
# desired_sig_figs = desired_sig_figs,
# terminal_resampling = terminal_resampling)
#
# cat(paste(Sys.time(), "- Re-allocating SNP windows with only those",
# "with p-values between", boundaries$upper, "and",
# boundaries$lower, "..."))
#
# new_pre_allocated_SNP_windows <-
# re_allocate_windows(
# x = master_output,
# upper_bound = boundaries$upper,
# lower_bound = boundaries$lower,
# pre_allocated_SNP_windows = pre_allocated_SNP_windows,
# top_N = top_N)
#
# cat(paste(Sys.time(), "Complete."))
#
# if(new_pre_allocated_SNP_windows == "None") next
#
# n_permutations <-
# determine_n_permutations(
# leading_0s = leading_0s,
# desired_sig_figs = desired_sig_figs)
#
# null_models_fit_in_RAM_per_thread <-
# check_if_null_models_fit_in_RAM_per_thread(
# n_permutations = n_permutations,
# RAM_per_permutation = RAM_per_permutation,
# RAM_per_thread = RAM_per_thread)
#
# more_threads_than_windows <-
# check_if_more_threads_than_windows(
# pre_allocated_SNP_windows = new_pre_allocated_SNP_windows,
# n_thread = n_thread)
#
# cat(paste0(Sys.time(), " - Running resampling for ",
# length(new_pre_allocated_SNP_windows),
# " SNP windows with p-values between ",
# boundaries$upper, " and ",
# boundaries$lower,
# " (", leading_0s, " leading zeroes) with at least ",
# n_permutations, " permutations so empirical p-values",
# " can be calculated to ", desired_sig_figs,
# " significant figures...\n"))
#
# if(null_models_fit_in_RAM_per_thread & !more_threads_than_windows){
# cat("Multithreading over SNP windows\n")
#
# add_to_master_output <-
# mtmcskat_SNPs(
# this_phenotype = this_phenotype,
# covariates = covariates,
# n_permutations = n_permutations,
# pre_allocated_SNP_windows = new_pre_allocated_SNP_windows,
# scaffold_ID = pre_allocated_SNP_windows[[1]][[3]],
# n_thread = n_thread,
# missing_cutoff = missing_cutoff)
# }
#
# if(!null_models_fit_in_RAM_per_thread | more_threads_than_windows){
# cat("Multithreading over null models\n")
#
# add_to_master_output <-
# mtmcskat_NullModels(
# this_phenotype = this_phenotype,
# covariates = covariates,
# n_thread = n_thread,
# n_permutations = n_permutations,
# max_permutations_per_job = max_permutations_per_job,
# pre_allocated_SNP_windows = new_pre_allocated_SNP_windows,
# scaffold_ID = pre_allocated_SNP_windows[[1]][[3]],
# missing_cutoff = missing_cutoff)
#
# }
#
# # Update the master output, replacing entries for which we now have...
# # ...empirical p-values
# master_output <-
# rbind(
# master_output[-which(
# master_output$position %in% add_to_master_output$position), ],
# add_to_master_output)
#
# }
#
#
#
# raw_out_name <- paste0(output_basename, ".csv")
# plot_out_name <- paste0(output_basename, ".png")
#
# cat(paste0("Writing ", raw_out_name))
#
# data.table::fwrite(master_output, raw_out_name)
#
# if(plot==TRUE){
#
# compare_plots(master_output,
# plot_out_name = plot_out_name,
# scaffold_ID = pre_allocated_SNP_windows[[1]][[3]])
#
# }
#
# cat(paste0("Done running in...",
# print(proc.time() - whole_genome_start)[3],
# "s"))
#
sink()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.