R/estimate_saturation_n.R

#' Estimate the substitution saturation ratio for the oligotyping input file (n) times
#' @description Perform multiple random subsamplings of your alignemnt. 
#' @param aln a matrix containing the DNA sequences; this must be of class "DNAbin" 
#' @param rep number of random subsamplings
#' @param nseqs a value for the number of sequences to pick
#' @param model a character string specifying the evolutionary model to be used by \code{\link[ape]{dist.dna}}
#' @param all a logical indicating whether to use all codon positions; defaults to FALSE so only the third codon position is used.
#' @param save_aln a logical indicating whether to save the alignments
#' @param dir path where the alignment should be saved
#' @param parallel a logical indicating whether to do the random subsampling on a sequential way or use the \code{\href{https://github.com/tudo-r/BatchJobs}{BatchJobs}}
#' framework to distribute the random subsamplings on different cores or in computer cluster
#' @param reg_id Name of registry. Displayed e.g. in mails or in cluster queue
#' @param reg_dir Path where files regarding the registry / jobs should be saved
#' @param conf_file Location of the configuration file to load
#' @param save_aln a logical whether to save the alignments used for the estimation
#' @param dir path where the alignments should be saved
#' @param keep_reg a logical wether to keep BatchJobs registry
#' @details You can calculate multiple saturation plots and its associated statistics. The different
#' random subsamplings can be easily distributed over multiple cores or in a computer cluster.
#' We recommend to set \emph{save_aln = FALSE} and use \code{\link{write_saturation_alignments}} instead.
#' Writing the alignments to the disk can generate a high IO load in the parallel mode. Setting \emph{keep_reg = TRUE}
#' will let you keep the BatchJob registry and explore the output files in case of failure.
#' 
#' @return An object of class \dQuote{oligodiag} is a list containing at least the following components:  
#' \describe{
#'  \item{plot}{a ggplot object containing the saturation plots of each random sampling}
#'  \item{seed}{the seeds used for picking the random sequences}
#'  \item{aln}{a DNAbin object containing the original alignment}
#'  \item{aln_no_3rd}{a DNAbin object containing the original alignment without the 3rd codon}
#'  \item{combined_stats}{mean and standard deviation of transitions and transversions for each random subsampling}
#'  \item{saturation}{whether your alignment presents saturation for each random subsampling}
#' }
#' 
#' 
#' @examples saturation_plots <- estimate_saturation_n(aln, nseqs = 1000, rep = 100, parallel = F)
#' @examples saturation_plots <- estimate_saturation_n(aln, nseqs = 1000, rep = 100, parallel = T, reg_id = "test_id", reg_dir = "test-dir", conf_file = ".BatchJobs.R")
#' @export
estimate_saturation_n<-function(aln = aln, rep = 100, nseqs = 1000, model = "K80", parallel = FALSE,
                                all = FALSE, save_aln = FALSE, dir = NULL, reg_id = NULL, reg_dir = NULL, conf_file = NULL, 
                                job_res = list(), keep_reg = TRUE, ...){
  
  is_inFrame(aln)
  
  if (rep < 2){
    stop("Please use estimate_saturation for only one random subsampling", call. = FALSE)
  }
  
  if (is.null(dir)){
    dir <- getwd()
  }
  
  results <- list()
  if (parallel){
    if ((is.null(conf_file)) || (is.null(reg_dir)) || (is.null(reg_id))){
      stop("Please add the configuration file and registry values requiered for BatchJobs" , call. = FALSE)
    }
    
    function_args <- list()
    fun <- estimate_saturation
    function_args <- list(aln = aln, nseqs = nseqs, model = model, all = all, 
                          verbose = FALSE, seed = 0,  save_aln = save_aln,
                          dir =  dir, .rsamp = TRUE)
    
    iterations <- 1:rep
    
    batch_function <- function(X) {
      tmp <- do.call(fun, function_args)
    }
    
    BatchJobs::loadConfig(conf_file) 
    reg <- BatchJobs::makeRegistry(id=reg_id, file.dir=reg_dir)
    id  <- BatchJobs::batchMap(reg, fun = fun, more.args = function_args, iterations)
    estimate_submission <- BatchJobs::submitJobs(reg, resources=job_res)
    estimate_run <- BatchJobs::waitForJobs(reg, id)
    estimate_runs <- reduceResultsList(reg)
#     cat("Gathering results...\n")
#     estimate_runs <- plyr::llply(1:rep, function(x, ...){
#       res <- BatchJobs::loadResult(reg = reg , id = x)
#     }, reg, .parallel = FALSE, .progress = plyr::progress_text(width = 80, char = "+"))
#     
    showStatus(reg)
    if (!(keep_reg)){
      removeRegistry(reg, ask = "no")
    }
    if (!estimate_run) {
      stop('Error in batch jobs', call. = FALSE)
    }
    
  }else{
    
    estimate_runs <- plyr::llply(1:rep, estimate_saturation, aln = aln, nseqs = nseqs, model = model, all = all, verbose = FALSE,
                                 seed = 0, save_aln = save_aln, dir =  dir, .parallel = FALSE, 
                                 .progress = plyr::progress_text(width = 80, char = "+"), .rsamp = TRUE)
  }
  
  results <-list(
    combined_stats = dplyr::rbind_all(lapply(estimate_runs, function(x) dplyr::rbind_list(x[["stats"]]))),
    raw = estimate_runs,
    plot = lapply(estimate_runs, function(x) x[["plot"]]),
    seed = lapply(estimate_runs, function(x) x[["seed"]]),
    saturation = lapply(estimate_runs, function(x) x[["saturation"]]),
    aln_no_3rd = remove_3rd_codon(aln),
    repetitions = rep,
    aln = aln,
    nseqs = nseqs
  )
  class(results)<-"oligodiag"
  class(results$aln)<-"DNAbin"
  class(results$aln_no_3rd) <- "DNAbin"
  
  if (save_aln){
    cat("Alignment written to", dir, "\n")
  }
  return(results)
}
genomewalker/oligo4fun documentation built on May 17, 2019, 1:11 a.m.