R/sim.msABC.sumstat.R

Defines functions sim.msABC.sumstat

Documented in sim.msABC.sumstat

#' Simulate summary statistics using msABC
#' @description This function simulates summary statistics using msABC.
#'              It is optimized for nextgen data, but it should also work for sanger.
#'              This will simulate the entire loci, not a single SNP.
#'              You should include all loci in the model object, including invariable ones. Use the get.data.structure function to set up the parameters of the loci - bp and number of individuals - to be simulated.
#' @param model A model object bult by the main.menu function. Both genomic and sanger-type models are allowed.
#' @param use.alpha Logical. If TRUE the most recent population size change will be exponential. If FALSE sudden demographic changes. Default is FALSE.
#'                  This argument changes ONLY the MOST RECENT demographich change.
#' @param nsim.blocks Number of blocks to simulate. The total number of simulations is: nsim.blocks x sim.block.size.
#' @param sim.block.size Simulations are performed in blocks. This argument defines the size of the block in number of simulations, i.e. how many simulations to run per block.
#'                       A block of 1000 will work for most cases. Increse the total number of simulations with nsim.block argument.
#' @param path Path to write the output. By default outputs will be saved in the working directory.
#' @param output.name String. The prefix of the output names. Defalt is "model"
#' @param append.sims Logical. If TRUE simulations will be appended in the last output. Default is FALSE.
#' @param msABC.call String. Path to the msABC executable. msABC binaries for Mac and Linux are included in the package and should work cases for these operating systems.
#'                   There is no need to change this unless you want to compile the program yourself and point the function to it.
#' @param mu.rates List. Distribution to sample the mutation rates. The first element of the list should be the name of the distribution as a character string. All distributions available in r-base and r-package e1071 are allowed.
#'                   The second element of the list must be the number of loci. The following elements are the parameters of the distribution to be passed on to the r-distribution function.
#'                    Ex.: mu.rates = list("rtnorm", 1000, 1e-9, 1e-9, 0). i.e sample 1000 values using the rtnorm function with mean 1e-9 and SD 1e-9, with lower tail limit at zero.
#' @param rec.rates List. Distribution to sample the recombination rates. The first element of the list should be the name of the distribution as a character string. All distributions available in r-base and r-package e1071 are allowed.
#'                   The second element of the list must be the number of loci. The following elements are the parameters of the distribution to be passed on to the r-distribution function.
#'                    Ex.: rec.rates = list("rtnorm", 1000, 1e-9, 1e-9, 0). i.e sample 1000 values using the rtnorm function with mean 1e-9 and SD 1e-9, with lower tail limit at zero.
#' @return Writes simulations and parameters to the path directory.
#' @references Hudson R.R. (2002) Generating samples under a Wright-Fisher neutral model of genetic variation. Bioinformatics, 18, 337–338.
#' @references Pavlidis P., Laurent S., & Stephan W. (2010) msABC: A modification of Hudson’s ms to facilitate multi-locus ABC analysis. Molecular Ecology Resources, 10, 723–727.
#' @author Marcelo Gehara
#' @export
sim.msABC.sumstat<-function(model, nsim.blocks, path=getwd(), use.alpha=F, mu.rates=NULL, rec.rates = NULL,
                            append.sims=F,block.size=10, msABC.call = get.msABC(),output.name,ncores){

  WD<-getwd()

  if(class(model)!="Model"){
    stop("First arguments should be an object of class Model generated by the main.menu() function.")
  }
  if(model$I[1,1]=="genomic"){
    stop("You need to get the data structure first. See function get.data.structure")
  }
  # set working directory
  setwd(path)

  locfile <- PipeMaster:::get.locfile(model)
  msABC.path <- find.package("PipeMaster")

  if(append.sims==F){
    com <- PipeMaster:::msABC.commander(model,use.alpha=use.alpha,arg=1)
    write.table(locfile,paste(".",1,"locfile.txt",sep=""),row.names = F,col.names = T,quote = F,sep=" ")
    options(warn=-1)
    x <- strsplit(system2(msABC.call, args=com[[1]], stdout = T,stderr=T,wait=T),"\t")
    options(warn=0)
    nam<-x[1][[1]]
    #TD_denom <- paste(nam[grep("pi",nam)],nam[grep("_w",nam)],sep="_")
    #nam<-nam[-grep("ZnS",nam)]
    #nam<-nam[-grep("thomson",nam)]
    #cols <- grep("fwh",nam)
    cols <- grep("thomson",nam)
    cols <- c(cols, grep("ZnS",nam))
    #cols <- c(cols,grep("_FayWuH",nam))
    if(length(cols)!=0) nam <- nam[-cols]
    #nam <- c(nam, TD_denom)
    nam <- c(com[[2]][1,],"mean.rate","sd.rate",nam)
    #t(paste(nam,"_skew",sep="")),t(paste(nam,"_var",sep="")))
    write.table(t(nam),file=paste("SIMS_",output.name,".txt",sep=""),quote=F,row.names = F,col.names = F, append=F,sep="\t")
}

  write(paste('arg <- commandArgs(TRUE)',
              'cat(paste("Core",arg),sep="\\n")',
              "suppressMessages(library(PipeMaster))",
              #"suppressMessages(library(devtools))",
              #"load_all('~/Github/PipeMaster')",
              'load(file=".PM_objects.RData")',
              "res<-sim.func(arg)",
              'save(res,file=paste(".",arg,"_SIMS.rda",sep=""))',
              'write(1,".log",append=T,sep="\\n")',
              'invisible(file.remove(paste(".",arg,"locfile.txt",sep="")))',
              "quit(save='no')",sep="\n"),".script_parallel.R")

  sim.func<-function(arg){

      simulations<-NULL
      for(i in 1:block.size){
        if(!(is.null(mu.rates))){
          rates<-do.call(mu.rates[[1]],args=mu.rates[2:length(mu.rates)])
          rates<-rep(rates, each=as.numeric(model$I[1,3]))
          rates<-list(rates,c(0,0))
        } else {
        rates <- sample.mu.rates(model)
        }
        if(!(is.null(rec.rates))){
          r.rates <- do.call(rec.rates[[1]],args=rec.rates[2:length(rec.rates)])
          r.rates <- rep(r.rates, each = as.numeric(model$I[1,3]))
        } else {
        r.rates <- 0
        }
        locfile[,5] <- rates[[1]]
        locfile[,6] <- r.rates

        write.table(locfile,paste(".",arg,"locfile.txt",sep=""),row.names = F,col.names = T,quote = F,sep=" ")
        com <- msABC.commander(model, use.alpha=use.alpha,arg=arg)
        options(warn=-1)
        sumstat <- read.table(text=system2(msABC.call, args=com[[1]], stdout = T,stderr=T,wait=T),header=T,sep="\t")
        options(warn=0)
        sumstat <- subset(sumstat, select=-c(X))
        #TD_denom <- data.frame(sumstat[,grep("pi",colnames(sumstat))]-sumstat[,grep("_w",colnames(sumstat))])
        #colnames(TD_denom) <- paste(colnames(sumstat)[grep("pi",colnames(sumstat))],sumstat[,grep("_w",colnames(sumstat))])
        #sumstat<-sumstat[,-grep("ZnS",colnames(sumstat))]
        #sumstat<-sumstat[,-grep("thomson",colnames(sumstat))]
        #sumstat <- cbind(sumstat, TD_denom)
        #Mean<-apply(sumstat,2,mean, na.rm = T)
        #var<-apply(sumstat,2,var, na.rm=T)
        #kur<-apply(sumstat,2,kurtosis, na.rm=T)
        #skew<-apply(sumstat,2,skewness, na.rm=T)

        #cols <- grep("fwh",colnames(sumstat))
        cols <- grep("thomson",colnames(sumstat))
        cols <- c(cols,grep("ZnS",colnames(sumstat)))
        #cols <- c(cols,grep("_FayWuH",colnames(sumstat)))

        if(length(cols)!=0) sumstat <- sumstat[,-cols]

        param <- c(com[[2]][2,],rates[[2]])
        names(param) <- c(com[[2]][1,],"mean.rate","sd.rate")
      simulations <- rbind(simulations,c(param,sumstat))
      }
    return(simulations)
  }

  parentls <- function()ls(envir=parent.frame())
  save(list=parentls(),file='.PM_objects.RData')

  total.sims<-0
  for(j in 1:nsim.blocks) {

    start.time <- Sys.time()

    write(0,".log")
    for(c in 1:ncores){
      system(paste("Rscript .script_parallel.R",c), wait = F)
    }

    l<-"0"
    while(sum(as.numeric(unlist(strsplit(l, "")))) < ncores){
        Sys.sleep(5)
        l<-readLines(".log")
      }

    file.remove(".log")

    simulations<-NULL
    cat("Reading simulations from worker nodes", sep="\n")
    for(c in 1:ncores){
        load(file = paste(".",c,"_SIMS.rda",sep=""))
        simulations <- rbind(simulations, res)
        }

    cat("Writing simulations to file", sep="\n")

    write.table(simulations,file=paste("SIMS_",output.name,".txt",sep=""),quote=F,row.names = F,col.names = F, append=T,sep="\t")

    cat("Removing old simulations", sep="\n")

    for(t in 1:ncores){
    file.remove(paste(".",t,"_SIMS.rda",sep=""))
    }

    end.time <- Sys.time()
    total.sims <- total.sims+(block.size*ncores)
    cycle.time <- (as.numeric(end.time)-as.numeric(start.time))/60/60
    total.time <- cycle.time*nsim.blocks
    passed.time <- cycle.time*j
    remaining.time <- round(total.time-passed.time,3)
    cat(paste("PipeMaster:: ",total.sims," (~",round((block.size*ncores)/cycle.time)," sims/h) | ~",remaining.time," hours remaining",sep=""),"\n")
  }
setwd(WD)
}
gehara/PipeMaster documentation built on April 19, 2024, 8:14 a.m.