R/simScenarios.R

Defines functions simScenarios

Documented in simScenarios

#' Multi-scenario simulator of single-cell epigenomic data
#'
#' Add description here
#'
#' @param type either 'scATACseq' or 'scChIPseq'
#' @param outdir directory where the output files will be saved
#' @param reference the reference set of peaks.
#' @param seed an integer defining the seed
#' @param binSize size of the bins of the final simulated dataset
#' @param shuffleChunks size of the chunks that will be swapped across the genome (applicable for scChIPseq data only)
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @rawNamespace import(GenomicRanges, except = c(shift,update))
#' @rawNamespace import(BiocGenerics, except = c(density,mad,weights,residuals,IQR))
#' @import S4Vectors
#'
#' @export
#'
simScenarios <- function(type, outdir, reference, seed = 2020, binSize = 500, shuffleChunks = 5000){

    # Check if outdir is in fact a full path
    if(!substr(outdir,1,1)=='/'){stop("outdir must be a full path")}

    # Set the seed

    set.seed(seed)

    # Get current directory

    workdir <- getwd()

    # Creating output dir

    if(!dir.exists(outdir)){
        system2("mkdir",paste("-p",outdir))
    }

    # Scenarios

    scenarios <- generateScenarios(type)

    # Sorting the scenarios

    scenarios <- scenarios[order(scenarios$Groups,scenarios$Cells),]

    # Looping through scenarios

    for(i in seq_len(nrow(scenarios))){

        # Subsetting scenario

        scenario <- scenarios[i,]

        # Creating directory

        datadir <- file.path(outdir,scenario$Label)
        if(!dir.exists(datadir)){
            system2("mkdir",paste("-p",datadir))
        }

        # Scenario-specific parameters

        ngroups <- ifelse(scenario$Groups == 1,3,ifelse(scenario$Groups == 2,6,ifelse(scenario$Groups == 3,9,NA)))
        cellsLevel <- ifelse(scenario$Cells == 1,500,ifelse(scenario$Cells == 2,1000,ifelse(scenario$Cells == 3,2500,NA)))
        rareLevel <- ifelse(scenario$Rarity == 1,0,ifelse(scenario$Rarity == 2,0.90,ifelse(scenario$Rarity == 3,0.95,NA)))
        noiseLevel <- ifelse(scenario$Noise == 1,0,ifelse(scenario$Noise == 2,0.25,ifelse(scenario$Noise == 3,0.75,NA)))
        depthLevel <- ifelse(scenario$Depth == 1,5000,ifelse(scenario$Depth == 2,10000,ifelse(scenario$Depth == 3,25000,NA)))
        nrare <- ngroups/3
        cores <- ngroups
        reqTime <- ifelse(scenario$Cells == 1,1,ifelse(scenario$Cells == 2,2,ifelse(scenario$Cells == 3,3,NA)))

        refList <- reference[seq_len(ngroups)]

        if(scenario$Data == 'scATACseq'){
            shuffleProp <- 0
            genome <- 'hg19'
        } else{
            shuffleProp <- ifelse(scenario$Dissimilarity == 1,0.01,ifelse(scenario$Dissimilarity == 2,0.05,ifelse(scenario$Dissimilarity == 3,0.10,NA)))
            genome <- 'hg38'
        }

        # Constructing input data

        cellList <- lapply(seq_len(ngroups),function(x){ifelse(x<=nrare,cellsLevel*(1-rareLevel),cellsLevel)})
        noiseList <- lapply(seq_len(ngroups),function(x){rep(noiseLevel,cellList[[x]])}) # Assumed constant across cells and clusters
        depthList <- lapply(seq_len(ngroups),function(x){depthLevel}) # Assumed constant across cells and clusters
        relDepthList <- lapply(seq_len(ngroups),function(x){rep(1,cellList[[x]])})

        # Looping through simulations

        for(j in seq_len(scenario$Sim)){

            label <- paste0('Simulation',j)
            path <- datadir

            if(!file.exists(file.path(datadir,paste0('Simulation',j,'.RData')))){

                # Saving input
                save(refList,cellList,noiseList,depthList,relDepthList,binSize,shuffleChunks,
                     label,path,genome,shuffleProp,cores,file = file.path(datadir,paste0('Input',j,'.RData')))

                # Writing R code
                sink(file = file.path(datadir,paste0('Simulation',j,'.R')))
                cat("

library(scChIPseqsim)

load('./Input",j,".RData')

simData(refList = refList,
cellList = cellList,
noiseList = noiseList,
depthList = depthList,
relDepthList = relDepthList,
label = label,
path = path,
genome = genome,
binSize = binSize,
shuffleChunks = shuffleChunks,
shuffleProp = shuffleProp,
cores = cores)

system2('rm','./Input",j,".RData')

                ",sep = "")
                sink()

                # Changing directories and submitting job
                setwd(datadir)
                # system(paste0('sbatch -t ',reqTime,':00:00 --mem=',4*cores,'g --cpus-per-task=',(cores+1),' R CMD BATCH ',paste0('./Simulation',j,'.R')))
                system(paste0('sbatch -t ',reqTime,':00:00 --mem=',4*cores,'g --ntasks=',(cores+1),' R CMD BATCH ',paste0('./Simulation',j,'.R')))
                setwd(workdir)
            }
        }
    }
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.