R/simData.R

Defines functions simData

Documented in simData

#' Multi-scenario simulator of single-cell epigenomic data
#'
#' Add description here
#'
#' @param refList either a list of GenomicRanges (each element specifies the cluster's peaks) or a list of bam paths (each element specifies the alignment to generate data from)
#' @param cellList list of size length(refList) with the number of cells per cluster
#' @param noiseList list of size length(refList) with cluster-specific vectors of noise values from each cell
#' @param depthList list of size length(refList) with the baseline number of cells per cluster
#' @param relDepthList list of size length(refList) with cluster-specific vectors of relative depth values from each cell
#' @param label a character with the data label
#' @param path a character with the path where the simulated data will be saved
#' @param genome a character specifying the target genome
#' @param binSize integer with the size of the bin window
#' @param cores number of cores for parallel computing
#' @param shuffleProp either NULL (default) or the proportion of the genome to be shuffled (only applicable for scChIPseqFromBam)
#' @param shuffleChunks size of the chunks that will be swapped across the genome (applicable for scChIPseq data only)
#' @param ... Arguments to pass to the scChIPseqsim constructor.
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @importFrom parallel mclapply
#' @import data.table
#' @rawNamespace import(SingleCellExperiment, except = weights)
#' @importFrom foreach %dopar%
#'
#' @export
#'
simData <- function(refList,cellList,noiseList,depthList,relDepthList,
                    label,path,genome,binSize,cores = 1,shuffleProp = NULL,shuffleChunks = 5000,...){

    matCounts = grGenomeBinned = i = NULL

    nGroups <- length(refList)

    # Registering number of cores

    doParallel::registerDoParallel(cores = cores)

    # Creating auxiliary variable

    newLabel <- paste0(label,paste0('_g',seq_len(nGroups)))

    # Running simulation in parallel
    if(cores==1){
        cat('Simulating counts\n')
    } else{
        cat('Simulating counts in parallel\n')
    }

    matLength <- NULL
    genomeLength <- NULL

    toiter <- unlist(lapply(seq_len(length(newLabel)),FUN = function(x){
        tryCatch({
            suppressWarnings({
                load(file.path(path,paste(newLabel[[x]],'counts.RData',sep = '_')))
                matLength[x] <<- nrow(matCounts)
                rm(matCounts)

                load(file.path(path,paste(newLabel[[x]],'ranges.RData',sep = '_')))
                genomeLength[x] <<- length(grGenomeBinned)
                rm(grGenomeBinned)
            })
        },error = function(cond){return(x)})
    }))

    if(length(unique(na.omit(matLength)))>1 | length(unique(na.omit(genomeLength)))>1){
        toiter <- union(toiter,as.numeric(names(table(matLength)[table(matLength) == min(table(matLength))])))
        toiter <- union(toiter,as.numeric(names(table(genomeLength)[table(genomeLength) == min(table(genomeLength))])))
    }

    if(all(unlist(lapply(refList,is.character)))){
        foreach::foreach(i = iterators::iter(toiter)) %dopar% scChIPseqFromBam(label = newLabel[i],
                                                                               path = path,genome = genome,binSize = binSize,
                                                                               bamFile = refList[[i]],
                                                                               nCells = cellList[[i]],
                                                                               baseSeqDepth = depthList[[i]],
                                                                               relSeqDepthPerCell = relDepthList[[i]],
                                                                               noisePerCell = noiseList[[i]],
                                                                               excludeRegions = TRUE,outputRanges = TRUE,
                                                                               shuffleProp = shuffleProp,
                                                                               shuffleChunks = shuffleChunks)

    } else{
        foreach::foreach(i = iterators::iter(toiter)) %dopar% scChIPseqFromPeaks(label = newLabel[i],
                                                                                 path = path,genome = genome,binSize = binSize,
                                                                                 grPeaks = refList[[i]],
                                                                                 nCells = cellList[[i]],
                                                                                 baseSeqDepth = depthList[[i]],
                                                                                 relSeqDepthPerCell = relDepthList[[i]],
                                                                                 noisePerCell = noiseList[[i]],
                                                                                 excludeRegions = TRUE,outputRanges = TRUE)
    }

    # Merging everything now

    cat('Merging simulated counts\n')

    mat <- list()
    metadata <- list()

    for(x in seq_len(nGroups)){
        load(file.path(path,paste(newLabel[x],'counts.RData',sep = '_')))
        mat[[x]] <- matCounts
        metadata[[x]] <- data.table(Cluster = rep(x,cellList[[x]]),
                                    Depth = depthList[[x]]*relDepthList[[x]],
                                    Noise = noiseList[[x]])
        rm(matCounts)
    }

    # Merging ranges

    cat('Merging ranges\n')

    for(x in seq_len(nGroups)){
        load(file.path(path,paste(newLabel[x],'ranges.RData',sep = '_')))

        if(x==1){
            grGenome <- grGenomeBinned
            colnames(S4Vectors::values(grGenome)) <- paste0('reference',x)
        } else{
            S4Vectors::values(grGenome) <- cbind(S4Vectors::values(grGenome),S4Vectors::DataFrame('reference' = grGenomeBinned$reference))
            colnames(S4Vectors::values(grGenome))[x] <- paste0('reference',x)
        }
        rm(grGenomeBinned)
    }

    # Creating SingleCellExperiment

    sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = do.call(cbind,mat)),
                                                     rowRanges = grGenome,
                                                     colData = rbindlist(metadata))

    # Removing old files
    for(x in seq_len(nGroups)){
        unlink(file.path(path,paste(newLabel[x],'counts.RData',sep = '_')))
        unlink(file.path(path,paste(newLabel[x],'ranges.RData',sep = '_')))
    }


    # Saving the resulting output
    cat("Done. The following file has been saved:\n")

    file <- file.path(path,paste0(label,'.RData'))
    cat(file,'\n')
    save(sc,file = file,compress = 'xz')
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.