R/simMikado.R

Defines functions simMikado

Documented in simMikado

#' Multi-scenario simulator of single-cell epigenomic data
#'
#' Add description here
#'
#' @param path path where the datasets will be saved
#' @param nsim number of simulated datasets
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @rawNamespace import(GenomicRanges, except = c(shift,update))
#' @rawNamespace import(BiocGenerics, except = c(density,mad,weights,residuals,IQR))
#' @import S4Vectors
#'
#' @export
#'
simMikado <- function(path = 'Data',nsim = 100){

    system(paste('mkdir',path))
    path <- normalizePath(file.path(path))

    # Defining transition matrix

    P <- list(matrix(c(0.995,0.01,0.005,0.99),2,2),
              matrix(c(0.995,0.01,0.005,0.99),2,2),
              matrix(c(0.995,0.01,0.005,0.99),2,2))
    palloc <- 0.7

    # General parameters

    M <- 1e4 # Genome size
    nClusters <- 3 # Number of clusters
    cellsPerCluster <- 100 # Number of cells per cluster
    depth <- round(seq(250,250,length.out = nClusters)) # Average sequencing depth

    # Cluster & cell assignments

    cells <- rep(seq_len(cellsPerCluster),times = nClusters)
    clusters <- rep(seq_len(nClusters),each = cellsPerCluster)

    # Set seed
    seed <- sample(1e7,1,replace = F)
    set.seed(seed)

    # Simulating data

    for(sim in seq_len(nsim)){
        message('Simulating data ',sim,'. The seed is ',seed,'.')

        # Generate cluster-specific chains (see set.seed)

        Z <- do.call(cbind,lapply(seq_len(nClusters),function(i){
            return(epigraHMM:::generateHMM(P[[i]],M))
        }))

        # Defining bernoulli probabilities & counts

        mat <- do.call(cbind,lapply(clusters,function(i){
            prob <- (1-palloc)*depth[i]*(Z[,i]==1)/(2*sum(Z[,i]==1)) + palloc*depth[i]*(Z[,i]==2)/(2*sum(Z[,i]==2))
            return(1*(rbinom(M,2,prob)>0))
        }))

        # Creating SC object

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = Matrix::Matrix(mat,sparse = T,
                                                                                               dimnames = list(NULL,paste(paste0('Sim',sim),
                                                                                                                          paste0('Cluster',clusters),
                                                                                                                          paste0('Cell',cells),sep='_')))),
                                                         rowRanges = GenomicRanges::GRanges('chr1',IRanges::IRanges(1+500*0:(M-1),500+500*0:(M-1))),
                                                         colData = data.frame(Cluster = clusters))

        # Saving data
        save(sc,file = file.path(path,paste0('Simulation',sim,'.RData')),compress = 'xz')
    }
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.