R/scChIPseqFromPeaks.R

Defines functions scChIPseqFromPeaks

Documented in scChIPseqFromPeaks

#' Simulator of scChIP-seq read counts from pseudo-bulk profiles
#'
#' Add description here
#'
#' @param label a character with the data label
#' @param path a character with the path where the simulated data will be saved
#' @param genome a character specifying the target genome
#' @param grPeaks a GenomicRanges object with the coordinates of peaks from pseudo-bulk data
#' @param binSize integer with the size of the bin window
#' @param baseSeqDepth integer with the baseline sequencing depth for all simulated cells
#' @param nCells integer with the number of cells to simulate
#' @param relSeqDepthPerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific relative sequencing depth with respect to \code{baseSeqDepth}.
#' For instance, if \code{relSeqDepthPerCell=rep(1,nCells)} then all cells are simulated with an equal
#' expected sequencing depth.
#' @param noisePerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific noise relative to the cell-specific expected sequencing depth. For instance,
#' if \code{noisePerCell=rep(0.05,nCells)} then 5% of the reads from the cell-specific sequencing depth
#' will be randomly assigned to any genomic bin (regardless if the bin is in a peak region or not).
#' @param chromosome vector of chromosomes to consider from the specified genome. Default is
#' \code{c(paste0('chr',1:22),'chrX')}
#' @param basebinSize integer giving the base bin size to calculate the gold-standard set of peaks. It must be smaller than binSize.
#' Default 250
#' @param outputRanges a logical indicating whether (\code{TRUE}) or not (\code{FALSE}) to save the binned coordinates of the genome. Default is \code{TRUE}
#' @param excludeRegions a logical indicating whether to exclude blacklisted and gap regions or not. Default if FALSE
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @import data.table
#' @importFrom GenomicRanges tileGenome union
#' @import GenomeInfoDb
#' @importFrom Matrix sparseMatrix
#' @import stats
#' @importFrom rtracklayer browserSession getTable ucscTableQuery
#' @importFrom IRanges findOverlaps overlapsAny
#'
#' @export
#'
scChIPseqFromPeaks <- function(label,path,genome,grPeaks,binSize,baseSeqDepth,nCells,relSeqDepthPerCell,
                         noisePerCell,chromosome = c(paste0('chr',1:22),'chrX'),basebinSize = 250,
                         outputRanges = TRUE,excludeRegions = FALSE){

    x = i = j = NULL

    if(basebinSize>=binSize){stop("basebinSize must be smaller than binSize")}

    # Reading and subsetting the genome
    if(exists(paste0(genome,'.genome'))){
        cat('Loading internal genome\n')
        siGenome <- get(paste0(genome,'.genome'))

        grGenome <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)

        grGenomeBinned <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
    } else{
        cat('Loading external genome\n')
        grGenome <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)

        grGenomeBinned <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
    }

    grGenome <- GenomeInfoDb::keepSeqlevels(grGenome,value = chromosome,pruning.mode = 'coarse')
    grGenomeBinned <- GenomeInfoDb::keepSeqlevels(grGenomeBinned,value = chromosome,pruning.mode = 'coarse')

    # Excluding gap and blacklisted regions
    if(excludeRegions){

        # Setting up gap track
        if(exists(paste0(genome,'.gap'))){
            cat('Loading internal gap\n')
            grGaps <- get(paste0(genome,'.gap'))
        } else{
            cat('Loading external gap\n')
            session <- rtracklayer::browserSession()
            GenomeInfoDb::genome(session) <- genome
            grGaps <- GenomicRanges::makeGRangesFromDataFrame(
                rtracklayer::getTable(rtracklayer::ucscTableQuery(session,table="gap")),
                starts.in.df.are.0based = TRUE)
        }

        grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grGaps)]
        grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grGaps)]

        # Setting up blacklist track
        if(exists(paste0(genome,'.blacklist'))){
            cat('Loading internal blacklist\n')
            grBlackList <- get(paste0(genome,'.blacklist'))

            grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grBlackList)]
            grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grBlackList)]
        }
    }

    # Prinning extra windows
    idx <- IRanges::overlapsAny(grGenome,grGenomeBinned)
    idxBinned <- IRanges::overlapsAny(grGenomeBinned,grGenome)

    grGenome <- grGenome[idx]
    grGenomeBinned <- grGenomeBinned[idxBinned]

    # Matching the two ranges for aggregation of reads
    dtHits <- data.table::as.data.table(IRanges::findOverlaps(grGenomeBinned,grGenome))
    data.table::setnames(dtHits,c('i','subjectHits'))
    data.table::setkey(dtHits,'subjectHits')

    # Checking overlaps
    tmp <- data.table::data.table(i = which(IRanges::overlapsAny(grGenome,grPeaks)==T))

    # Simulation parameters
    nGenome <- length(grGenome)
    nOverlap <- nrow(tmp)
    nGenomeBinned <- length(grGenomeBinned)

    # Creating the gold-standard
    grGenomeBinned$reference <- F
    grGenomeBinned$reference[dtHits[tmp$i,][,.N,by='i']$i] <- T

    # Looping over cells to create counts
    matCounts <- data.table::rbindlist(lapply(seq_len(nCells),FUN = function(cell){
        # Creating noise
        ## We will randomly add noise in genomic windows (peak or not)
        ## We assume that noise in bins ~ Binomial(2,q)
        ## Hence, the cell-specific baseline amount of noise reads is 2*length(grGenome)*q
        ## Assuming that the cell-specific baseline expected amount of noise reads is (baseSeqDepth*relSeqDepthPerCell[cell])*noisePerCell[cell]
        ## Therefore, q = (baseSeqDepth*relSeqDepthPerCell[cell])*noisePerCell[cell]/(2*(length(grGenome)))

        dtHits[,x := stats::rbinom(n = nGenome,size = 2,prob = (baseSeqDepth*relSeqDepthPerCell[cell])*noisePerCell[cell]/(2*nGenome+.Machine$double.xmin))]

        # Creating counts
        ## We assume that reads from every peak bin ~ Binomial(2,p)
        ## Hence, the cell-specific baseline expected depth is 2*nrow(tmp)*p.
        ## Asuming that the cell-specific baseline expected depth is (baseSeqDepth*relSeqDepthPerCell[cell])*(1-noisePerCell[cell])
        ## Therefore, p = (baseSeqDepth*relSeqDepthPerCell[cell])*(1-noisePerCell[cell])/(2*nrow(tmp))

        dtHits[tmp$i, x := x + stats::rbinom(n = nOverlap,size = 2,prob = (baseSeqDepth*relSeqDepthPerCell[cell])*(1-noisePerCell[cell])/(2*nOverlap))]

        # Print progress

        cat(paste0('Simulation for ',label,'. Progress: ',round(cell/nCells* 100),"%"),"\r")

        # Aggregating reads into the binned ranges
        return(data.table::data.table(dtHits[,list(x = sum(x)),by = 'i'][(x>0),],j = cell))
    }))

    # Transforming it into sparseMatrix

    matCounts <- with(matCounts,Matrix::sparseMatrix(i = i,j = j,x = x,
                                             dims = list(nGenomeBinned,nCells),
                                             dimnames = list(NULL,paste(label,paste0('c',seq_len(nCells)),sep='_'))))

    cat("Done. The following file(s) has(have) been saved:\n")
    if(outputRanges){
        file <- file.path(path,paste(label,'ranges.RData',sep = '_'))
        cat(file,'\n')
        save(grGenomeBinned,file = file,compress = 'xz')
    }

    file <- file.path(path,paste(label,'counts.RData',sep = '_'))
    cat(file,'\n')
    save(matCounts,file = file,compress = 'xz')
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.