R/scChIPseqFromBam.R

Defines functions scChIPseqFromBam

Documented in scChIPseqFromBam

#' Simulator of scChIP-seq read counts from alignment files
#'
#' Add description here
#'
#' @param label a character with the data label
#' @param path a character with the path where the simulated data will be saved
#' @param genome a character specifying the target genome
#' @param bamFile a character specifying the path of the target alignment file
#' @param binSize integer with the size of the bin window
#' @param baseSeqDepth integer with the baseline sequencing depth for all simulated cells
#' @param nCells integer with the number of cells to simulate
#' @param relSeqDepthPerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific relative sequencing depth with respect to \code{baseSeqDepth}.
#' For instance, if \code{relSeqDepthPerCell=rep(1,nCells)} then all cells are simulated with an equal
#' expected sequencing depth.
#' @param noisePerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific noise relative to the cell-specific expected sequencing depth. For instance,
#' if \code{noisePerCell=rep(0.05,nCells)} then 5% of the reads from the cell-specific sequencing depth
#' will be randomly assigned to any genomic bin (regardless if the bin is in a peak region or not).
#' @param chromosome vector of chromosomes to consider from the specified genome. Default is
#' \code{'chr1'}
#' @param basebinSize integer giving the base bin size to calculate the gold-standard set of peaks. It must be smaller than binSize.
#' Default 250
#' @param outputRanges a logical indicating whether (\code{TRUE}) or not (\code{FALSE}) to save the binned coordinates of the genome. Default is \code{TRUE}
#' @param excludeRegions a logical indicating whether to exclude blacklisted and gap regions or not. Default if FALSE
#' @param shuffleChunks either NULL or an integer specifying the width of the size of the chunks to shuffle. If an integer, it must be divisible by basebinSize
#' @param shuffleProp either NULL or the proportion of the genome to be shuffled
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @import data.table
#' @importFrom GenomicRanges tileGenome union
#' @import GenomeInfoDb
#' @importFrom Matrix sparseMatrix
#' @import stats
#' @importFrom rtracklayer browserSession getTable ucscTableQuery
#' @importFrom IRanges findOverlaps overlapsAny
#'
#' @export
#'
scChIPseqFromBam <- function(label,path,genome,bamFile,binSize,baseSeqDepth,nCells,relSeqDepthPerCell,
                             noisePerCell,chromosome = 'chr19',basebinSize = 250,
                             outputRanges = TRUE,excludeRegions = FALSE,shuffleChunks = 5000,shuffleProp = 0){

    x = i = j = NULL

    if(basebinSize>=binSize){stop("basebinSize must be smaller than binSize")}

    if((is.null(shuffleChunks)+is.null(shuffleProp))==1){stop('shuffleChunks and shuffleProp must be either NULL or assume a non-null value')}

    if(!is.null(shuffleChunks)){if(!shuffleChunks%%basebinSize==0){stop('shuffleChunks is not divisible by binSize')}}

    if(!is.null(shuffleProp)){if(!(shuffleProp>=0 & shuffleProp<=1)){stop('shuffleProp must be between 0 and 1')}}

    # Reading and subsetting the genome
    if(genome %in% c('ce10','dm3','hg19','hg38','mm10')){
        message('Loading internal genome\n')
        siGenome <- get(paste0(genome,'.genome'), envir = asNamespace('scChIPseqsim'), inherits = FALSE)
        grGenome.full <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)
        grGenomeBinned.full <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
    } else{
        message('Loading external genome\n')
        grGenome.full <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)
        grGenomeBinned.full <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
    }

    grGenome.full <- GenomeInfoDb::keepSeqlevels(grGenome.full,value = paste0('chr',1:22),pruning.mode = 'coarse')
    grGenome <- GenomeInfoDb::keepSeqlevels(grGenome.full,value = chromosome,pruning.mode = 'coarse')
    grGenomeBinned <- GenomeInfoDb::keepSeqlevels(grGenomeBinned.full,value = chromosome,pruning.mode = 'coarse')

    # Excluding gap and blacklisted regions
    if(excludeRegions){
        # Setting up gap track
        if(genome %in% c('dm3','dm6','hg19','hg38','mm10')){
            message('Loading internal gap\n')
            grGaps <- get(paste0(genome,'.gap'), envir = asNamespace('scChIPseqsim'), inherits = FALSE)
        } else{
            message('Loading external gap\n')
            session <- rtracklayer::browserSession()
            GenomeInfoDb::genome(session) <- genome
            grGaps <- GenomicRanges::makeGRangesFromDataFrame(
                rtracklayer::getTable(rtracklayer::ucscTableQuery(session,table="gap")),
                starts.in.df.are.0based = TRUE)
        }

        grGenome.full <- grGenome.full[!IRanges::overlapsAny(grGenome.full,grGaps)]

        grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grGaps)]
        grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grGaps)]

        # Setting up blacklist track (swap the below for exists(paste0(genome,'.blacklist')))
        if(genome %in% c('ce10','ce11','dm3','dm6','hg19','hg38','mm10')){
            message('Loading internal blacklist\n')
            grBlackList <- get(paste0(genome,'.blacklist'), envir = asNamespace('scChIPseqsim'), inherits = FALSE)

            grGenome.full <- grGenome.full[!IRanges::overlapsAny(grGenome.full,grBlackList)]

            grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grBlackList)]
            grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grBlackList)]
        }
    } else{
        grGaps <- GenomicRanges::GRanges()
        grBlackList <- GenomicRanges::GRanges()
    }

    # Prinning extra windows
    idx <- IRanges::overlapsAny(grGenome,grGenomeBinned)
    idxBinned <- IRanges::overlapsAny(grGenomeBinned,grGenome)

    grGenome <- grGenome[idx]
    grGenomeBinned <- grGenomeBinned[idxBinned]

    rm(idx,idxBinned)

    # Matching the two ranges for aggregation of reads
    dtHits <- data.table::as.data.table(IRanges::findOverlaps(grGenomeBinned,grGenome))
    data.table::setnames(dtHits,c('i','subjectHits'))
    data.table::setkey(dtHits,'subjectHits')

    # Mapping counts
    fragLength <- csaw::maximizeCcf(csaw::correlateReads(bam.files = bamFile,param=csaw::readParam(discard=GenomicRanges::union(grGaps,grBlackList))))
    vecWeights <- bamsignals::bamCount(bampath = bamFile,gr = grGenome,verbose = FALSE,shift = fragLength/2)
    vecWeights <- vecWeights/sum(vecWeights)

    # Calculating the relative depth to the actual chromosome of interest

    newBaseSeqDepth <- bamsignals::bamCount(bampath = bamFile,gr = grGenome.full,verbose = FALSE,shift = fragLength/2)
    newBaseSeqDepth <- as.integer(baseSeqDepth*(sum(newBaseSeqDepth[as.character(GenomeInfoDb::seqnames(grGenome.full))%in%chromosome])/sum(newBaseSeqDepth)))

    # Simulation parameters
    nGenome <- length(grGenome)
    nGenomeBinned <- length(grGenomeBinned)
    cellSpecificDepth <- newBaseSeqDepth*relSeqDepthPerCell

    # Shuffling positions
    if(shuffleProp>0){
        message('Shuffling chunks\n')
        indexLength <- shuffleChunks/basebinSize
        chunksToShuffle <- round(nGenome*shuffleProp/indexLength)
        maxseq <- (length(grGenome)-indexLength+1)

        grListFrom <- list(0)
        grListTo <- list(0)
        for(shuffleCount in 1:chunksToShuffle){
            # Select a chunk from
            sampledIndexFrom <- -1:0
            sampledIndexTo <- -1:0

            while(length(intersect(sampledIndexFrom,sampledIndexTo))>0 |
                  length(intersect(union(sampledIndexFrom,sampledIndexTo),union(unlist(grListFrom),unlist(grListTo))))>0){
                sampledIndexFrom <- sample(maxseq,1,replace = F) + 0:(indexLength-1)
                sampledIndexTo <- sample(maxseq,1,replace = F) + 0:(indexLength-1)
            }

            # Suffling vecWeights

            aux <- vecWeights[sampledIndexTo]
            vecWeights[sampledIndexTo] <- vecWeights[sampledIndexFrom]
            vecWeights[sampledIndexFrom] <- aux

            grListFrom[[shuffleCount]] <- sampledIndexFrom
            grListTo[[shuffleCount]] <- sampledIndexTo
        }
    }
    rm(indexLength,chunksToShuffle,grListFrom,grListTo,shuffleCount,sampledIndexFrom,sampledIndexTo,aux,maxseq)

    # Creating the gold-standard
    grGenomeBinned$reference <- dtHits[,x := vecWeights][, list(x = sum(x)), by = 'i'][['x']]

    # Looping over cells to create counts
    matCounts <- data.table::rbindlist(lapply(seq_len(nCells),FUN = function(cell){

        # Creating counts

        dtHits[,x := stats::rbinom(n = nGenome,size = 2,
                                   prob = range01(cellSpecificDepth[cell]*(1-noisePerCell[cell])*vecWeights/2+cellSpecificDepth[cell]*noisePerCell[cell]/(2*nGenome)))]

        # Print progress

        message(paste0('Simulation for ',label,'. Progress: ',round(cell/nCells* 100),"%"),"\r")

        # Aggregating reads into the binned ranges
        return(data.table::data.table(dtHits[(x>0),][,list(x = sum(x)),by = 'i'],j = cell))

    }))

    # Transforming it into sparseMatrix

    matCounts <- with(matCounts,Matrix::sparseMatrix(i = i,j = j,x = x,
                                                     dims = list(nGenomeBinned,nCells),
                                                     dimnames = list(NULL,paste(label,paste0('c',seq_len(nCells)),sep='_'))))

    # Output

    message("Done. The following file(s) has(have) been saved:\n")
    if(outputRanges){
        file <- file.path(path,paste(label,'ranges.RData',sep = '_'))
        message(file,'\n')
        save(grGenomeBinned,file = file,compress = 'xz')
    }

    file <- file.path(path,paste(label,'counts.RData',sep = '_'))
    message(file,'\n')
    save(matCounts,file = file,compress = 'xz')
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.