#' Simulator of scChIP-seq read counts from alignment files
#'
#' Add description here
#'
#' @param label a character with the data label
#' @param path a character with the path where the simulated data will be saved
#' @param genome a character specifying the target genome
#' @param bamFile a character specifying the path of the target alignment file
#' @param binSize integer with the size of the bin window
#' @param baseSeqDepth integer with the baseline sequencing depth for all simulated cells
#' @param nCells integer with the number of cells to simulate
#' @param relSeqDepthPerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific relative sequencing depth with respect to \code{baseSeqDepth}.
#' For instance, if \code{relSeqDepthPerCell=rep(1,nCells)} then all cells are simulated with an equal
#' expected sequencing depth.
#' @param noisePerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific noise relative to the cell-specific expected sequencing depth. For instance,
#' if \code{noisePerCell=rep(0.05,nCells)} then 5% of the reads from the cell-specific sequencing depth
#' will be randomly assigned to any genomic bin (regardless if the bin is in a peak region or not).
#' @param chromosome vector of chromosomes to consider from the specified genome. Default is
#' \code{'chr1'}
#' @param basebinSize integer giving the base bin size to calculate the gold-standard set of peaks. It must be smaller than binSize.
#' Default 250
#' @param outputRanges a logical indicating whether (\code{TRUE}) or not (\code{FALSE}) to save the binned coordinates of the genome. Default is \code{TRUE}
#' @param excludeRegions a logical indicating whether to exclude blacklisted and gap regions or not. Default if FALSE
#' @param shuffleChunks either NULL or an integer specifying the width of the size of the chunks to shuffle. If an integer, it must be divisible by basebinSize
#' @param shuffleProp either NULL or the proportion of the genome to be shuffled
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @import data.table
#' @importFrom GenomicRanges tileGenome union
#' @import GenomeInfoDb
#' @importFrom Matrix sparseMatrix
#' @import stats
#' @importFrom rtracklayer browserSession getTable ucscTableQuery
#' @importFrom IRanges findOverlaps overlapsAny
#'
#' @export
#'
scChIPseqFromBam <- function(label,path,genome,bamFile,binSize,baseSeqDepth,nCells,relSeqDepthPerCell,
noisePerCell,chromosome = 'chr19',basebinSize = 250,
outputRanges = TRUE,excludeRegions = FALSE,shuffleChunks = 5000,shuffleProp = 0){
x = i = j = NULL
if(basebinSize>=binSize){stop("basebinSize must be smaller than binSize")}
if((is.null(shuffleChunks)+is.null(shuffleProp))==1){stop('shuffleChunks and shuffleProp must be either NULL or assume a non-null value')}
if(!is.null(shuffleChunks)){if(!shuffleChunks%%basebinSize==0){stop('shuffleChunks is not divisible by binSize')}}
if(!is.null(shuffleProp)){if(!(shuffleProp>=0 & shuffleProp<=1)){stop('shuffleProp must be between 0 and 1')}}
# Reading and subsetting the genome
if(genome %in% c('ce10','dm3','hg19','hg38','mm10')){
message('Loading internal genome\n')
siGenome <- get(paste0(genome,'.genome'), envir = asNamespace('scChIPseqsim'), inherits = FALSE)
grGenome.full <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)
grGenomeBinned.full <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
} else{
message('Loading external genome\n')
grGenome.full <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)
grGenomeBinned.full <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
}
grGenome.full <- GenomeInfoDb::keepSeqlevels(grGenome.full,value = paste0('chr',1:22),pruning.mode = 'coarse')
grGenome <- GenomeInfoDb::keepSeqlevels(grGenome.full,value = chromosome,pruning.mode = 'coarse')
grGenomeBinned <- GenomeInfoDb::keepSeqlevels(grGenomeBinned.full,value = chromosome,pruning.mode = 'coarse')
# Excluding gap and blacklisted regions
if(excludeRegions){
# Setting up gap track
if(genome %in% c('dm3','dm6','hg19','hg38','mm10')){
message('Loading internal gap\n')
grGaps <- get(paste0(genome,'.gap'), envir = asNamespace('scChIPseqsim'), inherits = FALSE)
} else{
message('Loading external gap\n')
session <- rtracklayer::browserSession()
GenomeInfoDb::genome(session) <- genome
grGaps <- GenomicRanges::makeGRangesFromDataFrame(
rtracklayer::getTable(rtracklayer::ucscTableQuery(session,table="gap")),
starts.in.df.are.0based = TRUE)
}
grGenome.full <- grGenome.full[!IRanges::overlapsAny(grGenome.full,grGaps)]
grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grGaps)]
grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grGaps)]
# Setting up blacklist track (swap the below for exists(paste0(genome,'.blacklist')))
if(genome %in% c('ce10','ce11','dm3','dm6','hg19','hg38','mm10')){
message('Loading internal blacklist\n')
grBlackList <- get(paste0(genome,'.blacklist'), envir = asNamespace('scChIPseqsim'), inherits = FALSE)
grGenome.full <- grGenome.full[!IRanges::overlapsAny(grGenome.full,grBlackList)]
grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grBlackList)]
grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grBlackList)]
}
} else{
grGaps <- GenomicRanges::GRanges()
grBlackList <- GenomicRanges::GRanges()
}
# Prinning extra windows
idx <- IRanges::overlapsAny(grGenome,grGenomeBinned)
idxBinned <- IRanges::overlapsAny(grGenomeBinned,grGenome)
grGenome <- grGenome[idx]
grGenomeBinned <- grGenomeBinned[idxBinned]
rm(idx,idxBinned)
# Matching the two ranges for aggregation of reads
dtHits <- data.table::as.data.table(IRanges::findOverlaps(grGenomeBinned,grGenome))
data.table::setnames(dtHits,c('i','subjectHits'))
data.table::setkey(dtHits,'subjectHits')
# Mapping counts
fragLength <- csaw::maximizeCcf(csaw::correlateReads(bam.files = bamFile,param=csaw::readParam(discard=GenomicRanges::union(grGaps,grBlackList))))
vecWeights <- bamsignals::bamCount(bampath = bamFile,gr = grGenome,verbose = FALSE,shift = fragLength/2)
vecWeights <- vecWeights/sum(vecWeights)
# Calculating the relative depth to the actual chromosome of interest
newBaseSeqDepth <- bamsignals::bamCount(bampath = bamFile,gr = grGenome.full,verbose = FALSE,shift = fragLength/2)
newBaseSeqDepth <- as.integer(baseSeqDepth*(sum(newBaseSeqDepth[as.character(GenomeInfoDb::seqnames(grGenome.full))%in%chromosome])/sum(newBaseSeqDepth)))
# Simulation parameters
nGenome <- length(grGenome)
nGenomeBinned <- length(grGenomeBinned)
cellSpecificDepth <- newBaseSeqDepth*relSeqDepthPerCell
# Shuffling positions
if(shuffleProp>0){
message('Shuffling chunks\n')
indexLength <- shuffleChunks/basebinSize
chunksToShuffle <- round(nGenome*shuffleProp/indexLength)
maxseq <- (length(grGenome)-indexLength+1)
grListFrom <- list(0)
grListTo <- list(0)
for(shuffleCount in 1:chunksToShuffle){
# Select a chunk from
sampledIndexFrom <- -1:0
sampledIndexTo <- -1:0
while(length(intersect(sampledIndexFrom,sampledIndexTo))>0 |
length(intersect(union(sampledIndexFrom,sampledIndexTo),union(unlist(grListFrom),unlist(grListTo))))>0){
sampledIndexFrom <- sample(maxseq,1,replace = F) + 0:(indexLength-1)
sampledIndexTo <- sample(maxseq,1,replace = F) + 0:(indexLength-1)
}
# Suffling vecWeights
aux <- vecWeights[sampledIndexTo]
vecWeights[sampledIndexTo] <- vecWeights[sampledIndexFrom]
vecWeights[sampledIndexFrom] <- aux
grListFrom[[shuffleCount]] <- sampledIndexFrom
grListTo[[shuffleCount]] <- sampledIndexTo
}
}
rm(indexLength,chunksToShuffle,grListFrom,grListTo,shuffleCount,sampledIndexFrom,sampledIndexTo,aux,maxseq)
# Creating the gold-standard
grGenomeBinned$reference <- dtHits[,x := vecWeights][, list(x = sum(x)), by = 'i'][['x']]
# Looping over cells to create counts
matCounts <- data.table::rbindlist(lapply(seq_len(nCells),FUN = function(cell){
# Creating counts
dtHits[,x := stats::rbinom(n = nGenome,size = 2,
prob = range01(cellSpecificDepth[cell]*(1-noisePerCell[cell])*vecWeights/2+cellSpecificDepth[cell]*noisePerCell[cell]/(2*nGenome)))]
# Print progress
message(paste0('Simulation for ',label,'. Progress: ',round(cell/nCells* 100),"%"),"\r")
# Aggregating reads into the binned ranges
return(data.table::data.table(dtHits[(x>0),][,list(x = sum(x)),by = 'i'],j = cell))
}))
# Transforming it into sparseMatrix
matCounts <- with(matCounts,Matrix::sparseMatrix(i = i,j = j,x = x,
dims = list(nGenomeBinned,nCells),
dimnames = list(NULL,paste(label,paste0('c',seq_len(nCells)),sep='_'))))
# Output
message("Done. The following file(s) has(have) been saved:\n")
if(outputRanges){
file <- file.path(path,paste(label,'ranges.RData',sep = '_'))
message(file,'\n')
save(grGenomeBinned,file = file,compress = 'xz')
}
file <- file.path(path,paste(label,'counts.RData',sep = '_'))
message(file,'\n')
save(matCounts,file = file,compress = 'xz')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.