#' Simulator of scChIP-seq read counts from pseudo-bulk profiles
#'
#' Add description here
#'
#' @param label a character with the data label
#' @param path a character with the path where the simulated data will be saved
#' @param genome a character specifying the target genome
#' @param grPeaks a GenomicRanges object with the coordinates of peaks from pseudo-bulk data
#' @param binSize integer with the size of the bin window
#' @param baseSeqDepth integer with the baseline sequencing depth for all simulated cells
#' @param nCells integer with the number of cells to simulate
#' @param relSeqDepthPerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific relative sequencing depth with respect to \code{baseSeqDepth}.
#' For instance, if \code{relSeqDepthPerCell=rep(1,nCells)} then all cells are simulated with an equal
#' expected sequencing depth.
#' @param noisePerCell vector of size \code{nCells} with positive scalars representing the
#' cell-specific noise relative to the cell-specific expected sequencing depth. For instance,
#' if \code{noisePerCell=rep(0.05,nCells)} then 5% of the reads from the cell-specific sequencing depth
#' will be randomly assigned to any genomic bin (regardless if the bin is in a peak region or not).
#' @param chromosome vector of chromosomes to consider from the specified genome. Default is
#' \code{c(paste0('chr',1:22),'chrX')}
#' @param basebinSize integer giving the base bin size to calculate the gold-standard set of peaks. It must be smaller than binSize.
#' Default 250
#' @param outputRanges a logical indicating whether (\code{TRUE}) or not (\code{FALSE}) to save the binned coordinates of the genome. Default is \code{TRUE}
#' @param excludeRegions a logical indicating whether to exclude blacklisted and gap regions or not. Default if FALSE
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @import data.table
#' @importFrom GenomicRanges tileGenome union
#' @import GenomeInfoDb
#' @importFrom Matrix sparseMatrix
#' @import stats
#' @importFrom rtracklayer browserSession getTable ucscTableQuery
#' @importFrom IRanges findOverlaps overlapsAny
#'
#' @export
#'
scChIPseqFromPeaks <- function(label,path,genome,grPeaks,binSize,baseSeqDepth,nCells,relSeqDepthPerCell,
noisePerCell,chromosome = c(paste0('chr',1:22),'chrX'),basebinSize = 250,
outputRanges = TRUE,excludeRegions = FALSE){
x = i = j = NULL
if(basebinSize>=binSize){stop("basebinSize must be smaller than binSize")}
# Reading and subsetting the genome
if(exists(paste0(genome,'.genome'))){
cat('Loading internal genome\n')
siGenome <- get(paste0(genome,'.genome'))
grGenome <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)
grGenomeBinned <- GenomicRanges::tileGenome(seqlengths = siGenome,tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
} else{
cat('Loading external genome\n')
grGenome <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = basebinSize,cut.last.tile.in.chrom = TRUE)
grGenomeBinned <- GenomicRanges::tileGenome(seqlengths = GenomeInfoDb::Seqinfo(genome=genome),tilewidth = binSize,cut.last.tile.in.chrom = TRUE)
}
grGenome <- GenomeInfoDb::keepSeqlevels(grGenome,value = chromosome,pruning.mode = 'coarse')
grGenomeBinned <- GenomeInfoDb::keepSeqlevels(grGenomeBinned,value = chromosome,pruning.mode = 'coarse')
# Excluding gap and blacklisted regions
if(excludeRegions){
# Setting up gap track
if(exists(paste0(genome,'.gap'))){
cat('Loading internal gap\n')
grGaps <- get(paste0(genome,'.gap'))
} else{
cat('Loading external gap\n')
session <- rtracklayer::browserSession()
GenomeInfoDb::genome(session) <- genome
grGaps <- GenomicRanges::makeGRangesFromDataFrame(
rtracklayer::getTable(rtracklayer::ucscTableQuery(session,table="gap")),
starts.in.df.are.0based = TRUE)
}
grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grGaps)]
grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grGaps)]
# Setting up blacklist track
if(exists(paste0(genome,'.blacklist'))){
cat('Loading internal blacklist\n')
grBlackList <- get(paste0(genome,'.blacklist'))
grGenome <- grGenome[!IRanges::overlapsAny(grGenome,grBlackList)]
grGenomeBinned <- grGenomeBinned[!IRanges::overlapsAny(grGenomeBinned,grBlackList)]
}
}
# Prinning extra windows
idx <- IRanges::overlapsAny(grGenome,grGenomeBinned)
idxBinned <- IRanges::overlapsAny(grGenomeBinned,grGenome)
grGenome <- grGenome[idx]
grGenomeBinned <- grGenomeBinned[idxBinned]
# Matching the two ranges for aggregation of reads
dtHits <- data.table::as.data.table(IRanges::findOverlaps(grGenomeBinned,grGenome))
data.table::setnames(dtHits,c('i','subjectHits'))
data.table::setkey(dtHits,'subjectHits')
# Checking overlaps
tmp <- data.table::data.table(i = which(IRanges::overlapsAny(grGenome,grPeaks)==T))
# Simulation parameters
nGenome <- length(grGenome)
nOverlap <- nrow(tmp)
nGenomeBinned <- length(grGenomeBinned)
# Creating the gold-standard
grGenomeBinned$reference <- F
grGenomeBinned$reference[dtHits[tmp$i,][,.N,by='i']$i] <- T
# Looping over cells to create counts
matCounts <- data.table::rbindlist(lapply(seq_len(nCells),FUN = function(cell){
# Creating noise
## We will randomly add noise in genomic windows (peak or not)
## We assume that noise in bins ~ Binomial(2,q)
## Hence, the cell-specific baseline amount of noise reads is 2*length(grGenome)*q
## Assuming that the cell-specific baseline expected amount of noise reads is (baseSeqDepth*relSeqDepthPerCell[cell])*noisePerCell[cell]
## Therefore, q = (baseSeqDepth*relSeqDepthPerCell[cell])*noisePerCell[cell]/(2*(length(grGenome)))
dtHits[,x := stats::rbinom(n = nGenome,size = 2,prob = (baseSeqDepth*relSeqDepthPerCell[cell])*noisePerCell[cell]/(2*nGenome+.Machine$double.xmin))]
# Creating counts
## We assume that reads from every peak bin ~ Binomial(2,p)
## Hence, the cell-specific baseline expected depth is 2*nrow(tmp)*p.
## Asuming that the cell-specific baseline expected depth is (baseSeqDepth*relSeqDepthPerCell[cell])*(1-noisePerCell[cell])
## Therefore, p = (baseSeqDepth*relSeqDepthPerCell[cell])*(1-noisePerCell[cell])/(2*nrow(tmp))
dtHits[tmp$i, x := x + stats::rbinom(n = nOverlap,size = 2,prob = (baseSeqDepth*relSeqDepthPerCell[cell])*(1-noisePerCell[cell])/(2*nOverlap))]
# Print progress
cat(paste0('Simulation for ',label,'. Progress: ',round(cell/nCells* 100),"%"),"\r")
# Aggregating reads into the binned ranges
return(data.table::data.table(dtHits[,list(x = sum(x)),by = 'i'][(x>0),],j = cell))
}))
# Transforming it into sparseMatrix
matCounts <- with(matCounts,Matrix::sparseMatrix(i = i,j = j,x = x,
dims = list(nGenomeBinned,nCells),
dimnames = list(NULL,paste(label,paste0('c',seq_len(nCells)),sep='_'))))
cat("Done. The following file(s) has(have) been saved:\n")
if(outputRanges){
file <- file.path(path,paste(label,'ranges.RData',sep = '_'))
cat(file,'\n')
save(grGenomeBinned,file = file,compress = 'xz')
}
file <- file.path(path,paste(label,'counts.RData',sep = '_'))
cat(file,'\n')
save(matCounts,file = file,compress = 'xz')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.