Nothing
#' @title \code{calculate_intensities}
#'
#' @description \code{calculate_intensities} is an R helper function, part of the single-cell peak calling
#' algorithm MOCHA by (Zaim, Pebworth, et. al. 2022) that calculates the design matrix, X,
#' used to make peak calls.
#'
#' @param fragMat a genomic ranges object containing the fragments
#' @param candidatePeaks a genomic ranges object indicating where to call peaks. This could be a pre-selected peak set or a dynamic bins approach to searching the whole genome.
#' @param totalFrags a numeric normalizing value, indicating the total # of fragments for a cell pop'n in a sample
#'
#' @return a data.table, countsByBin, that returns the two intensity parameters required
#' to calculate the probability of a (+) peak
#'
#' @details The technical details of the algorithm are found in XX.
#'
#' @usage calculate_intensities(fragMat, candidatePeaks, FALSE)
#'
#' @import data.table
#'
#' @noRd
#'
calculate_intensities <- function(fragMat,
candidatePeaks,
totalFrags,
cellCol = "RG",
verbose = FALSE) {
if (!methods::is(fragMat, "GRanges")) {
stop("Input fragMat must be a Genomic Ranges (GRanges) object")
}
if (!methods::is(candidatePeaks, "GRanges")) {
stop("Input candidatePeaks must be a Genomic Ranges (GRanges) object")
}
if (!is.integer(totalFrags)) {
stop("totalFrags user-input must be an integer denoting the total # of frags")
}
bin <- cell <- normedFrags <- NULL
### set normalization
### scale for fragment counts
normScale <- 10^9
## transform fragments into data.table
fragMat_dt <- data.table::as.data.table(fragMat)
## transform granges to data.table
candidatePeaksDF <- data.table::as.data.table(candidatePeaks)
candidatePeaksDF$bin <- paste(candidatePeaksDF$seqnames, ":", candidatePeaksDF$start,
"-", candidatePeaksDF$end,
sep = ""
)
### identify overlaps between
### fragments & candidate peak regions
fragsPerBin <- GenomicRanges::findOverlaps(fragMat,
candidatePeaks,
minoverlap = 0
)
### Convert overlap matrix into DT
fragsPerBin <- data.table::as.data.table(fragsPerBin)
### Get Cell Counts
numCells <- length(unique(fragMat_dt[[cellCol]]))
### Label Cell Name
fragsPerBin$cell <- fragMat_dt[[cellCol]][fragsPerBin$queryHits]
### Get Window, Chr, Start, End and Strand information
### From the Overlaps matrix
fragsPerBin$bin <- candidatePeaksDF$bin[fragsPerBin$subjectHits]
fragsPerBin$chr <- candidatePeaksDF$seqnames[fragsPerBin$subjectHits]
fragsPerBin$strand <- candidatePeaksDF$strand[fragsPerBin$subjectHits]
fragsPerBin$start <- candidatePeaksDF$start[fragsPerBin$subjectHits]
fragsPerBin$end <- candidatePeaksDF$end[fragsPerBin$subjectHits]
fragsPerBin$width <- candidatePeaksDF$width[fragsPerBin$subjectHits]
### Convert frags per bin matrix into
### a data.table for obtaining cell counts
fragsPerBin <- data.table::as.data.table(fragsPerBin)
### get cell count matrix
cell_counts <- fragsPerBin[, .N, by = list(bin, cell)]
### include normalized counts
cell_counts$normedFrags <- cell_counts$N / (totalFrags / normScale)
#### doing bin-level summaries
setkey(cell_counts, bin)
#### calculate pseudoBulk intensity features
countsByBin <- cell_counts[, list(
TotalIntensity = sum(normedFrags),
maxIntensity = max(normedFrags)
), by = bin]
### join to the original dynamic bins
### to retain original variables
countsByBin <- dplyr::left_join(candidatePeaksDF[, c("strand", "seqnames", "start", "end", "bin")],
countsByBin,
by = "bin"
)
### if the dynamic bins is calculated
### on a cohort, and applied to samples
### some entries will be NAs
### and this will set NAs to 0 counts
countsByBin[is.na(countsByBin)] <- 0
### order by chromosome
countsByBin <- countsByBin[order(countsByBin$seqnames, countsByBin$start, decreasing = FALSE), ]
countsByBin$numCells <- numCells
chromosomeOrder <- c(
"chr1", "chr2", "chr3", "chr4",
"chr5", "chr6", "chr7", "chr8",
"chr9", "chr10", "chr11", "chr12",
"chr13", "chr14", "chr15", "chr16",
"chr17", "chr18", "chr19", "chr20",
"chr21", "chr22", "chrX", "chrY"
)
### Order Chromosome Names
countsByBin$seqnames <- factor(countsByBin$seqnames, levels = chromosomeOrder, ordered = T)
countsByBin <- countsByBin[order(countsByBin$seqnames, countsByBin$start, decreasing = FALSE), ]
### retain only features of interest and in order required
countsByBin <- countsByBin[, c(
"bin", "seqnames", "start", "end", "strand",
"TotalIntensity", "maxIntensity", "numCells"
), with = F]
### Rename "bin" identifier as "tileID"
colnames(countsByBin)[1] <- "tileID"
if (verbose) {
message(" ---> Analysis finished on ", numCells, " cells")
}
return(countsByBin)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.