#' DsATAC
#'
#' A class for storing ATAC-seq accessibility data.
#' Inherits from \code{\linkS4class{DsAcc}}.
#'
#' @section Slots:
#' \describe{
#' \item{\code{fragments}}{
#' List (one element per sample) of \code{GRanges} objects storing sequencing fragments.
#' Alternatively, elements may be pointers (file paths) to files in which this data is stored
#' as R data objects (see \code{diskDump.fragments}).
#' }
#' \item{\code{counts}}{
#' List of count matrices for each summarized region type (dimension: regions X samples).
#' Depending on the settings for the slots \code{diskDump} and \code{sparseCounts}, the matrices are
#' either (a) regular matrices, (b) \code{HDF5Array}/\code{DelayedArray} or (c) sparse matrices.
#' }
#' \item{\code{countTransform}}{
#' list of character vectors specifying which transformations have been applied to the count matrices
#' }
#' \item{\code{sparseCounts}}{
#' Flag indicating whether count data will be stored as sparse matrices rather than regular matrices
#' }
#' \item{\code{diskDump.fragments}}{
#' Flag indicating whether fragment data will be kept on disk rather than in main memory.
#' }
#' \item{\code{diskDump.fragments.nSamplesPerFile}}{
#' Integer specifying how many samples are stored per file when fragment data is disk-backed.
#' }
#' }
#' @name DsATAC-class
#' @rdname DsATAC-class
#' @author Fabian Mueller
#' @exportClass DsATAC
setClass("DsATAC",
slots = list(
fragments = "list",
counts = "list",
countTransform = "list",
sparseCounts = "logical",
diskDump.fragments = "logical",
diskDump.fragments.nSamplesPerFile = "integer"
),
contains = "DsAcc",
package = "ChrAccR"
)
# Initialize a DsATAC object by populating its slots from the supplied values.
# One empty transformation record (character(0)) is created per count matrix,
# named after the corresponding region types.
setMethod("initialize", "DsATAC",
	function(
		.Object,
		fragments,
		coord,
		counts,
		sampleAnnot,
		genome,
		diskDump,
		diskDump.fragments,
		diskDump.fragments.nSamplesPerFile,
		sparseCounts
	) {
		# per-region-type count data and the record of applied count transformations
		.Object@counts <- counts
		.Object@countTransform <- structure(
			rep(list(character(0)), length(counts)),
			names = names(counts)
		)
		# fragment-level data and genomic coordinates
		.Object@fragments <- fragments
		.Object@coord <- coord
		# sample metadata and genome assembly
		.Object@sampleAnnot <- sampleAnnot
		.Object@genome <- genome
		# storage configuration flags
		.Object@diskDump <- diskDump
		.Object@diskDump.fragments <- diskDump.fragments
		.Object@diskDump.fragments.nSamplesPerFile <- diskDump.fragments.nSamplesPerFile
		.Object@sparseCounts <- sparseCounts
		# record the package version for provenance tracking
		.Object@pkgVersion <- packageVersion("ChrAccR")
		.Object
	}
)
#' @param sampleAnnot \code{data.frame} object containing sample annotation
#' @param genome character string containing genome assembly
#' @param diskDump should count data be kept on disk (HDF5) rather than in memory?
#' @param diskDump.fragments should fragment data be kept on disk rather than in memory?
#' @param sparseCounts should count data be stored as sparse matrices?
#' @noRd
DsATAC <- function(sampleAnnot, genome, diskDump=FALSE, diskDump.fragments=TRUE, sparseCounts=FALSE){
	# construct an empty dataset: no fragments, coordinates or counts yet;
	# named arguments map directly onto the formals of the initialize method
	new(
		"DsATAC",
		fragments = list(),
		coord = list(),
		counts = list(),
		sampleAnnot = sampleAnnot,
		genome = genome,
		diskDump = diskDump,
		diskDump.fragments = diskDump.fragments,
		diskDump.fragments.nSamplesPerFile = 1L,
		sparseCounts = sparseCounts
	)
}
################################################################################
# Getters
################################################################################
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getCounts")) {
	setGeneric(
		"getCounts",
		function(.object, ...) standardGeneric("getCounts"),
		signature=c(".object")
	)
}
#' getCounts-methods
#'
#' Return table of count values
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param type character string specifying the region type
#' @param i (optional) row (region) indices
#' @param j (optional) column (sample) indices
#' @param asMatrix return a matrix object instead of the internal representation
#' @param naIsZero should \code{NA}s in the count matrix be considered 0 value (instead of unknown/missing)
#' @param allowSparseMatrix if \code{asMatrix}: allow for sparse matrices as returned data format
#' @return Matrix containing counts for
#' each region and sample
#'
#' @rdname getCounts-DsATAC-method
#' @docType methods
#' @aliases getCounts
#' @aliases getCounts,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getCounts",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		type,
		i=NULL,
		j=NULL,
		asMatrix=TRUE,
		naIsZero=TRUE,
		allowSparseMatrix=FALSE
	) {
		if (!is.element(type, getRegionTypes(.object))) logger.error(c("Unsupported region type:", type))
		cm <- .object@counts[[type]]
		# subset on the internal representation first (cheap for sparse/DelayedArray data)
		# NOTE: earlier workarounds for DelayedArray indexing performance were removed;
		# the underlying issue is fixed as of HDF5Array 1.11.11
		# (https://github.com/Bioconductor/DelayedArray/issues/13)
		if (!is.null(i)) cm <- cm[i, , drop=FALSE]
		if (!is.null(j)) cm <- cm[, j, drop=FALSE]
		if (asMatrix && !is.matrix(cm)){
			if (.object@diskDump){
				# realize disk-backed (HDF5Array/DelayedArray) data in memory
				cm <- as.matrix(cm)
			} else if (.object@sparseCounts && !allowSparseMatrix){
				# densify sparse counts; structural zeros become NA when zeros are
				# to be treated as unknown/missing
				cm <- as.matrix(cm)
				if (!naIsZero){
					cm[cm == 0] <- NA
				}
			}
		}
		if (naIsZero){
			# interpret missing values as zero counts
			cm[is.na(cm)] <- 0
		}
		return(cm)
	}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getCountsSE")) {
	setGeneric(
		"getCountsSE",
		function(.object, ...) standardGeneric("getCountsSE"),
		signature=c(".object")
	)
}
#' getCountsSE-methods
#'
#' Return a \code{SummarizedExperiment} object of count values
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param type character string specifying the region type
#' @param naIsZero should \code{NA}s in the count matrix be considered 0 value (instead of unknown/missing)
#' @return \code{SummarizedExperiment} containing counts for each region and sample
#'
#' @rdname getCountsSE-DsATAC-method
#' @docType methods
#' @aliases getCountsSE
#' @aliases getCountsSE,DsATAC-method
#' @author Fabian Mueller
#' @export
#'
#' @examples
#' \dontrun{
#' dsa <- ChrAccRex::loadExample("dsAtac_ia_example")
#' se <- getCountsSE(dsa, "IA_prog_peaks")
#' se
#' }
setMethod("getCountsSE",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		type,
		naIsZero=TRUE
	) {
		if (!is.element(type, getRegionTypes(.object))) logger.error(c("Unsupported region type:", type))
		# assemble the three SummarizedExperiment components:
		# assay matrix (sparse allowed), region coordinates, sample annotation
		countMat <- ChrAccR::getCounts(.object, type, asMatrix=TRUE, naIsZero=naIsZero, allowSparseMatrix=TRUE)
		regionGr <- getCoord(.object, type)
		annotDf <- S4Vectors::DataFrame(getSampleAnnot(.object))
		se <- SummarizedExperiment::SummarizedExperiment(
			assays=list(counts=countMat),
			rowRanges=regionGr,
			colData=annotDf
		)
		return(se)
	}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getFragmentGr")) {
	setGeneric(
		"getFragmentGr",
		function(.object, ...) standardGeneric("getFragmentGr"),
		signature=c(".object")
	)
}
#' getFragmentGr-methods
#'
#' Return a \code{GRanges} object of fragment data for a given sample
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param sampleId sample identifier
#' @return \code{GRanges} object containing fragment data
#'
#' @rdname getFragmentGr-DsATAC-method
#' @docType methods
#' @aliases getFragmentGr
#' @aliases getFragmentGr,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getFragmentGr",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		sampleId
	) {
		if (!is.element(sampleId, getSamples(.object))) logger.error(c("Invalid samples:", sampleId))
		if (!is.element(sampleId, names(.object@fragments))) logger.error(c("Object does not contain insertion information for sample:", sampleId))
		fragGr <- .object@fragments[[sampleId]]
		if (is.character(fragGr)){
			# the slot holds a file path --> fragment data was dumped to disk; load it
			if (file.exists(fragGr)) {
				fragGr <- readRDS(fragGr)
				# a file can store multiple samples as a list/GRangesList --> extract by sample id
				# [FIX] inherits() replaces class()=="..." string comparison; it also
				# covers subclasses such as CompressedGRangesList
				if (is.list(fragGr) || inherits(fragGr, "GRangesList")){
					fragGr <- fragGr[[sampleId]]
				}
			} else {
				logger.error(c("Could not load fragment data from file:", fragGr))
			}
		}
		return(fragGr)
	}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getFragmentGrl")) {
setGeneric(
"getFragmentGrl",
function(.object, ...) standardGeneric("getFragmentGrl"),
signature=c(".object")
)
}
#' getFragmentGrl-methods
#'
#' Return a list of \code{GRanges} objects of fragment data for a given set of samples
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param sampleIds sample identifiers
#' @param asGRangesList should a \code{GRangesList} object be returned instead of a regular \code{list}
#' @return A named list of \code{GRanges} objects containing fragment data
#'
#' @rdname getFragmentGrl-DsATAC-method
#' @docType methods
#' @aliases getFragmentGrl
#' @aliases getFragmentGrl,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getFragmentGrl",
signature(
.object="DsATAC"
),
function(
.object,
sampleIds,
asGRangesList=FALSE
) {
# validate that all requested samples exist and have fragment data
if (!all(sampleIds %in% getSamples(.object))) logger.error(c("Invalid samples:", paste(setdiff(sampleIds, getSamples(.object)), collapse=";")))
if (!all(sampleIds %in% names(.object@fragments))) logger.error(c("Object does not contain insertion information for samples:", paste(setdiff(sampleIds, names(.object@fragments)), collapse=";")))
# elements are either GRanges (in memory) or character file paths (disk-dumped)
fragL <- .object@fragments[sampleIds]
# load disk-dumped fragment GRanges into memory
isDd <- sapply(fragL, is.character)
if (any(isDd)){
# multiple samples may point to the same file --> read each file only once
ddFns_all <- unlist(fragL[isDd])
sidsDd <- names(fragL)[isDd] # sample names of disk dumped fragments
# map each file name to the sample ids stored in it
fnToSampleL <- tapply(sidsDd, ddFns_all, c)
ddFns <- unique(ddFns_all)
# large GRanges list of all samples that have been disk-dumped
ddGrl <- lapply(ddFns, FUN=function(fn){
# logger.status(fn)
if (!file.exists(fn)) logger.error(c("Could not load fragment data from file:", fn))
rr <- readRDS(fn)
# normalize file contents to a plain, sample-named list of GRanges
if (is.element(class(rr), c("GRangesList", "CompressedGRangesList"))){
rr <- as.list(rr)
}
if (!is.list(rr)) {
# single-GRanges file: wrap in a list and name by the sample(s) mapped to this file
rr <- list(rr)
names(rr) <- fnToSampleL[[fn]]
}
return(rr)
})
# concatenate the per-file lists and slot the loaded GRanges back in by sample id
names(ddGrl) <- NULL
ddGrl <- do.call("c", ddGrl)
fragL[sidsDd] <- ddGrl[sidsDd]
}
if (asGRangesList){
fragL <- GenomicRanges::GRangesList(fragL) # can potentially take long and throw error: 'long vectors not supported yet: memory.c:3715'
}
return(fragL)
}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getFragmentNum")) {
	setGeneric(
		"getFragmentNum",
		function(.object, ...) standardGeneric("getFragmentNum"),
		signature=c(".object")
	)
}
#' getFragmentNum-methods
#'
#' Return the number of fragments in the \code{\linkS4class{DsATAC}} object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param sampleIds sample identifiers
#' @return a vector of fragment counts per sample
#'
#' @rdname getFragmentNum-DsATAC-method
#' @docType methods
#' @aliases getFragmentNum
#' @aliases getFragmentNum,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getFragmentNum",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		sampleIds=getSamples(.object)
	) {
		if (!all(sampleIds %in% getSamples(.object))) logger.error(c("Invalid sampleIds:", paste(setdiff(sampleIds, getSamples(.object)), collapse=", ")))
		# [FIX] only load fragment data for the requested samples; previously this
		# passed getSamples(.object), so the sampleIds argument was ignored and
		# counts for ALL samples were loaded and returned
		fgrl <- getFragmentGrl(.object, sampleIds, asGRangesList=FALSE)
		# type-stable alternative to sapply: one integer count per sample
		res <- vapply(fgrl, length, integer(1))
		return(res)
	}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getInsertionSites")) {
	setGeneric(
		"getInsertionSites",
		function(.object, ...) standardGeneric("getInsertionSites"),
		signature=c(".object")
	)
}
#' getInsertionSites-methods
#'
#' Return a list of insertion sites (Tn5 cut sites) for each sample
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param samples sample identifiers
#' @param asGRangesList should a \code{GRangesList} object be returned instead of a regular \code{list}
#' @return \code{list} or \code{GRangesList} containing Tn5 cut sites for each sample
#'
#' @rdname getInsertionSites-DsATAC-method
#' @docType methods
#' @aliases getInsertionSites
#' @aliases getInsertionSites,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getInsertionSites",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		samples=getSamples(.object),
		asGRangesList=FALSE
	) {
		if (!all(samples %in% getSamples(.object))) logger.error(c("Invalid samples:", paste(setdiff(samples, getSamples(.object)), collapse=", ")))
		if (!all(samples %in% names(.object@fragments))) logger.error(c("Object does not contain insertion information for samples:", paste(setdiff(samples, names(.object@fragments)), collapse=", ")))
		# derive per-sample Tn5 cut sites from the fragment coordinates
		fGrl <- getFragmentGrl(.object, samples, asGRangesList=FALSE)
		res <- lapply(fGrl, getInsertionSitesFromFragmentGr)
		names(res) <- samples
		if (asGRangesList){
			# [FIX] explicit namespace for consistency with the rest of the file
			# (see getFragmentGrl) and robustness when GenomicRanges is not attached
			res <- GenomicRanges::GRangesList(res)
		}
		return(res)
	}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("getCoverage")) {
	setGeneric(
		"getCoverage",
		function(.object, ...) standardGeneric("getCoverage"),
		signature=c(".object")
	)
}
#' getCoverage-methods
#'
#' Return a list of genome-wide coverage from insertion sites
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param samples sample identifiers
#' @return \code{list} of \code{Rle} objects of sample coverage tracks
#'
#' @rdname getCoverage-DsATAC-method
#' @docType methods
#' @aliases getCoverage
#' @aliases getCoverage,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getCoverage",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		samples=getSamples(.object)
	) {
		if (!all(samples %in% getSamples(.object))) logger.error(c("Invalid samples:", paste(setdiff(samples, getSamples(.object)), collapse=", ")))
		# per-sample Tn5 insertion sites (named list in the order of 'samples')
		insGrl <- getInsertionSites(.object, samples)
		# compute one genome-wide coverage track per sample
		covgL <- lapply(insGrl, FUN=function(gr){
			return(GenomicRanges::coverage(gr))
		})
		names(covgL) <- samples
		return(covgL)
	}
)
################################################################################
# Display
################################################################################
# Pretty-print a summary of a DsATAC object: storage mode, samples,
# fragment data availability and summarized region types.
setMethod("show","DsATAC",
	function(object) {
		ss <- getSamples(object)
		# abbreviate the sample listing after 5 entries
		str.ss <- paste(ss, collapse=", ")
		if (length(ss) > 5) str.ss <- paste(c(ss[1:5], "..."), collapse=", ")
		rts <- getRegionTypes(object)
		str.rts <- "no region types"
		if (length(rts) > 0) str.rts <- paste0(length(rts), " region types: ", paste(rts, collapse=", "))
		str.frags <- "no fragment data"
		if (length(object@fragments) > 0) str.frags <- paste0("fragment data for ", length(object@fragments), " samples")
		if (object@diskDump.fragments) str.frags <- paste0(str.frags, " [disk-backed]")
		str.disk <- "[in memory object]"
		if (object@diskDump) str.disk <- "[contains disk-backed data]"
		# [FIX] is() replaces class(object)=="DsATACsc": robust S4 class test that
		# also covers potential subclasses of DsATACsc
		if (is(object, "DsATACsc")){
			cat("Single-cell ")
		}
		cat("DsATAC chromatin accessibility dataset \n")
		cat(str.disk, "\n")
		cat("contains:\n")
		cat(" * ", length(ss), " samples: ", str.ss, " \n")
		cat(" * ", str.frags, " \n")
		cat(" * ", str.rts, " \n")
		if (length(rts) > 0) {
			for (rt in rts){
				cat(" * * ", getNRegions(object, rt), "regions of type", rt, " \n")
			}
		}
	}
)
################################################################################
# Summary functions
################################################################################
# #-------------------------------------------------------------------------------
# if (!isGeneric("getRegionMapping")) {
# setGeneric(
# "getRegionMapping",
# function(.object, ...) standardGeneric("getRegionMapping"),
# signature=c(".object")
# )
# }
# #' getRegionMapping-methods
# #'
# #' Retrieve a mapping from regions to the indices of counts in the dataset
# #'
# #' @param .object \code{\linkS4class{DsATAC}} object
# #' @param type character string specifying a name for the region type
# #' @return list containing vectors of indices of GCs for each region of the
# #' specified type
# #'
# #' @rdname getRegionMapping-DsATAC-method
# #' @docType methods
# #' @aliases getRegionMapping
# #' @aliases getRegionMapping,DsATAC-method
# #' @author Fabian Mueller
# #' @export
# setMethod("getRegionMapping",
# signature(
# .object="DsATAC"
# ),
# function(
# .object,
# type
# ) {
# baseGr <- getCoord(.object, type="counts")
# regGr <- getCoord(.object, type=type)
# oo <- findOverlaps(baseGr, regGr)
# sl <- tapply(GenomicRanges::queryHits(oo), GenomicRanges::subjectHits(oo), c)
# res <- rep(list(integer(0)), getNRegions(.object, type=type))
# res[as.integer(names(sl))] <- sl
# return(res)
# }
# )
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("regionAggregation")) {
	setGeneric(
		"regionAggregation",
		function(.object, ...) standardGeneric("regionAggregation"),
		signature=c(".object")
	)
}
#' regionAggregation-methods
#'
#' Aggregate signal counts across a set of regions
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regGr \code{GRanges} object containing regions to summarize
#' @param type character string specifying a name for the region type
#' @param signal character string specifying a name for the region type for the signal to be aggregated
#' If it is \code{NULL} (default), the new region type will be initialized with NA values.
#' If it is \code{"insertions"} count data will be initialized from insertion sites (if
#' fragment data is present in the object).
#' @param aggrFun aggregation function for signal counts. Will only be used if \code{signal!="insertions"}
#' Currently \code{sum}, \code{mean} and \code{median} (default) are supported.
#' @param dropEmpty discard all regions with no observed signal counts
#' @param bySample [only relevant if \code{signal=="insertions"}]. Process data sample-by-sample to save memory.
#' @param chunkSize [only relevant if \code{signal=="insertions" & !bySample}] number of samples to process per chunk (saves memory).
#' If \code{NULL} or larger than the number of samples, only one chunk will be processed.
#' @return a new \code{\linkS4class{DsATAC}} object with aggregated signal counts per regions
#'
#'
#' @rdname regionAggregation-DsATAC-method
#' @docType methods
## @aliases regionAggregation
#' @aliases regionAggregation,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("regionAggregation",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		regGr,
		type,
		signal=NULL,
		aggrFun="median",
		dropEmpty=TRUE,
		bySample=TRUE,
		chunkSize=5000L
	) {
		# argument validation
		if (!is.element(aggrFun, c("sum", "mean", "median"))){
			logger.error(c("Unknown signal count aggregation function:", aggrFun))
		}
		if (!is.null(signal)){
			if (!is.element(signal, c(getRegionTypes(.object), "insertions"))){
				logger.error(c("invalid signal region type", signal))
			}
			if (type==signal){
				logger.error(paste0("Region type ('", type, "') is not allowed to be the same as signal ('", signal, "')"))
			}
		}
		if (is.element(type, getRegionTypes(.object))){
			logger.warning(c("Overwriting aggregated region type:", type))
		}
		# pick a rowSums implementation matching the internal matrix representation
		rsFun <- rowSums
		# sparse matrices
		if (.object@sparseCounts){
			rsFun <- Matrix::rowSums
		}
		# DelayedArray
		if (.object@diskDump){
			rsFun <- BiocGenerics::rowSums
		}
		# sort the regions by chromosome, start, end, strand
		coordOrder <- order(as.integer(seqnames(regGr)), start(regGr), end(regGr), as.integer(strand(regGr)))
		regGr <- regGr[coordOrder]
		.object@coord[[type]] <- regGr
		# initialize empty count matrix for new region type
		sampleIds <- getSamples(.object)
		nSamples <- length(sampleIds)
		nRegs <- length(regGr)
		if (.object@sparseCounts){
			# all-zero sparse matrix (sparse representation has no NA initialization)
			.object@counts[[type]] <- Matrix::sparseMatrix(i=c(), j=c(), x=1, dims=c(nRegs,nSamples))
		} else {
			# dense matrices start out as NA (= unobserved)
			.object@counts[[type]] <- matrix(as.integer(NA), nrow=nRegs, ncol=nSamples)
			if (.object@diskDump) .object@counts[[type]] <- as(.object@counts[[type]], "HDF5Array")
		}
		colnames(.object@counts[[type]]) <- sampleIds
		.object@countTransform[[type]] <- character(0)
		# Aggregate signal from an existing region type's counts
		doAggr <- FALSE
		if (!is.null(signal) && !is.element(signal, c("insertions")) && is.element(aggrFun, c("sum", "mean", "median"))){
			doAggr <- TRUE
			signalGr <- getCoord(.object, signal)
			if (genome(regGr)[1]!=genome(signalGr)[1]){
				logger.warning(c("Potentially incompatible genome assemblies (object:", genome(signalGr)[1], ", regGr:", genome(regGr)[1], ")"))
			}
			# map signal regions to target regions and aggregate counts per target region
			oo <- findOverlaps(signalGr, regGr)
			if (any(duplicated(queryHits(oo)))) logger.info("Some signals map to multiple regions")
			dtC <- data.table(getCounts(.object, signal, i=queryHits(oo)), mergedIndex=subjectHits(oo))
			if (aggrFun=="sum") {
				rr <- dtC[, lapply(.SD, sum, na.rm=TRUE), by=.(mergedIndex)]
			} else if (aggrFun=="mean") {
				rr <- dtC[, lapply(.SD, mean, na.rm=TRUE), by=.(mergedIndex)]
			} else if (aggrFun=="median") {
				rr <- dtC[, lapply(.SD, median, na.rm=TRUE), by=.(mergedIndex)]
			} else {
				logger.error(c("Unknown signal count aggregation function:", aggrFun))
			}
			.object@counts[[type]][rr[["mergedIndex"]],] <- as.matrix(rr[,!"mergedIndex"])
			rm(dtC, rr); cleanMem() # clean-up
		}
		# aggregate directly from Tn5 insertion sites
		if (!is.null(signal) && signal=="insertions"){
			doAggr <- TRUE
			if (bySample){
				i <- 0
				nUnrealizedValuesAdded <- 0
				for (sid in sampleIds){
					i <- i + 1
					doMsg <- nSamples < 100 || (i %% 100 == 0)
					if (doMsg) logger.status(c("Aggregating counts for sample", sid, paste0("(", i, " of ", nSamples, ")"), "..."))
					.object@counts[[type]][,sid] <- as.matrix(countOverlaps(regGr, getInsertionSites(.object, samples=sid)[[1]], ignore.strand=TRUE))
					# avoid issues if the stack of operations to realize in the DelayedArray becomes too large
					# avoids `dim(x)' must work` issues (@seed C stack usage X is too close to the limit)
					nUnrealizedValuesAdded <- nUnrealizedValuesAdded + nRegs
					# [FIX] use scalar '&&' in the if-condition (was elementwise '&')
					if (is.element("DelayedMatrix", class(.object@counts[[type]]))
							&& nUnrealizedValuesAdded > 8e8){
						logger.status("Realizing disk-backed matrix")
						.object@counts[[type]] <- DelayedArray::realize(.object@counts[[type]])
						nUnrealizedValuesAdded <- 0
					}
				}
			} else {
				chunkIdxL <- list(1:nSamples)
				# [FIX] '&&' short-circuits so 'chunkSize < nSamples' is not evaluated when
				# chunkSize is NULL; the previous '&' produced a zero-length if-condition
				# and an error for the documented chunkSize=NULL case
				if (!is.null(chunkSize) && chunkSize < nSamples){
					chunkIdxL <- split(1:nSamples, as.integer((1:nSamples - 1) / chunkSize))
				}
				scm <- NULL
				for (k in seq_along(chunkIdxL)){
					logger.start(c("Processing chunk", k, "of", length(chunkIdxL)))
					nSamples_cur <- length(chunkIdxL[[k]])
					sampleIds_cur <- sampleIds[chunkIdxL[[k]]]
					logger.status("Retrieving joined insertion sites ...")
					fGrl <- getFragmentGrl(.object, sampleIds_cur, asGRangesList=FALSE)
					nFrags <- sapply(fGrl, length)
					fGr <- do.call("c", unname(fGrl))
					# tag each fragment with the index of the sample it came from
					elementMetadata(fGr)[,".sampleIdx"] <- rep(seq_along(nFrags), nFrags)
					igr <- getInsertionSitesFromFragmentGr(fGr)
					logger.status("computing overlaps with regions ...") #TODO: this step can take a lot of memory (reduce chunksize?)
					oo <- findOverlaps(regGr, igr, ignore.strand=TRUE)
					# convenient: when constructing sparse matrices, if (i,j) indices are repeated, the x-values are summed up
					scm_cur <- Matrix::sparseMatrix(
						i=queryHits(oo),
						j=elementMetadata(igr)[subjectHits(oo), ".sampleIdx"],
						x=rep(1, length(queryHits(oo))),
						dims=c(nRegs, nSamples_cur)
					)
					colnames(scm_cur) <- sampleIds_cur
					scm <- cbind(scm, scm_cur)
					rm(igr, oo, scm_cur); cleanMem()
					logger.completed()
				}
				logger.status("Adding data to object ...")
				if (.object@sparseCounts){
					.object@counts[[type]] <- scm
				} else {
					.object@counts[[type]][,] <- as.matrix(scm)
				}
			}
		}
		if (doAggr){
			logger.info(c("Aggregated signal counts across", nrow(.object@counts[[type]]), "regions"))
			# determine which regions carry any observed signal
			if (.object@sparseCounts) {
				hasNas <- anyNA(.object@counts[[type]]) # this should not be TRUE
				if (prod(dim(.object@counts[[type]])) < .Machine$integer.max && hasNas) {
					hasValM <- !is.na(.object@counts[[type]]) # TODO: this can take a lot of memory. Use a more efficient method to detect NAs in sparse matrices
					hasValM <- hasValM & .object@counts[[type]] != 0
				} else {
					if (hasNas) logger.warning("Sparse matrix with NAs detected --> assuming NAs are 0")
					# avoid this error: "Cholmod error 'problem too large'"
					hasValM <- .object@counts[[type]] != 0
				}
			} else {
				hasValM <- !is.na(.object@counts[[type]]) & .object@counts[[type]] > 0
			}
			rows2keep <- rsFun(hasValM) > 0
			logger.info(c(" of which", sum(rows2keep), "regions contained signal counts"))
			# discard regions where all signal counts are unobserved
			if (dropEmpty){
				.object@coord[[type]] <- .object@coord[[type]][rows2keep]
				.object@counts[[type]] <- .object@counts[[type]][rows2keep,]
			}
		}
		# remove explicit zeros from the sparse representation
		if (.object@sparseCounts) .object@counts[[type]] <- drop0(.object@counts[[type]])
		return(.object)
	}
)
#-------------------------------------------------------------------------------
# define the generic (dispatching on '.object') unless it already exists
if (!isGeneric("mergeSamples")) {
	setGeneric(
		"mergeSamples",
		function(.object, ...) standardGeneric("mergeSamples"),
		signature=c(".object")
	)
}
#' mergeSamples-methods
#'
#' Merge signal and insertion data across samples
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param mergeGroups factor or character vector or column name in sample annotation table.
#' Can alternatively be a (named) list containing sample indices or names
#' for each group to merge.
#' @param countAggrFun aggregation function for signal counts.
#' Currently \code{sum} (default), \code{mean} and \code{median} are supported.
#' @return a new \code{\linkS4class{DsATAC}} object with samples merged
#'
#'
#' @rdname mergeSamples-DsATAC-method
#' @docType methods
#' @aliases mergeSamples
#' @aliases mergeSamples,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("mergeSamples",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		mergeGroups,
		countAggrFun="sum"
	) {
		# keep a reference to the unmodified object for fragment retrieval below
		inputObj <- .object
		if (!is.element(countAggrFun, c("sum", "mean", "median"))){
			logger.error(c("Unknown signal count aggregation function:", countAggrFun))
		}
		if (.object@sparseCounts && is.element(countAggrFun, c("mean", "median"))){
			logger.warning("Mean and median merging of samples can be slow due to conversion of sparse matrices")
		}
		sampleNames <- getSamples(.object)
		nSamples <- length(sampleNames)
		ph <- getSampleAnnot(.object)
		rownames(ph) <- sampleNames # not sure if really necessary
		# a single annotation column name --> use that column as grouping vector
		if (is.character(mergeGroups) && length(mergeGroups) == 1 && is.element(mergeGroups, colnames(ph))){
			mergeGroups <- ph[,mergeGroups]
		}
		# index list supplied
		if (is.list(mergeGroups)){
			if (is.null(names(mergeGroups))) names(mergeGroups) <- paste0("g", seq_along(mergeGroups))
			# convert to integer indices
			mergeGroups <- lapply(mergeGroups, FUN=function(x){
				if (is.character(x)){
					return(match(x, sampleNames))
				} else if (is.logical(x)){
					return(which(x))
				} else {
					return(x)
				}
			})
			# aggregate the sample annotation table per merge group
			ph_cat <- do.call(rbind, lapply(names(mergeGroups), FUN=function(gn){
				data.frame(ph[mergeGroups[[gn]],,drop=FALSE], .mergeGroup=gn, stringsAsFactors=FALSE)
			}))
			phm <- muRtools::aggregateDf(ph_cat, ph_cat[,".mergeGroup"])
			phm <- phm[names(mergeGroups), ]
			mgL <- mergeGroups
		} else {
			# grouping vector supplied (factor or character, one entry per sample)
			if ((!is.factor(mergeGroups) && !is.character(mergeGroups)) || length(mergeGroups) != nrow(ph)){
				logger.error("Invalid merge groups")
			}
			if (is.factor(mergeGroups)) mergeGroups <- as.character(mergeGroups)
			phm <- muRtools::aggregateDf(ph, mergeGroups)
			mgL <- lapply(rownames(phm), FUN=function(mg){which(mergeGroups==mg)})
			names(mgL) <- rownames(phm)
		}
		.object@sampleAnnot <- phm
		# count data: pick a row-aggregation function matching the matrix representation
		mergeFun <- NULL
		if (countAggrFun=="sum"){
			mergeFun <- function(X){rowSums(X, na.rm=TRUE)}
			if (.object@diskDump) {
				mergeFun <- function(X){BiocGenerics::rowSums(X, na.rm=TRUE)}
			} else if (.object@sparseCounts) {
				mergeFun <- function(X){Matrix::rowSums(X, na.rm=TRUE)}
			}
		} else if (countAggrFun=="mean"){
			mergeFun <- function(X){rowMeans(X, na.rm=TRUE)}
		} else if (countAggrFun=="median"){
			mergeFun <- function(X){matrixStats::rowMedians(X, na.rm=TRUE)}
		}
		regTypes <- getRegionTypes(.object)
		for (rt in regTypes){
			logger.status(paste0("Merging samples (region set: '", rt, "')..."))
			cm <- ChrAccR::getCounts(.object, rt, asMatrix=FALSE) #design question: should we set the parameter naIsZero==FALSE?
			cmm <- do.call("cbind", lapply(mgL, FUN=function(iis){
				if (.object@sparseCounts && is.element(countAggrFun, c("mean", "median"))){
					# mean/median are not defined on sparse matrices --> densify just this group
					aggMat <- ChrAccR::getCounts(.object, rt, j=iis, asMatrix=TRUE) #design question: should we set the parameter naIsZero==FALSE?
				} else {
					aggMat <- cm[,iis,drop=FALSE]
				}
				return(mergeFun(aggMat))
			}))
			if (.object@sparseCounts) {
				.object@counts[[rt]] <- as(cmm, "sparseMatrix")
				.object@counts[[rt]] <- drop0(.object@counts[[rt]])
			} else {
				.object@counts[[rt]] <- cmm
			}
			if (.object@diskDump) .object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
		}
		# insertion data: concatenate GRanges objects
		# (only when fragment data is available for every sample)
		if (length(.object@fragments) == nSamples){
			logger.start(paste0("Merging sample fragment data"))
			# [FIX] seq_along(mgL) replaces 1:length(mgL) (safe for empty input)
			.object@fragments <- lapply(seq_along(mgL), FUN=function(i){
				iis <- mgL[[i]]
				logger.status(c("Merged sample", i, "of", length(mgL)))
				curSns <- sampleNames[iis]
				curFragL <- getFragmentGrl(inputObj, curSns, asGRangesList=FALSE)
				nFrags <- sapply(curFragL, length)
				catRes <- do.call("c", unname(curFragL))
				# record the originating sample for each fragment
				elementMetadata(catRes)[,".sample"] <- rep(curSns, nFrags)
				if (.object@diskDump.fragments) {
					# logger.status(c("... saving to disk"))
					fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext=".rds")
					saveRDS(catRes, fn, compress=TRUE)
					catRes <- fn
				}
				return(catRes)
			})
			names(.object@fragments) <- names(mgL)
			if (.object@diskDump.fragments){
				# currently merging samples does not support multiple samples per fragment file
				.object@diskDump.fragments.nSamplesPerFile <- 1L
				# TODO: support multiple samples per merged fragment file
			}
			logger.completed()
		}
		return(.object)
	}
)
#-------------------------------------------------------------------------------
# IN DEVELOPMENT
# Register the 'join' S4 generic unless one has already been defined elsewhere
if (!isGeneric("join")) {
	setGeneric("join", function(.object, ...) standardGeneric("join"), signature=c(".object"))
}
#' join-methods
#'
#' Combine two \code{\linkS4class{DsATAC}} objects
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param objectB \code{\linkS4class{DsATAC}} object
#' @param joinRegionTypes how to combine region types:
#' 'union' (default): the resulting object will have counts aggregated over region types
#' from both objects. 'intersect': only region types present in both objects will occur
#' in the output
#' @return a new \code{\linkS4class{DsATAC}} object combining both input objects.
#' It contains untransformed counts.
#'
#'
#' @rdname join-DsATAC-method
#' @docType methods
#' @aliases join
#' @aliases join,DsATAC-method
#' @author Fabian Mueller
#' @export
## @examples
## \dontrun{
## dsaA <- ChrAccRex::loadExample("dsAtac_ia_example")
## dsaB <- ChrAccRex::loadExample("dsAtac_ia_example")
## rownames(dsaA@sampleAnnot) <- paste0(rownames(dsaB@sampleAnnot), "_A")
## names(dsaA@fragments) <- rownames(dsaA@sampleAnnot)
## rownames(dsaB@sampleAnnot) <- paste0(rownames(dsaB@sampleAnnot), "_B")
## names(dsaB@fragments) <- rownames(dsaB@sampleAnnot)
## dsaA@sampleAnnot[,"replicate"] <- NULL
## dsaB@sampleAnnot[,"dataset"] <- NULL
## dsaA <- removeRegionType(dsaA, "t200")
## dsaA <- removeRegionType(dsaA, "promoters_gc_protein_coding")
## # dsaA <- removeRegions(dsaA, sample(1:getNRegions(dsaA, "t10k"),1000), "t10k")
## dsaB <- removeRegionType(dsaB, "t200")
## dsaB <- removeRegionType(dsaB, "IA_prog_peaks")
## dsaJ <- join(dsaA, dsaB)
## # check the results
## dsaJ
## getSampleAnnot(dsaJ)
## length(dsaJ@fragments)
## rt <- "promoters_gc_protein_coding"
## getNRegions(dsaJ, rt)
## dim(getCounts(dsaJ, rt))
## dim(dsaJ@counts[[rt]])
## class(dsaJ@counts[[rt]])
## }
setMethod("join",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		objectB,
		joinRegionTypes="union"
	) {
		logger.warning("IN DEVELOPMENT: The join function for DsATAC objects has not extensively been tested yet")
		objectA <- .object
		# compatibility checks: both objects must share the same genome and storage settings
		if (getGenome(objectA) != getGenome(objectB)) logger.error("Could not combine objects: non-matching genomes")
		if ((objectA@diskDump != objectB@diskDump) || (objectA@diskDump.fragments != objectB@diskDump.fragments)){
			logger.error("Could not combine objects: non-matching disk-dump settings")
		}
		if (objectA@sparseCounts != objectB@sparseCounts){
			logger.error("Could not combine objects: non-matching sparseCounts settings")
		}
		logger.start("Combining metadata")
		samplesA <- getSamples(objectA)
		samplesB <- getSamples(objectB)
		if (length(intersect(samplesA, samplesB))>0) logger.error("Objects A and B must have distinct samples")
		phA <- getSampleAnnot(objectA)
		phB <- getSampleAnnot(objectB)
		# full join of the sample annotation tables (column sets are unified; missing entries become NA)
		ph <- plyr::join(phA, phB, type="full", by=character(0))
		if (nrow(ph)!=length(samplesA)+length(samplesB)) logger.error("Something went wrong in table join")
		rownames(ph) <- c(samplesA, samplesB)
		hasFragmentsA <- length(objectA@fragments) > 0
		hasFragmentsB <- length(objectB@fragments) > 0
		logger.completed()
		obj <- DsATAC(ph, getGenome(objectA), diskDump=objectA@diskDump, diskDump.fragments=objectA@diskDump.fragments, sparseCounts=objectA@sparseCounts)
		# fragment data can only be carried over if both objects provide it
		if (hasFragmentsA && hasFragmentsB){
			logger.start("Combining fragment data")
			obj@fragments <- c(objectA@fragments, objectB@fragments)
			names(obj@fragments) <- getSamples(obj)
			logger.completed()
		}
		# helper: retrieve the count matrix of 'srcObj' for region type 'rt' with respect to
		# the combined region set 'gr'. If the object's own regions ('grObj') do not match
		# 'gr' exactly and in order, or its counts have been transformed, the counts are
		# re-aggregated from fragment data instead
		getJoinCountMatrix <- function(srcObj, grObj, gr, rt, hasFragments){
			doAggr <- FALSE
			if (is.null(grObj) || length(grObj) != length(gr)){
				if (!hasFragments) logger.error("Combining on non-matching region types requires fragment data")
				doAggr <- TRUE
			} else {
				oo <- findOverlaps(grObj, gr, type="equal")
				# [bug fix] require a one-to-one, in-order match of the regions.
				# The explicit length check guards against empty/partial overlaps, for which
				# all() on a zero-length comparison would incorrectly return TRUE (and a
				# partial overlap would trigger vector recycling warnings)
				if (length(oo) != length(gr) || !all(subjectHits(oo) == seq_along(gr))) doAggr <- TRUE
				# only combine untransformed/unnormalized counts
				if (length(srcObj@countTransform[[rt]]) > 0) doAggr <- TRUE
			}
			if (doAggr){
				aggObj <- regionAggregation(srcObj, gr, ".combineRegs", signal="insertions", dropEmpty=FALSE)
				return(aggObj@counts[[".combineRegs"]])
			}
			return(srcObj@counts[[rt]])
		}
		logger.start("Combining regions types and counts")
		regionTypes <- union(getRegionTypes(objectA), getRegionTypes(objectB))
		if (joinRegionTypes == "intersect"){
			regionTypes <- intersect(getRegionTypes(objectA), getRegionTypes(objectB))
		}
		# the joined object contains untransformed counts only
		obj@countTransform <- rep(list(character(0)), length(regionTypes))
		names(obj@countTransform) <- regionTypes
		for (rt in regionTypes){
			logger.status(c("Region type:", rt))
			# combined coordinates: sorted union of the region sets of both objects
			grA <- NULL
			grB <- NULL
			gr <- NULL
			if (is.element(rt, getRegionTypes(objectA))){
				grA <- getCoord(objectA, type=rt)
				gr <- grA
			}
			if (is.element(rt, getRegionTypes(objectB))){
				grB <- getCoord(objectB, type=rt)
				if (!is.null(gr)){
					gr <- c(gr, grB)
				} else {
					gr <- grB
				}
			}
			gr <- muRtools::sortGr(unique(gr))
			obj@coord[[rt]] <- gr
			# per-object count matrices with respect to the combined region set
			cmA <- getJoinCountMatrix(objectA, grA, gr, rt, hasFragmentsA)
			cmB <- getJoinCountMatrix(objectB, grB, gr, rt, hasFragmentsB)
			# combine
			obj@counts[[rt]] <- cbind(cmA, cmB) # TODO: check: does cbind work for sparse and HDF5Array matrices?
		}
		logger.completed()
		return(obj)
	}
)
#-------------------------------------------------------------------------------
# Register the 'removeFragmentData' S4 generic unless one has already been defined elsewhere
if (!isGeneric("removeFragmentData")) {
	setGeneric("removeFragmentData", function(object) standardGeneric("removeFragmentData"), signature=c("object"))
}
#' removeFragmentData-methods
#'
#' Removes fragment data from \code{\linkS4class{DsATAC}} object (e.g. to save space)
#'
#' @param object \code{\linkS4class{DsATAC}} object
#' @return the modified object (without fragment data)
#'
#' @rdname removeFragmentData-DsATAC-method
#' @docType methods
#' @aliases removeFragmentData
#' @aliases removeFragmentData,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("removeFragmentData",
	signature(
		object="DsATAC"
	),
	function(object) {
		# discard all per-sample fragment data; count matrices and region
		# coordinates remain untouched
		object@fragments <- list()
		object
	}
)
#-------------------------------------------------------------------------------
# EXPERIMENTAL
# Register the 'undiskFragmentData' S4 generic unless one has already been defined elsewhere
if (!isGeneric("undiskFragmentData")) {
	setGeneric("undiskFragmentData", function(object) standardGeneric("undiskFragmentData"), signature=c("object"))
}
#' undiskFragmentData-methods
#'
#' converts disk-backed fragment data to in-memory fragment data
#'
#' @param object \code{\linkS4class{DsATAC}} object
#' @return the modified object
#'
#' @rdname undiskFragmentData-DsATAC-method
#' @docType methods
#' @aliases undiskFragmentData
#' @aliases undiskFragmentData,DsATAC-method
#' @author Fabian Mueller
#' @noRd
setMethod("undiskFragmentData",
	signature(
		object="DsATAC"
	),
	function(object) {
		# load the fragment data for all samples from disk into memory and
		# mark the object as no longer disk-backed for fragments
		object@fragments <- getFragmentGrl(object, names(object@fragments))
		object@diskDump.fragments <- FALSE
		object
	}
)
#-------------------------------------------------------------------------------
# Register the 'addCountDataFromBam' S4 generic unless one has already been defined elsewhere
if (!isGeneric("addCountDataFromBam")) {
	setGeneric("addCountDataFromBam", function(.object, ...) standardGeneric("addCountDataFromBam"), signature=c(".object"))
}
#' addCountDataFromBam-methods
#'
#' Add count data to DsATAC object based on bam files
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param fns a named vector of bam file locations. The names must correspond to sample identifiers in the object
#' @return a new \code{\linkS4class{DsATAC}} object with read counts for each sample and region set
#'
#' @rdname addCountDataFromBam-DsATAC-method
#' @docType methods
#' @aliases addCountDataFromBam
#' @aliases addCountDataFromBam,DsATAC-method
#' @author Fabian Mueller
#' @noRd
setMethod("addCountDataFromBam",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		fns
	) {
		if (!requireNamespace("GenomicAlignments")) logger.error(c("Could not load dependency: GenomicAlignments"))
		# TODO: adjust for Tn5 insertion:
		# + strand: i + 4
		# - strand: i - 5
		# Buenrostro, et al. (2013). Nature Methods, 10(12), 1213-1218.
		# "For peak calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adaptors separated by 9 bp (ref. 11). Therefore, all reads aligning to the + strand were offset by +4 bp, and all reads aligning to the - strand were offset -5bp"
		# preprocessing helper: reduce each read to its (strand-aware) start coordinate
		resizeToStart <- function(reads, width=1, fix="start", ...) {
			reads <- as(reads, "GRanges")
			stopifnot(all(strand(reads) != "*"))
			resize(reads, width=width, fix=fix, ...)
		}
		# input validation: file vector must be named with known sample ids
		sampleIds <- names(fns)
		if (length(sampleIds) != length(fns)){
			logger.error("The vector specifying filenames (fns) must be named")
		}
		unknownIds <- setdiff(sampleIds, getSamples(.object))
		if (length(unknownIds) > 0){
			logger.error(c("DsATAC dataset does not contain samples:", paste(unknownIds, collapse=", ")))
		}
		# count read starts overlapping each region, for every region type in the object
		for (rt in getRegionTypes(.object)){
			logger.status(c("Counting reads in region set:", rt))
			regGr <- getCoord(.object, rt)
			ovSe <- summarizeOverlaps(regGr, fns, mode="Union", ignore.strand=TRUE, inter.feature=FALSE, preprocess.reads=resizeToStart)
			.object@counts[[rt]][,sampleIds] <- as.matrix(assays(ovSe)$count)
			if (.object@sparseCounts) .object@counts[[rt]] <- drop0(.object@counts[[rt]])
			# freshly assigned raw counts are untransformed
			.object@countTransform[[rt]] <- character(0)
		}
		return(.object)
	}
)
#-------------------------------------------------------------------------------
# Register the 'addCountDataFromGRL' S4 generic unless one has already been defined elsewhere
if (!isGeneric("addCountDataFromGRL")) {
	setGeneric("addCountDataFromGRL", function(.object, ...) standardGeneric("addCountDataFromGRL"), signature=c(".object"))
}
#' addCountDataFromGRL-methods
#'
#' Add count data to DsATAC object based on a list of GRanges objects
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param grl NAMED \code{GRangesList} or NAMED list of \code{GRanges} objects. Names must correspond to sample ids in the object
#' @param silent limit log messages
#' @return a new \code{\linkS4class{DsATAC}} object with read counts for each sample and region set
#'
#' @rdname addCountDataFromGRL-DsATAC-method
#' @docType methods
#' @aliases addCountDataFromGRL
#' @aliases addCountDataFromGRL,DsATAC-method
#' @author Fabian Mueller
#' @noRd
setMethod("addCountDataFromGRL",
signature(
.object="DsATAC"
),
function(
.object,
grl,
silent=FALSE
) {
# input validation: grl must be named and every name must be a known sample id
sids <- names(grl)
if (length(sids)!=length(grl)){
logger.error("The list of GRanges must be named")
}
if (!all(sids %in% getSamples(.object))){
logger.error(c("DsATAC dataset does not contain samples:", paste(setdiff(sids, getSamples(.object)), collapse=", ")))
}
# concatenate the per-sample GRanges into a single object (gr.c) and record
# how many elements each sample contributed (nElem)
# NOTE(review): modern GRangesList objects often carry the class
# 'CompressedGRangesList', in which case the else branch is taken; both
# branches produce equivalent results
if (class(grl)=="GRangesList") {
nElem <- elementNROWS(grl)
gr.c <- unlist(grl, use.names=FALSE)
} else {
nElem <- sapply(grl, length)
gr.c <- do.call("c", unname(grl))
}
if (length(gr.c) < 1) logger.error("[addCountDataFromGRL] invalid GRL: Must be of length 1 or more")
# sample id corresponding to each element of the concatenated GRanges
sampleIds <- rep(names(grl), times=nElem)
sampleIds.cm <- getSamples(.object)
rts <- getRegionTypes(.object)
for (rt in rts){
if (!silent) logger.status(c("Counting reads in region set:", rt))
gr.ds <- getCoord(.object, rt)
# overlap the region coordinates (query) with all insertion elements (subject)
oo <- findOverlaps(gr.ds, gr.c, ignore.strand=TRUE)
# build a (region index, sample index) pair table for every overlap hit
idxDt <- data.table::as.data.table(cbind(
queryHits(oo), #row indices (regions) in count matrix
match(sampleIds[subjectHits(oo)], sampleIds.cm) #column indices (samples) in count matrix
))
# count the number of occurrences between each index pair
idxDt <- idxDt[,.N, by=names(idxDt)]
# if you in the future encounter some weird error here (e.g. 'Error in `[.data.frame`(x, i, j) : undefined columns selected')
# this is likely due to some weird namespace issue. --> keep data.table in the 'depends' section of DESCRIPTION
# index matrix for matrix-style assignment of the per-pair counts
idxM <- as.matrix(idxDt[,c(1,2)])
if (.object@diskDump){
# DelayedArray does not support indexing via matrix
# --> workaround
# assign counts column by column: group index-matrix rows by their column (sample) index
perColumnIdxIdx <- tapply(1:nrow(idxM),idxM[,2],c) #index in the index matrix for each column
for (cis in names(perColumnIdxIdx)){
ci <- as.integer(cis)
ii <- perColumnIdxIdx[[cis]]
.object@counts[[rt]][idxM[ii,1],ci] <- as.matrix(idxDt$N[ii])
}
} else {
# direct matrix-index assignment of the counts for each (region, sample) pair
.object@counts[[rt]][idxM] <- idxDt$N
}
# remove explicit zero entries to keep sparse matrices truly sparse
if (.object@sparseCounts) .object@counts[[rt]] <- drop0(.object@counts[[rt]])
}
return(.object)
}
)
# # Old, slower method f(for reference)
# setMethod("addCountDataFromGRL",
# signature(
# .object="DsATAC"
# ),
# function(
# .object,
# grl
# ) {
# sids <- names(grl)
# if (length(sids)!=length(grl)){
# logger.error("The list of GRanges must be named")
# }
# if (!all(sids %in% getSamples(.object))){
# logger.error(c("DsATAC dataset does not contain samples:", paste(setdiff(sids, getSamples(.object)), collapse=", ")))
# }
# rts <- getRegionTypes(.object)
# for (rt in rts){
# logger.status(c("Counting reads in region set:", rt))
# gr.ds <- getCoord(.object, rt)
# for (sid in sids){
# # logger.status(c("sample:", sid))
# gr.c <- grl[[sid]]
# .object@counts[[rt]][,sid] <- as.matrix(countOverlaps(gr.ds, gr.c, ignore.strand=TRUE))
# }
# if (.object@sparseCounts) .object@counts[[rt]] <- drop0(.object@counts[[rt]])
# }
# return(.object)
# }
# )
#-------------------------------------------------------------------------------
# Register the 'addSignalDataFromGRL' S4 generic unless one has already been defined elsewhere
if (!isGeneric("addSignalDataFromGRL")) {
	setGeneric("addSignalDataFromGRL", function(.object, ...) standardGeneric("addSignalDataFromGRL"), signature=c(".object"))
}
#' addSignalDataFromGRL-methods
#'
#' Add signal data to DsATAC object based on a list of GRanges objects
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param grl NAMED \code{GRangesList} or NAMED list of \code{GRanges} objects. Names must correspond to sample ids in the object
#' @param aggrFun aggregation method
#' @return a new \code{\linkS4class{DsATAC}} object with counts/signal for each sample and region set
#'
#' @rdname addSignalDataFromGRL-DsATAC-method
#' @docType methods
#' @aliases addSignalDataFromGRL
#' @aliases addSignalDataFromGRL,DsATAC-method
#' @author Fabian Mueller
#' @noRd
setMethod("addSignalDataFromGRL",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		grl,
		aggrFun=function(x){mean(x, na.rm=TRUE)}
	) {
		# input validation: grl must be named and every name must be a known sample id
		sids <- names(grl)
		if (length(sids)!=length(grl)){
			logger.error("The list of GRanges must be named")
		}
		if (!all(sids %in% getSamples(.object))){
			logger.error(c("DsATAC dataset does not contain samples:", paste(setdiff(sids, getSamples(.object)), collapse=", ")))
		}
		rts <- getRegionTypes(.object)
		for (rt in rts){
			logger.status(c("Aggregating scores in region set:", rt))
			gr.ds <- getCoord(.object, rt)
			for (sid in sids){
				gr.c <- grl[[sid]]
				# [bug fix] explicitly check for the presence of the 'score' metadata column:
				# subsetting a missing column from the metadata DataFrame throws a low-level
				# subscript error, so the previous is.null() check on the result could never fire
				if (!is.element("score", colnames(elementMetadata(gr.c)))) {
					logger.error(c("GRanges must contain a score column (sample", sid, ")"))
				}
				scs <- elementMetadata(gr.c)[,"score"]
				# TODO: check for correctness
				# aggregate the scores of all elements overlapping each region via aggrFun;
				# tapply orders its result by region index, matching sort(unique(queryHits(oo)))
				oo <- findOverlaps(gr.ds, gr.c, ignore.strand=TRUE)
				.object@counts[[rt]][sort(unique(queryHits(oo))), sid] <- as.matrix(tapply(scs[subjectHits(oo)], queryHits(oo), aggrFun))
			}
			# remove explicit zero entries to keep sparse matrices truly sparse
			if (.object@sparseCounts) .object@counts[[rt]] <- drop0(.object@counts[[rt]])
		}
		return(.object)
	}
)
#-------------------------------------------------------------------------------
# Register the 'addInsertionDataFromBam' S4 generic unless one has already been defined elsewhere
if (!isGeneric("addInsertionDataFromBam")) {
	setGeneric("addInsertionDataFromBam", function(.object, ...) standardGeneric("addInsertionDataFromBam"), signature=c(".object"))
}
#' addInsertionDataFromBam-methods
#'
#' Add insertion data to DsATAC object based on bam or fragment bed files
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param fns a named vector of bam or bed file locations. The names must correspond to sample identifiers in the object
#' @param pairedEnd flag indicating whether the bam files are from paired-end sequencing
#' @param .diskDump for internal use only (should fragment data be stored as RDS instead of GRanges)
#' @param mode either \code{"bam"} (default) for reading bam files or \code{"fragBed"} for fragment bed files
#' @return a new \code{\linkS4class{DsATAC}} object with fragments/insertions for each sample
#'
#' @rdname addInsertionDataFromBam-DsATAC-method
#' @docType methods
#' @aliases addInsertionDataFromBam
#' @aliases addInsertionDataFromBam,DsATAC-method
#' @author Fabian Mueller
#' @noRd
setMethod("addInsertionDataFromBam",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		fns,
		pairedEnd=TRUE,
		.diskDump=.object@diskDump.fragments,
		mode = "bam"
	) {
		# whether fragment coordinates should be corrected for the 9bp Tn5 dimer offset
		offsetTn <- TRUE
		# input validation: file vector must be named with known sample ids
		sids <- names(fns)
		if (length(sids)!=length(fns)){
			logger.error("The vector specifying filenames (fns) must be named")
		}
		if (!all(sids %in% getSamples(.object))){
			logger.error(c("DsATAC dataset does not contain samples:", paste(setdiff(sids, getSamples(.object)), collapse=", ")))
		}
		# only relevant if fragments from multiple samples are written to the same file (chunkedFragmentFiles)
		chunkedFragmentFiles <- .object@diskDump.fragments && .hasSlot(.object, "diskDump.fragments.nSamplesPerFile") && .object@diskDump.fragments.nSamplesPerFile > 1
		curChunkL <- list() # accumulates per-sample fragment GRanges until the current chunk is full
		curChunkFn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
		for (sid in sids){
			logger.start(c("Reading insertion data for sample:", sid))
			if (!is.null(.object@fragments[[sid]])){
				logger.warning(c("Overwriting insertion data for sample:", sid))
			}
			fragGr <- NULL
			if (mode=="bam"){
				ga <- NULL
				if (pairedEnd){
					# only properly paired reads are used for fragment reconstruction
					ga <- GenomicAlignments::readGAlignmentPairs(fns[sid], use.names=FALSE, param=Rsamtools::ScanBamParam(flag=Rsamtools::scanBamFlag(isProperPair=TRUE)))
				} else {
					ga <- GenomicAlignments::readGAlignments(fns[sid], use.names=FALSE)
				}
				ga <- setGenomeProps(ga, .object@genome, onlyMainChrs=TRUE, silent=TRUE)
				# [consistency fix] pass the offsetTn flag instead of a hard-coded TRUE,
				# so both input modes honor the same setting
				fragGr <- getATACfragments(ga, offsetTn=offsetTn)
			} else if (mode=="fragBed"){
				fragGr <- rtracklayer::import(fns[sid], format = "BED")
				fragGr <- setGenomeProps(fragGr, .object@genome, onlyMainChrs=TRUE, silent=TRUE)
				if (offsetTn){
					# shift inserts inward due to the Tn5 dimer offset:
					# --> +4 bp
					# -------------------------
					# -------------------------
					# -4 bp <--
					fragGr <- GenomicRanges::resize(fragGr, width(fragGr)-4, fix="end", ignore.strand=TRUE)
					fragGr <- GenomicRanges::resize(fragGr, width(fragGr)-4, fix="start", ignore.strand=TRUE)
				}
			}
			logger.info(c(length(fragGr), "fragments found"))
			if (.diskDump){
				if (chunkedFragmentFiles){
					# add the sample to the current chunk; its slot entry will point to the chunk file
					curChunkL[[sid]] <- fragGr
					fragGr <- curChunkFn
					if (length(curChunkL) >= .object@diskDump.fragments.nSamplesPerFile){
						saveRDS(curChunkL, curChunkFn, compress=TRUE)
						# reset after writing chunk
						curChunkL <- list()
						curChunkFn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
					}
				} else {
					# one RDS file per sample
					fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
					saveRDS(fragGr, fn, compress=TRUE)
					fragGr <- fn
				}
			}
			# the slot stores either a GRanges object or the path of the RDS file containing it
			.object@fragments[[sid]] <- fragGr
			logger.completed()
		}
		# flush the last, potentially incomplete chunk
		if (.diskDump && length(curChunkL) > 0){
			saveRDS(curChunkL, curChunkFn, compress=TRUE)
		}
		return(.object)
	}
)
################################################################################
# Manipulating DsATAC objects
################################################################################
# Register the 'removeRegions' S4 generic unless one has already been defined elsewhere
if (!isGeneric("removeRegions")) {
	setGeneric("removeRegions", function(.object, ...) standardGeneric("removeRegions"), signature=c(".object"))
}
#' removeRegions-methods
#'
#' Remove the specified sites or regions from an object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param indices a vector of indices of sites/regions to be removed. Can be numeric, integer or logical.
#' @param type character string specifying a name for the region type (default: sites)
## @param reaggregate redo region aggregation (only has an effect if type is sites and there are aggregated regions in the dataset)
#' @return a new \code{\linkS4class{DsATAC}} object with sites/regions removed
#'
#' @rdname removeRegions-DsATAC-method
#' @docType methods
## @aliases removeRegions
#' @aliases removeRegions,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("removeRegions",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		indices,
		type
		# reaggregate=TRUE
	) {
		# validate region type and index vector
		if (!is.element(type, getRegionTypes(.object))) logger.error(c("Unsupported region type:", type))
		if (!is.vector(indices) || !(is.numeric(indices) || is.logical(indices))){
			logger.error(c("Unsupported type for index vector"))
		}
		nRegs <- getNRegions(.object, type)
		# inds2keep marks the regions that remain in the object
		inds2keep <- rep(TRUE, nRegs)
		if (is.numeric(indices)){
			if (any(indices > nRegs | indices < 1)) {
				logger.error(c("Invalid values in indices"))
			}
			inds2keep[indices] <- FALSE
		} else if (is.logical(indices)){
			# [robustness fix] a logical index vector must cover all regions;
			# previously a wrong-length vector was accepted and silently recycled downstream
			if (length(indices) != nRegs) {
				logger.error(c("Invalid length of logical index vector"))
			}
			inds2keep <- !indices
		}
		if (sum(inds2keep)>=nRegs){
			logger.info("Nothing to be done: keeping object as is")
			return(.object)
		}
		# subset coordinates and the count matrix to the retained regions
		.object@coord[[type]] <- .object@coord[[type]][inds2keep]
		# .object@counts[[type]] <- .object@counts[[type]][inds2keep,]
		.object@counts[[type]] <- ChrAccR::getCounts(.object, type, i=inds2keep, j=NULL, asMatrix=FALSE, naIsZero=FALSE)
		return(.object)
	}
)
#-------------------------------------------------------------------------------
# Register the 'removeRegionType' S4 generic unless one has already been defined elsewhere
if (!isGeneric("removeRegionType")) {
	setGeneric("removeRegionType", function(.object, ...) standardGeneric("removeRegionType"), signature=c(".object"))
}
#' removeRegionType-methods
#'
#' Remove the specified region type from an object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param type character string specifying a name for the region type (default: sites)
#' @return a new \code{\linkS4class{DsATAC}} object with the region type removed
#'
#' @rdname removeRegionType-DsATAC-method
#' @docType methods
#' @aliases removeRegionType
#' @aliases removeRegionType,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("removeRegionType",
	signature(
		.object="DsATAC"
	),
	function(.object, type) {
		# drop coordinates, counts and transformation records of the given region type;
		# all other region types are retained
		regTypes <- getRegionTypes(.object)
		if (!is.element(type, regTypes)) logger.error(c("Unsupported region type:", type))
		keepTypes <- setdiff(regTypes, type)
		.object@coord <- .object@coord[keepTypes]
		.object@counts <- .object@counts[keepTypes]
		.object@countTransform <- .object@countTransform[keepTypes]
		.object
	}
)
#-------------------------------------------------------------------------------
# Register the 'removeRegionData' S4 generic unless one has already been defined elsewhere
if (!isGeneric("removeRegionData")) {
	setGeneric("removeRegionData", function(.object) standardGeneric("removeRegionData"), signature=c(".object"))
}
#' removeRegionData-methods
#'
#' Remove all region data from a \code{\linkS4class{DsATAC}} object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @return a new \code{\linkS4class{DsATAC}} object with region data removed
#'
#' @rdname removeRegionData-DsATAC-method
#' @docType methods
#' @aliases removeRegionData
#' @aliases removeRegionData,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("removeRegionData",
	signature(
		.object="DsATAC"
	),
	function(.object) {
		# reset all region-level slots to empty lists; fragment data is retained
		.object@coord <- list()
		.object@counts <- list()
		.object@countTransform <- list()
		.object
	}
)
#-------------------------------------------------------------------------------
# Register the 'removeSamples' S4 generic unless one has already been defined elsewhere
if (!isGeneric("removeSamples")) {
	setGeneric("removeSamples", function(.object, ...) standardGeneric("removeSamples"), signature=c(".object"))
}
#' removeSamples-methods
#'
#' Remove samples from a \code{\linkS4class{DsATAC}} object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param indices a vector of indices of samples to be removed. Can be numeric, integer or logical.
#' @return a new \code{\linkS4class{DsATAC}} object with sites/regions removed
#'
#' @rdname removeSamples-DsATAC-method
#' @docType methods
#' @aliases removeSamples
#' @aliases removeSamples,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("removeSamples",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		indices
		# reaggregate=TRUE
	) {
		# validate index vector
		if (!is.vector(indices) || !(is.numeric(indices) || is.logical(indices))){
			logger.error(c("Unsupported type for index vector"))
		}
		nSamples <- length(getSamples(.object))
		# inds2keep marks the samples that remain in the object
		inds2keep <- rep(TRUE, nSamples)
		if (is.numeric(indices)){
			if (any(indices > nSamples | indices < 1)) {
				logger.error(c("Invalid values in indices"))
			}
			inds2keep[indices] <- FALSE
		} else if (is.logical(indices)){
			# [robustness fix] a logical index vector must have one entry per sample;
			# previously a wrong-length vector was accepted and silently recycled downstream
			if (length(indices) != nSamples) {
				logger.error(c("Invalid length of logical index vector"))
			}
			inds2keep <- !indices
		}
		if (sum(inds2keep)>=nSamples){
			logger.info("Nothing to be done: keeping object as is")
			return(.object)
		}
		# [bug fix] drop=FALSE keeps the annotation a data.frame even if it has only one column
		.object@sampleAnnot <- .object@sampleAnnot[inds2keep, , drop=FALSE]
		for (rt in getRegionTypes(.object)){
			# [bug fix] drop=FALSE keeps the counts a matrix even if only one sample remains
			.object@counts[[rt]] <- .object@counts[[rt]][, inds2keep, drop=FALSE]
		}
		# subset fragment data if present (one entry per sample)
		if (length(.object@fragments) == nSamples){
			.object@fragments <- .object@fragments[inds2keep]
			chunkedFragmentFiles <- .object@diskDump.fragments && .hasSlot(.object, "diskDump.fragments.nSamplesPerFile") && .object@diskDump.fragments.nSamplesPerFile > 1
			if (chunkedFragmentFiles){
				logger.warning("Repackaging of chunked fragment files not supported yet. Use 'ChrAccR:::undiskFragmentData' if you want to save the dataset later.")
				# TODO: Make this possible. See code for 'filterChroms' for template
			}
		}
		return(.object)
	}
)
#-------------------------------------------------------------------------------
#' Subsetting DsATAC datasets by sample
#'
#' NOTE: '[' operator for DsATAC does not reorder samples or deal with index multiplicity
#'
#' @param x DsATAC object
#' @param i sample names or indices
setMethod("[", "DsATAC",
	function(x, i){
		# subset a DsATAC dataset by sample indices, a logical mask or sample names;
		# note that this operator neither reorders samples nor honors index multiplicity
		if (missing(i)) return(x)
		sampleIds <- getSamples(x)
		nSamples <- length(sampleIds)
		# inds2keep marks the samples that remain in the result
		inds2keep <- rep(FALSE, nSamples)
		if (is.numeric(i)){
			if (any(i > nSamples | i < 1)) {
				logger.error(c("Invalid values in i"))
			}
			inds2keep[i] <- TRUE
		} else if (is.logical(i)){
			# [robustness fix] a logical index vector must have one entry per sample;
			# previously a wrong-length vector was accepted and silently misapplied
			if (length(i) != nSamples) {
				logger.error(c("Invalid length of logical index vector"))
			}
			inds2keep <- i
		} else if (is.character(i)){
			if (!all(i %in% sampleIds)){
				logger.error(c("Could not find the following sample ids:", paste(setdiff(i, sampleIds), collapse=",")))
			}
			inds2keep <- sampleIds %in% i
		}
		if (sum(inds2keep)>=nSamples){
			logger.info("Nothing to be done: keeping object as is")
			return(x)
		}
		if (is.character(i) || is.numeric(i)){
			logger.info("NOTE: '[' operator for DsATAC does not reorder samples")
			if (any(duplicated(i))){
				logger.info("NOTE: '[' operator for DsATAC does not deal with index multiplicity")
			}
		}
		# delegate the actual subsetting: remove the complement of the selection
		return(removeSamples(x, !inds2keep))
	}
)
#-------------------------------------------------------------------------------
# Register the 'transformCounts' S4 generic unless one has already been defined elsewhere
if (!isGeneric("transformCounts")) {
	setGeneric("transformCounts", function(.object, ...) standardGeneric("transformCounts"), signature=c(".object"))
}
#' transformCounts-methods
#'
#' transform count data for an ATAC seq dataset
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param method transformation method to be applied. Currently only 'log2', 'log10', 'quantile' (quantile normalization), 'percentile' (percentile normalization), 'rankPerc' (rank percentile), 'vst' (DESeq2 Variance Stabilizing Transformation), 'batchCorrect' (limma batch effect removal), 'tf-idf', 'CPM' (counts per million), 'RPKM' (RPKM normalization), and 'RPKMtss'/'CPMtss' (size factors based on TSS-overlapping counts) are supported
#' @param regionTypes character vector specifying a name for the region type in which count data should be normalized (default: all region types)
#' @param ... other arguments depending on the \code{method} used. For \code{'batchCorrect'} it should be arguments passed on to \code{limma::removeBatchEffect} (most importantly, the \code{batch} argument).
#' @return a new \code{\linkS4class{DsATAC}} object with normalized count data
#'
#' @rdname transformCounts-DsATAC-method
#' @docType methods
#' @aliases transformCounts
#' @aliases transformCounts,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("transformCounts",
signature(
.object="DsATAC"
),
function(
.object,
method="quantile",
regionTypes=getRegionTypes(.object),
...
) {
if (!all(regionTypes %in% getRegionTypes(.object))){
logger.error(c("Unsupported region type:", paste(setdiff(regionTypes, getRegionTypes(.object)), collapse=", ")))
}
if (!is.element(method, c("quantile", "percentile", "rankPerc", "log2", "log10", "RPKM", "CPM", "RPKMtss", "CPMtss", "vst", "batchCorrect", "tf-idf"))) logger.error(c("Unsupported normalization method type:", method))
# choose appropriate functions for row and column functions
# depending on the matrix type
rsFun <- rowSums
csFun <- colSums
# sparse matrices
if (.object@sparseCounts){
rsFun <- Matrix::rowSums
csFun <- Matrix::colSums
}
# DelayedArray
if (.object@diskDump){
rsFun <- BiocGenerics::rowSums
csFun <- BiocGenerics::colSums
}
if (method == "quantile"){
logger.start(c("Performing quantile normalization"))
for (rt in regionTypes){
logger.status(c("Region type:", rt))
cnames <- colnames(.object@counts[[rt]])
# .object@counts[[rt]] <- data.table(normalize.quantiles(as.matrix(.object@counts[[rt]])))
.object@counts[[rt]] <- preprocessCore::normalize.quantiles(ChrAccR::getCounts(.object, rt, asMatrix=TRUE)) #design question: should we set the parameter naIsZero==FALSE?
if (!.object@diskDump && .object@sparseCounts){
.object@counts[[rt]] <- as(.object@counts[[rt]], "sparseMatrix")
}
if (.object@diskDump){
.object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
}
colnames(.object@counts[[rt]]) <- cnames
.object@countTransform[[rt]] <- c("quantileNorm", .object@countTransform[[rt]])
}
logger.completed()
} else if (method == "rankPerc"){
logger.start(c("Applying rank percentile transformation"))
for (rt in regionTypes){
logger.status(c("Region type:", rt))
cnames <- colnames(.object@counts[[rt]])
.object@counts[[rt]] <- muRtools::normalizeRank(ChrAccR::getCounts(.object, rt, asMatrix=TRUE), out="percentile")
if (!.object@diskDump && .object@sparseCounts){
.object@counts[[rt]] <- as(.object@counts[[rt]], "sparseMatrix")
}
if (.object@diskDump){
.object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
}
colnames(.object@counts[[rt]]) <- cnames
.object@countTransform[[rt]] <- c("rankPercNorm", .object@countTransform[[rt]])
}
logger.completed()
} else if (method == "percentile"){
logger.start(c("Applying percentile transformation"))
for (rt in regionTypes){
logger.status(c("Region type:", rt))
cnames <- colnames(.object@counts[[rt]])
.object@counts[[rt]] <- muRtools::normalizePercentile(ChrAccR::getCounts(.object, rt, asMatrix=TRUE))
if (!.object@diskDump && .object@sparseCounts){
.object@counts[[rt]] <- as(.object@counts[[rt]], "sparseMatrix")
}
if (.object@diskDump){
.object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
}
colnames(.object@counts[[rt]]) <- cnames
.object@countTransform[[rt]] <- c("percentile", .object@countTransform[[rt]])
}
logger.completed()
} else if (is.element(method, c("RPKM", "CPM", "RPKMtss", "CPMtss"))){
doRpkm <- grepl("^RPKM", method)
doTssBg <- grepl("tss$", method)
if (doTssBg){
annoPkg <- getChrAccRAnnotationPackage(.object@genome)
if (is.null(annoPkg)) logger.error("Annotation package needed")
tssGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="tssGr")
tssGr <- suppressWarnings(trim(resize(tssGr, width=101L, fix="center", ignore.strand=TRUE)))
}
logger.start(c("Performing", method, "normalization"))
for (rt in regionTypes){
logger.status(c("Region type:", rt))
cnames <- colnames(.object@counts[[rt]])
cm <- as.matrix(.object@counts[[rt]])
# cm <- .object@counts[[rt]]
regLen <- rep(1000L, getNRegions(.object, rt)) # for CPM, treat all regions as length 1000
rGr <- NULL
if (doRpkm || doTssBg) rGr <- getCoord(.object, rt)
if (doRpkm) regLen <- width(rGr)
sizeFac <- NULL
if (doTssBg){
ovTss <- overlapsAny(rGr, tssGr, ignore.strand=TRUE)
sizeFac <- matrix(csFun(cm[ovTss,,drop=FALSE], na.rm=TRUE), ncol=ncol(cm), nrow=nrow(cm), byrow=TRUE)
} else {
sizeFac <- matrix(csFun(cm, na.rm=TRUE), ncol=ncol(cm), nrow=nrow(cm), byrow=TRUE)
}
# .object@counts[[rt]] <- data.table(cm/(regLen * sizeFac) * 1e3 * 1e6)
.object@counts[[rt]] <- cm/(regLen * sizeFac) * 1e3 * 1e6
if (.object@diskDump) .object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
colnames(.object@counts[[rt]]) <- cnames
.object@countTransform[[rt]] <- c(method, .object@countTransform[[rt]])
}
logger.completed()
} else if (is.element(method, c("log2", "log10"))){
c0 <- 1
logFun <- log10
if (method=="log2") logFun <- log2
logger.start(c(method, "transforming counts"))
if (.object@sparseCounts) logger.error("Generating sparse matrix for matrix with possible true zero entries (", method, ")")
for (rt in regionTypes){
logger.status(c("Region type:", rt))
cm <- as.matrix(.object@counts[[rt]])
cnames <- colnames(cm)
idx <- !is.na(cm) & cm!=0
cm[idx] <- logFun(cm[idx] + c0)
.object@counts[[rt]] <- cm
if (.object@diskDump) .object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
colnames(.object@counts[[rt]]) <- cnames
.object@countTransform[[rt]] <- c(method, .object@countTransform[[rt]])
}
logger.completed()
} else if (method == "vst"){
logger.start(c("Applying DESeq2 VST"))
if (.object@sparseCounts) logger.warning("Generating sparse matrix for matrix with possible true zero entries (VST)")
for (rt in regionTypes){
logger.status(c("Region type:", rt))
dds <- DESeq2::DESeqDataSet(getCountsSE(.object, rt), design=~1)
# .object@counts[[rt]] <- data.table(assay(DESeq2::vst(dds, blind=TRUE)))
.object@counts[[rt]] <- assay(DESeq2::vst(dds, blind=TRUE))
if (!.object@diskDump && .object@sparseCounts){
.object@counts[[rt]] <- as(.object@counts[[rt]], "sparseMatrix")
}
if (.object@diskDump){
.object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
}
.object@countTransform[[rt]] <- c("deseq.vst", .object@countTransform[[rt]])
}
logger.completed()
} else if (method == "batchCorrect"){
logger.start(c("Applying batch effect correction"))
for (rt in regionTypes){
logger.status(c("Region type:", rt))
.object@counts[[rt]] <- limma::removeBatchEffect(.object@counts[[rt]], ...)
if (!.object@diskDump && .object@sparseCounts){
.object@counts[[rt]] <- as(.object@counts[[rt]], "sparseMatrix")
}
if (.object@diskDump){
.object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
}
.object@countTransform[[rt]] <- c("batchCorrect", .object@countTransform[[rt]])
}
logger.completed()
} else if (method == "tf-idf"){
# TF-IDF transformation as applied in LSI (Shendure lab) (Cusanovich, et al. (2018). A Single-Cell Atlas of In Vivo Mammalian Chromatin Accessibility. Cell, 1-35)
# Recycled some code from: https://github.com/shendurelab/mouse-atac
logger.start(c("Applying TF-IDF transformation"))
for (rt in regionTypes){
logger.status(c("Region type:", rt))
cm <- .object@counts[[rt]] > 0 #indicator matrix: are there any counts in that region
cnames <- colnames(cm)
if (class(cm)=="lgCMatrix"){
tf <- Matrix::t(Matrix::t(cm) / Matrix::colSums(cm)) #term frequency
idf <- tf * log(1 + ncol(cm) / Matrix::rowSums(cm)) # inverse document frequency
} else {
cm <- cm & !is.na(.object@counts[[rt]])
tf <- t(t(cm) / csFun(cm)) #term frequency
idf <- tf * log(1 + ncol(cm) / rsFun(cm)) # inverse document frequency
}
.object@counts[[rt]] <- idf
if (.object@diskDump) .object@counts[[rt]] <- as(.object@counts[[rt]], "HDF5Array")
colnames(.object@counts[[rt]]) <- cnames
.object@countTransform[[rt]] <- c("TF-IDF", .object@countTransform[[rt]])
}
logger.completed()
}
return(.object)
}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'filterLowCovg' unless one has been defined already
if (!isGeneric("filterLowCovg")) {
  setGeneric("filterLowCovg", function(.object, ...) standardGeneric("filterLowCovg"), signature = c(".object"))
}
#' filterLowCovg-methods
#'
#' Filter regions with low read counts
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param thresh regions with read counts below this threshold will be considered lowly covered regions (default: regions with fewer than 1 read will be discarded)
#' @param reqSamples the percentile of samples required to meet or exceed the threshold in order for a region to be retained.
#' must be in the interval [0, 1) (default: 0.75 = 75 percent)
#' @param regionTypes character vector specifying the names of the region types to which filtering should be applied (default: all region types)
#' @return a new \code{\linkS4class{DsATAC}} object with low coverage regions removed
#'
#' @rdname filterLowCovg-DsATAC-method
#' @docType methods
#' @aliases filterLowCovg
#' @aliases filterLowCovg,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("filterLowCovg",
  signature(
    .object="DsATAC"
  ),
  function(
    .object,
    thresh=1L,
    reqSamples=0.75,
    regionTypes=getRegionTypes(.object)
  ) {
    # Remove regions whose read count falls below 'thresh' in too many samples.
    validTypes <- getRegionTypes(.object)
    if (!all(regionTypes %in% validTypes)){
      logger.error(c("Unsupported region type:", paste(setdiff(regionTypes, validTypes), collapse=", ")))
    }
    nSamples <- length(getSamples(.object))
    # interpret fractional 'reqSamples' as a fraction of the sample count;
    # integer values are taken as an absolute number of samples
    minSamples <- reqSamples
    if (minSamples >= 0 && minSamples < 1){
      minSamples <- as.integer(ceiling(minSamples * nSamples))
    }
    if (minSamples > nSamples || minSamples < 1){
      logger.error(c("Invalid number of samples. Must be in the interval [0,1) or an integer in [1, n_samples]"))
    }
    fracRequired <- round(minSamples/nSamples, 2)
    logger.status(c("Removing regions with read counts lower than", thresh, "in more than", nSamples-minSamples, "samples", paste0("(", (1-fracRequired)*100,"%)")))
    # pick the appropriate rowSums implementation once (sparse vs. dense)
    rowSumFun <- if (.object@sparseCounts) Matrix::rowSums else rowSums
    for (rt in regionTypes){
      # a region is kept if at least 'minSamples' samples meet the threshold
      meetsThresh <- getCounts(.object, rt, naIsZero=TRUE, allowSparseMatrix=TRUE) >= thresh
      discard <- rowSumFun(meetsThresh) < minSamples
      nDiscard <- sum(discard)
      nTotal <- getNRegions(.object, rt)
      if (nDiscard >= nTotal) logger.error(c("Cannot remove all regions of type:", rt))
      if (nDiscard > 0){
        .object <- removeRegions(.object, discard, rt)
      }
      logger.info(c("Removed", nDiscard, "regions", paste0("(", round(nDiscard/nTotal, 4)*100, "%)"), "of type", rt))
    }
    return(.object)
  }
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'filterChroms' unless one has been defined already
if (!isGeneric("filterChroms")) {
  setGeneric("filterChroms", function(.object, ...) standardGeneric("filterChroms"), signature = c(".object"))
}
#' filterChroms-methods
#'
#' Filter out regions based on chromosome list
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param exclChrom vector of chromosome names to filter out
#' @return a new \code{\linkS4class{DsATAC}} object filtered for chromosomes
#'
#' @rdname filterChroms-DsATAC-method
#' @docType methods
#' @aliases filterChroms
#' @aliases filterChroms,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("filterChroms",
signature(
.object="DsATAC"
),
function(
.object,
exclChrom=c("chrX", "chrY", "chrM")
) {
# Remove all regions on the excluded chromosomes from every summarized region type
for (rt in getRegionTypes(.object)){
isExclChrom <- as.character(seqnames(getCoord(.object, rt))) %in% exclChrom
.object <- removeRegions(.object, isExclChrom, rt)
logger.info(c("Removed", sum(isExclChrom), "regions", paste0("(", round(sum(isExclChrom)/length(isExclChrom), 4)*100, "%)"), "of type", rt))
}
# also filter the stored per-sample fragment data, if present
if (length(.object@fragments) > 0){
logger.status("Filtering fragment data")
# fragments may be disk-dumped in chunked RDS files holding multiple samples each
chunkedFragmentFiles <- .object@diskDump.fragments && .hasSlot(.object, "diskDump.fragments.nSamplesPerFile") && .object@diskDump.fragments.nSamplesPerFile > 1
if (chunkedFragmentFiles){
isFile <- sapply(.object@fragments, is.character)
if (!all(isFile)) logger.error("Expected all disk-dumped fragment files")
fns <- unlist(.object@fragments)
names(fns) <- names(.object@fragments)
for (fn in unique(fns)){
fragGrl <- readRDS(fn)
sids <- names(fragGrl)
# filter: drop fragments on the excluded chromosomes for each sample
fragGrl_filt <- endoapply(fragGrl, FUN=function(x){
idx <- !(as.character(seqnames(x)) %in% exclChrom)
return(x[idx])
})
names(fragGrl_filt) <- sids
# save the filtered list object to a new temp file
# NOTE(review): the loop variable 'fn' is reassigned here; works because
# iteration values were fixed at loop entry, but worth renaming eventually
fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
saveRDS(fragGrl_filt, fn, compress=TRUE)
# replace old filename references for all samples stored in this chunk
.object@fragments[sids] <- rep(list(fn), length(sids))
}
} else {
# one fragment GRanges (in memory or as a single RDS file) per sample
for (sid in names(.object@fragments)){
fragGr <- getFragmentGr(.object, sid)
idx <- !(as.character(seqnames(fragGr)) %in% exclChrom)
fragGr <- fragGr[idx]
if (.object@diskDump.fragments){
# re-dump the filtered fragments to disk and store the file path instead
fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
saveRDS(fragGr, fn, compress=TRUE)
fragGr <- fn
}
.object@fragments[[sid]] <- fragGr
}
}
}
return(.object)
}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'filterByGRanges' unless one has been defined already
if (!isGeneric("filterByGRanges")) {
  setGeneric("filterByGRanges", function(.object, ...) standardGeneric("filterByGRanges"), signature = c(".object"))
}
#' filterByGRanges-methods
#'
#' Filter out regions based on a GRanges object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param gr \code{GRanges} object used for filtering
#' @param method character string specifying the method. Can be \code{"white"} to treat \code{gr} as a whitelist
#' (i.e. only regions and fragments overlapping with it are retained) or \code{"black"} (default) to treat it as a blacklist.
#' @return a new, filtered \code{\linkS4class{DsATAC}} object
#'
#' @rdname filterByGRanges-DsATAC-method
#' @docType methods
#' @aliases filterByGRanges
#' @aliases filterByGRanges,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("filterByGRanges",
  signature(
    .object="DsATAC"
  ),
  function(
    .object,
    gr,
    method="black"
  ) {
    # FIX: class(gr)!="GRanges" rejected valid GRanges subclasses; use S4 inheritance check instead
    if (!is(gr, "GRanges")) logger.error("Invalid gr object. Expected GRanges.")
    if (!is.element(method, c("white", "black"))) logger.error(c("invalid method for filtering:", method))
    # remove overlapping (blacklist) or non-overlapping (whitelist) regions per region type
    for (rt in getRegionTypes(.object)){
      isExcl <- overlapsAny(getCoord(.object, rt), gr) # blacklist
      if (method=="white") isExcl <- !isExcl
      .object <- removeRegions(.object, isExcl, rt)
      logger.info(c("Removed", sum(isExcl), "regions", paste0("(", round(sum(isExcl)/length(isExcl), 4)*100, "%)"), "of type", rt))
    }
    # also filter the stored per-sample fragment data, if present
    if (length(.object@fragments) > 0){
      logger.status("Filtering fragment data")
      # fragments may be disk-dumped in chunked RDS files holding multiple samples each
      chunkedFragmentFiles <- .object@diskDump.fragments && .hasSlot(.object, "diskDump.fragments.nSamplesPerFile") && .object@diskDump.fragments.nSamplesPerFile > 1
      if (chunkedFragmentFiles){
        isFile <- sapply(.object@fragments, is.character)
        if (!all(isFile)) logger.error("Expected all disk-dumped fragment files")
        fns <- unlist(.object@fragments)
        names(fns) <- names(.object@fragments)
        for (fn in unique(fns)){
          fragGrl <- readRDS(fn)
          sids <- names(fragGrl)
          # filter each sample's fragments by overlap with gr
          fragGrl_filt <- endoapply(fragGrl, FUN=function(x){
            idx <- overlapsAny(x, gr) # whitelist
            if (method=="black") idx <- !idx
            return(x[idx])
          })
          names(fragGrl_filt) <- sids
          # save the filtered list to a NEW file; do not clobber the loop variable 'fn'
          fnNew <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
          saveRDS(fragGrl_filt, fnNew, compress=TRUE)
          # replace old filename references for all samples stored in this chunk
          .object@fragments[sids] <- rep(list(fnNew), length(sids))
        }
      } else {
        # one fragment GRanges (in memory or as a single RDS file) per sample
        for (sid in names(.object@fragments)){
          fragGr <- getFragmentGr(.object, sid)
          idx <- overlapsAny(fragGr, gr) # whitelist
          if (method=="black") idx <- !idx
          fragGr <- fragGr[idx]
          if (.object@diskDump.fragments){
            # re-dump the filtered fragments to disk and store the file path instead
            fn <- tempfile(pattern="fragments_", tmpdir=tempdir(), fileext = ".rds")
            saveRDS(fragGr, fn, compress=TRUE)
            fragGr <- fn
          }
          .object@fragments[[sid]] <- fragGr
        }
      }
    }
    return(.object)
  }
)
################################################################################
# Analysis Utils
################################################################################
# Register the S4 generic for 'regionSetCounts' unless one has been defined already
if (!isGeneric("regionSetCounts")) {
  setGeneric("regionSetCounts", function(.object, ...) standardGeneric("regionSetCounts"), signature = c(".object"))
}
#' regionSetCounts-methods
#'
#' Overlap the insertion data with a list of region sets
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param rsl \code{GRangesList} or NAMED list of \code{GRanges} objects. Each element corresponds to a region set for which the summary statistics are reported
#' @param bySample for internal use: iterate over samples (instead of retrieving one giant insertion list for all samples) in order to save memory (at the cost of increased compute time)
#' @return a matrix of overlap counts for each region set and sample
#'
#' @rdname regionSetCounts-DsATAC-method
#' @docType methods
#' @aliases regionSetCounts
#' @aliases regionSetCounts,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("regionSetCounts",
  signature(
    .object="DsATAC"
  ),
  function(
    .object,
    rsl,
    bySample=FALSE
  ) {
    sampleIds <- getSamples(.object)
    nr <- length(rsl)
    if (bySample){
      # flatten the region sets into one GRanges and remember which set each range came from
      # FIX: class(rsl)=="GRangesList" missed subclasses (modern objects are CompressedGRangesList)
      if (is(rsl, "GRangesList")){
        nc <- elementNROWS(rsl)
        rslGr <- unlist(rsl, use.names=FALSE)
      } else {
        nc <- vapply(rsl, length, integer(1L))
        rslGr <- do.call("c", unname(rsl))
      }
      idx.rsl <- rep(seq_len(nr), times=nc)
      # iterate over samples to keep only one sample's insertion sites in memory at a time
      res <- do.call("cbind", lapply(sampleIds, FUN=function(sid){
        logger.status(c("Summarizing fragment counts for sample", sid))
        insGr <- getInsertionSites(.object, sid)[[1]]
        ov <- countOverlaps(rslGr, insGr, ignore.strand=TRUE)
        # sum the per-range overlap counts within each region set
        rr <- tapply(ov, idx.rsl, sum)
        rr <- rr[as.character(seq_len(nr))] # make sure the counts are returned in the same order as in rsl (not sure, if really necessary)
        return(rr)
      }))
      rownames(res) <- names(rsl)
      colnames(res) <- sampleIds
    } else {
      # count overlaps for all samples at once (faster, but needs all insertion sites in memory)
      res <- countPairwiseOverlaps(rsl, getInsertionSites(.object), ignore.strand=TRUE)
    }
    return(res)
  }
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'getInsertionKmerFreq' unless one has been defined already
if (!isGeneric("getInsertionKmerFreq")) {
  setGeneric("getInsertionKmerFreq", function(.object, ...) standardGeneric("getInsertionKmerFreq"), signature = c(".object"))
}
#' getInsertionKmerFreq-methods
#'
#' compute kmer frequencies at insertion sites for each sample
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param samples sample identifiers
#' @param k length of the kmer
#' @param normGenome should the result be normalized by genome-wide kmer frequencies
#' @return a \code{matrix} containing kmer frequencies (one row for each kmer and one column for each sample in the dataset)
#'
#' @rdname getInsertionKmerFreq-DsATAC-method
#' @docType methods
#' @aliases getInsertionKmerFreq
#' @aliases getInsertionKmerFreq,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getInsertionKmerFreq",
  signature(
    .object="DsATAC"
  ),
  function(
    .object,
    samples=getSamples(.object),
    k=6,
    normGenome=FALSE
  ) {
    # Count k-mer occurrences in windows around each sample's insertion sites.
    if (!requireNamespace("Biostrings")) logger.error(c("Could not load dependency: Biostrings"))
    unknownSamples <- setdiff(samples, getSamples(.object))
    if (length(unknownSamples) > 0) logger.error(c("Invalid samples:", paste(unknownSamples, collapse=", ")))
    genomeObj <- getGenomeObject(.object@genome)
    freqM <- do.call("cbind", lapply(samples, FUN=function(sid){
      logger.status(c("Preparing insertion kmer-frequencies for sample", sid))
      # build a window of width k (approximately centered) around each insertion site
      winGr <- getInsertionSites(.object, sid)[[1]]
      winGr <- shift(winGr, -ceiling(k/2))
      winGr <- resize(winGr, width=k, fix="start", ignore.strand=TRUE)
      winGr <- trim(winGr)
      # collapsed k-mer counts over all windows of this sample
      Biostrings::oligonucleotideFrequency(Biostrings::Views(genomeObj, winGr), width=k, simplify.as="collapsed")
    }))
    colnames(freqM) <- samples
    if (normGenome) {
      # divide by the genome-wide background k-mer frequencies
      logger.status(c("Normalizing using genome-wide kmer-frequencies"))
      bgFreq <- Biostrings::oligonucleotideFrequency(Biostrings::Views(genomeObj, getGenomeGr(.object@genome, onlyMainChrs=TRUE)), width=k, simplify.as="collapsed")
      freqM <- freqM/bgFreq
    }
    return(freqM)
  }
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'aggregateRegionCounts' unless one has been defined already
if (!isGeneric("aggregateRegionCounts")) {
  setGeneric("aggregateRegionCounts", function(.object, ...) standardGeneric("aggregateRegionCounts"), signature = c(".object"))
}
#' aggregateRegionCounts-methods
#'
#' Aggregate counts across a set of regions, e.g. for footprinting analysis
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regionGr \code{GRanges} object specifying the regions to aggregate over
#' @param samples sample identifiers
#' @param countAggrFun aggregation function to be used for summarizing the insertion counts at each position. Possible values include \code{"sum"}, \code{"mean"}, and \code{"median"}
#' @param norm method used for normalizing the resulting position-wise counts.
#' Currently only \code{'tailMean'} is supported, which computes normalization factors as the mean signal in the tails of the window
#' @param normTailW fraction of the region window to be used on each side of the window to be used for normalization if \code{norm} is one of \code{'tailMean'}
#' @param kmerBiasAdj compute Tn5 bias and use it to adjust the counts as in Corces, et al., Science, (2018)
#' @param k length of the kmer to be used for sequence bias correction. Only relevant if \code{kmerBiasAdj==TRUE}.
#' @param sampleCovg to save compute time, a sample coverage track list (as computed by \code{getCoverage(.object)}) can be supplied. If not, it will be computed on the fly.
#' @param sampleKmerFreqM to save compute time, a matrix of sample kmer frequency at insertion sites (as computed by \code{getInsertionKmerFreq(.object, ...)}) can be supplied.
#' If not, it will be computed on the fly. Only relevant if \code{kmerBiasAdj==TRUE}.
#' @param regionKmerFreqM to save compute time, a matrix of region kmer frequencies (kmers X window width).
#' Must have the same number of rows as the specified (or computed) \code{sampleKmerFreqM} (kmers)
#' and the same number of columns as the window width (median width of \code{regionGr}).
#' Only relevant if \code{kmerBiasAdj==TRUE}.
#' @param silent limit log messages
#' @return a \code{data.frame} containing position-wise counts (raw, normalized and optionally Tn5-bias-corrected) for each sample
#'
#' @rdname aggregateRegionCounts-DsATAC-method
#' @docType methods
#' @aliases aggregateRegionCounts
#' @aliases aggregateRegionCounts,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("aggregateRegionCounts",
signature(
.object="DsATAC"
),
function(
.object,
regionGr,
samples=getSamples(.object),
countAggrFun="sum",
norm="tailMean",
normTailW=0.1,
kmerBiasAdj=TRUE,
k=6,
sampleCovg=NULL,
sampleKmerFreqM=NULL,
regionKmerFreqM=NULL,
silent=FALSE
) {
# validate sample ids and the aggregation function
if (!all(samples %in% getSamples(.object))) logger.error(c("Invalid samples:", paste(setdiff(samples, getSamples(.object)), collapse=", ")))
if (!is.element(countAggrFun, c("sum", "mean", "median"))) logger.error(c("Invalid value for countAggrFun:", countAggrFun))
# enforce a uniform window width: keep only regions of the median width
ww <- width(regionGr)
wm <- as.integer(median(ww))
idx <- ww==wm
if (!all(idx)){
if (!silent) logger.warning(c("not all elements in GRanges have the same width. --> discarding", sum(!idx), "of", length(idx), "regions that do not."))
regionGr <- regionGr[idx]
}
# a precomputed region k-mer frequency matrix must match the window width
if (kmerBiasAdj && !is.null(regionKmerFreqM) && ncol(regionKmerFreqM)!=wm){
logger.error(c("Invalid value for regionKmerFreqM. expected a window size of", wm, "rows, but input has", ncol(regionKmerFreqM)))
}
if (!is.element(norm, c("tailMean"))) logger.error("Invalid value for 'norm' (normalization method)")
# fractional 'normTailW' is interpreted as a fraction of the window width;
# integer values are treated as a number of basepairs
if (normTailW < 1 && normTailW>0){
normTailW <- ceiling(wm*normTailW)
} else if (is.integer(normTailW) && normTailW > 0 && normTailW <= wm/2){
# do nothing: treat integer values as basepairs
normTailW <- normTailW
} else {
logger.error("Invalid value for tail-normalization window")
}
# compute per-sample coverage unless it was supplied by the caller
if (is.null(sampleCovg)){
if (!silent) logger.start("Computing sample coverage")
sampleCovg <- getCoverage(.object, samples=samples)
if (!silent) logger.completed()
} else {
if (!all(samples %in% names(sampleCovg))) logger.error("'sampleCovg' does not cover all samples")
}
kmerFreqM <- NULL
go <- NULL
# NOTE(review): 'tn5bias' (lowercase b) appears unused in this method; the bias
# vector actually used below is 'tn5Bias', computed per sample inside the final lapply
tn5bias <- rep(as.numeric(NA), wm)
if (kmerBiasAdj){
go <- getGenomeObject(.object@genome)
# sample-wise k-mer frequencies at insertion sites (normalized by genome background)
if (is.null(sampleKmerFreqM)) {
logger.start("Computing sample kmer frequencies")
sampleKmerFreqM <- getInsertionKmerFreq(.object, samples=samples, k=k, normGenome=TRUE)
logger.completed()
} else {
if (!all(samples %in% colnames(sampleKmerFreqM))) logger.error("'sampleKmerFreqM' does not cover all samples")
}
# position-wise k-mer frequencies across the region windows (kmers X window position)
if (is.null(regionKmerFreqM)) {
logger.start("Computing region kmer frequencies")
kmerFreqM <- do.call("cbind", lapply(0:(wm-1), FUN=function(i){
if (i%%50 == 0) logger.status(paste0("i=",i))
# k-mer window starting at offset i within each region (shifted to center on position i)
wGr <- trim(resize(GenomicRanges::shift(regionGr, i-ceiling(k/2)), width=k, fix="start", ignore.strand=TRUE))
rr <- Biostrings::oligonucleotideFrequency(Biostrings::Views(go, wGr), width=k, simplify.as="collapsed")
return(rr)
}))
logger.completed()
} else {
kmerFreqM <- regionKmerFreqM
}
if (!all(rownames(kmerFreqM)==rownames(sampleKmerFreqM))) logger.error("kmers in frequency matrices do not match")
}
# given a RleList object (cov) with genomic coverage (as computed by GenomicRanges::coverage) and a GRanges
# object (gr) specifying the genomic locations of the features of interests (of uniform length)
# returns the summed/piled-up coverages for each position across all elements in gr
# adapted and optimized code from Jeff Granja
fastFootprint <- function(cov, gr, aggrFun="sum"){
# restrict both coverage and regions to chromosomes present in both
int <- intersect(names(cov), unique(seqnames(gr)))
cov <- cov[int]
gr <- gr[which(as.character(seqnames(gr)) %in% int)]
suppressWarnings(seqlengths(gr)[int] <- sapply(cov,length))
gr <- trim(gr)
w <- as.integer(median(width(gr)))
gr <- gr[width(gr)==w,] #only select elements that have the same length
grL <- split(gr, seqnames(gr))
# build a (regions X positions) coverage matrix, chromosome by chromosome
covM <- do.call("rbind", lapply(names(cov), function(chrom){
v <- as.matrix(Biostrings::Views(cov[[chrom]], ranges(grL[[chrom]])))
v[is.na(v)] <- 0 # to handle errors; should not be needed
# revert negative strand regions
revIdx <- as.character(strand(grL[[chrom]]))=="-"
if (any(revIdx)) v[revIdx,] <- v[revIdx, w:1]
return(v)
}))
# aggregate across regions for each window position
res <- NULL
if (aggrFun=="sum"){
res <- colSums(covM)
} else if (aggrFun=="mean"){
res <- colMeans(covM)
} else if (aggrFun=="median"){
res <- matrixStats::colMedians(covM)
} else {
logger.error(c("Unknown aggrFun in fastFootprint:", aggrFun))
}
return(res)
}
# aggregate position-wise insertion counts for each sample
if (!silent) logger.start("Aggregating counts")
countL <- lapply(samples, FUN=function(sid){
if (!silent) logger.status(c("Sample:", sid))
return(fastFootprint(sampleCovg[[sid]], regionGr, aggrFun=countAggrFun))
})
names(countL) <- samples
if (!silent) logger.completed()
# assemble per-sample result data frames (raw, normalized and optionally bias-corrected counts)
res <- do.call("rbind", lapply(samples, FUN=function(sid){
cs <- countL[[sid]]
tn5Bias <- c()
if (kmerBiasAdj) {
# expected Tn5 insertion bias per window position:
# region k-mer frequencies weighted by the sample's k-mer preference
tn5Bias <- as.vector(t(kmerFreqM) %*% matrix(sampleKmerFreqM[,sid]))
}
normFac <- as.numeric(NA)
normFac.tn5 <- as.numeric(NA)
if (norm=="tailMean"){
# normalization factor: inverse of the mean signal in the two window tails
normFac <- 1/mean(cs[c(1:normTailW, (wm-normTailW+1):wm)], na.rm=TRUE)
if (kmerBiasAdj) {
normFac.tn5 <- 1/mean(tn5Bias[c(1:normTailW, (wm-normTailW+1):wm)], na.rm=TRUE)
}
}
sampleDf <- data.frame(
sampleId=sid,
pos=1:wm,
count=cs,
countNorm=cs*normFac,
stringsAsFactors=FALSE
)
if (kmerBiasAdj){
sampleDf <- cbind(sampleDf, data.frame(
countsBiasCor=cs/tn5Bias,
countNormBiasCor=(cs*normFac)/(tn5Bias*normFac.tn5),
Tn5bias=tn5Bias,
Tn5biasNorm=tn5Bias*normFac.tn5,
stringsAsFactors=FALSE
))
}
return(sampleDf)
}))
return(res)
}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'getMotifEnrichment' unless one has been defined already
if (!isGeneric("getMotifEnrichment")) {
  setGeneric("getMotifEnrichment", function(.object, ...) standardGeneric("getMotifEnrichment"), signature = c(".object"))
}
#' getMotifEnrichment-methods
#'
#' Perform enrichment analysis for (TF) motifs of a query set of regions.
#' Fisher's Exact Test is employed to test the association of motif present in the
#' query set against the background of all regions of that type covered in the object
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param type character string specifying the region type
#' @param idx logical vector or indices of the same length as \code{length(getCoord(.object))}
#' specifies the query set
#' @param motifs either a character string (currently only "jaspar" and sets contained in \code{chromVARmotifs} ("homer", "encode", "cisbp") are supported) or an object containing PWMs
#' that can be used by \code{motifmatchr::matchMotifs} (such as an \code{PFMatrixList} or \code{PWMatrixList} object)
#' @return a \code{data.frame} summarizing Fisher's Exact Test enrichment statistics for each motif
#'
#' @rdname getMotifEnrichment-DsATAC-method
#' @docType methods
#' @aliases getMotifEnrichment
#' @aliases getMotifEnrichment,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getMotifEnrichment",
signature(
.object="DsATAC"
),
function(
.object,
type,
idx,
motifs="jaspar"
) {
res <- NULL
#count matrix
coords <- getCoord(.object, type)
# validate the query index and convert it to a logical vector over all regions
if (is.logical(idx)){
if (length(idx)!=length(coords)){
logger.error(c("Could not get motif enrichment: the supplied index vector is logical, but its length does not match"))
}
} else if (is.numeric(idx)){
idx <- as.integer(idx)
if (any(idx < 0 | idx > length(coords))){
logger.error(c("Could not get motif enrichment: the supplied index vector numeric but indices are out of range"))
}
#create logical vector
tmp <- idx
idx <- rep(FALSE, length(coords))
idx[tmp] <- TRUE
rm(tmp)
} else {
logger.error(c("Could not get motif enrichment: invalid index vector"))
}
# for motifmatchr: prepare PWMs and genome object
mmObjs <- prepareMotifmatchr(.object@genome, motifs)
#MAIN
se <- getCountsSE(.object, type)
mm <- motifmatchr::matchMotifs(mmObjs[["motifs"]], se, genome=mmObjs[["genome"]])
# logical (regions X motifs) match matrix
regionMotifMatch <- as.matrix(motifmatchr::motifMatches(mm))
motifNames <- colData(mm)[["name"]]
names(motifNames) <- NULL
# use human-readable motif names only if they are unique; otherwise fall back
# to the match matrix's original column names
if (!any(duplicated(motifNames))){
colnames(regionMotifMatch) <- motifNames
} else {
motifNames <- colnames(regionMotifMatch)
}
# one-sided Fisher's Exact Test per motif: query membership vs. motif match
res <- do.call("rbind", lapply(motifNames, FUN=function(mo){
# hasMotif <- regionMotifMatch[,mo]
# a <- sum(hasMotif & idx)
# b <- sum(!hasMotif & idx)
# c <- sum(hasMotif & !idx)
# d <- sum(!hasMotif & !idx)
# fr <- fisher.test(matrix(c(a,c,b,d), nrow=2, ncol=2), alternative="greater")
# degenerate case: query or motif column is constant --> no test possible, report NA
if (length(unique(idx)) < 2 || length(unique(regionMotifMatch[,mo])) < 2){
fr <- list(p.value=as.numeric(NA), estimate=as.numeric(NA))
} else {
fr <- fisher.test(x=idx, y=regionMotifMatch[,mo], alternative="greater")
}
df <- data.frame(
pVal=fr$p.value,
oddsRatio=fr$estimate
)
return(df)
}))
# multiple testing correction via q-values
# NOTE(review): q-values are only computed when NO p-value is NA -- a single
# degenerate motif above leaves the entire qVal column NA; confirm this is intended
res[,"qVal"] <- rep(as.numeric(NA), nrow(res))
if (!any(is.na(res[,"pVal"]))) {
res[,"qVal"] <- tryCatch(
qvalue::qvalue(res[,"pVal"])$qvalue,
error = function(ee) {
# qvalue() can fail on some p-value distributions; retry with lambda=0
# for that specific error, otherwise warn and return NAs
if (ee$message == "missing or infinite values in inputs are not allowed"){
return(qvalue::qvalue(res[,"pVal"], lambda=0)$qvalue)
} else {
logger.warning(c("Could not compute qvalues:", ee$message))
return(rep(as.numeric(NA), nrow(res)))
}
}
)
}
res[,"motif"] <- motifNames
rownames(res) <- motifNames
return(res)
}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for 'getChromVarDev' unless one has been defined already
if (!isGeneric("getChromVarDev")) {
  setGeneric("getChromVarDev", function(.object, ...) standardGeneric("getChromVarDev"), signature = c(".object"))
}
#' getChromVarDev-methods
#'
#' Compute chromVar deviations
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param type character string specifying the region type
#' @param motifs either a character string (currently only "jaspar" and sets contained in \code{chromVARmotifs} ("homer", "encode", "cisbp") are supported) or an object containing PWMs
#' that can be used by \code{motifmatchr::matchMotifs} (such as an \code{PFMatrixList} or \code{PWMatrixList} object)
#' @return Deviations object as returned by \code{chromVAR::computeDeviations}
#'
#' @rdname getChromVarDev-DsATAC-method
#' @docType methods
#' @aliases getChromVarDev
#' @aliases getChromVarDev,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getChromVarDev",
  signature(
    .object="DsATAC"
  ),
  function(
    .object,
    type,
    motifs="jaspar"
  ) {
    # Compute chromVAR deviations from the region counts of the given type.
    countSe <- getCountsSE(.object, type, naIsZero=TRUE)
    genomeObj <- getGenomeObject(.object@genome)
    # hack to override inconsistent naming of genome versions (e.g. hg38 and GRCh38)
    genome(countSe) <- S4Vectors::metadata(genomeObj)$genome
    countSe <- chromVAR::addGCBias(countSe, genome=genomeObj)
    # match motifs using motifmatchr
    mmInput <- prepareMotifmatchr(genomeObj, motifs)
    mmObj <- motifmatchr::matchMotifs(mmInput[["motifs"]], countSe, genome=genomeObj)
    # only consider regions where there is an actual motif match and counts
    keep <- safeMatrixStats(assay(countSe), "rowSums") > 0 & safeMatrixStats(assay(mmObj), "rowSums") > 0
    dev <- chromVAR::computeDeviations(object=countSe[keep,], annotations=mmObj[keep,])
    return(dev)
  }
)
################################################################################
# Footprinting
################################################################################
# Register the S4 generic for 'getMotifFootprints' unless one has been defined already
if (!isGeneric("getMotifFootprints")) {
  setGeneric("getMotifFootprints", function(.object, ...) standardGeneric("getMotifFootprints"), signature = c(".object"))
}
#' getMotifFootprints-methods
#'
#' Perform enrichment analysis for (TF) motif footprinting
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param motifNames character vector of motif names
#' @param samples sample identifiers
#' @param motifFlank number of base pairs flanking the motif on each side
#' @param type (PLACEHOLDER ARGUMENT: NOT IMPLEMENTED YET) character string specifying the region type or \code{".genome"} (default) for genome-wide profiling
#' @param motifDb either a character string (currently only "jaspar" and sets contained in \code{chromVARmotifs} ("homer", "encode", "cisbp") are supported) or an object containing PWMs
#' that can be used by \code{motifmatchr::matchMotifs} (such as an \code{PFMatrixList} or \code{PWMatrixList} object) OR a list of \code{GRanges} objects specifying motif occurrences
#' @return a \code{list} of footprinting results with one element for each motif. Each motif's results contain summary data frames with aggregated counts
#' across all motif occurrences and a \code{ggplot} object for plotting footprints
#'
#' @rdname getMotifFootprints-DsATAC-method
#' @docType methods
#' @aliases getMotifFootprints
#' @aliases getMotifFootprints,DsATAC-method
#' @author Fabian Mueller
#' @export
#' @examples
#' \dontrun{
#' dsa <- ChrAccRex::loadExample("dsAtac_ia_example")
#' motifNames <- c("MA1419.1_IRF4", "MA0139.1_CTCF", "MA0037.3_GATA3")
#' # motifNames <- grep("(IRF4|CTCF|GATA3)$", names(prepareMotifmatchr("hg38", "jaspar")$motifs), value=TRUE, ignore.case=TRUE) # alternative by searching
#' samples <- c("TeffNaive_U_1001", "TeffNaive_U_1002", "TeffMem_U_1001", "TeffMem_U_1002")
#' fps <- getMotifFootprints(dsa, motifNames, samples)
#' fps[["MA1419.1_IRF4"]]$plot
#' }
setMethod("getMotifFootprints",
signature(
.object="DsATAC"
),
function(
.object,
motifNames,
samples=getSamples(.object),
motifFlank=250L,
type=".genome",
motifDb="jaspar"
) {
# optionally restrict footprinting to motif occurrences within a region type
universeGr <- NULL
if (type!=".genome"){
logger.warning("Footprinting: Using types other than '.genome' is experimental")
universeGr <- getCoord(.object, type)
elementMetadata(universeGr) <- NULL
}
logger.start("Finding motif occurrences")
motifObj <- NULL # motif PWM objects (only used for plotting sequence logos)
motifKmerFreqML <- NULL # region window k-mer frequencies for bias correction
annoPkg <- getChrAccRAnnotationPackage(.object@genome)
# use && for scalar conditions (short-circuits; avoids elementwise &)
if (!is.null(annoPkg) && is.character(motifDb)){
# preferred source: precomputed motif annotation from the genome annotation package
logger.info(c("Using annotation from package:", annoPkg))
motifObj <- get("getMotifAnnotation", asNamespace(annoPkg))(anno=motifDb, type="motifs")
motifGrl <- get("getMotifAnnotation", asNamespace(annoPkg))(anno=motifDb, type="motifOccGrl")
if (type==".genome"){
motifKmerFreqML <- get("getMotifAnnotation", asNamespace(annoPkg))(anno=motifDb, type="motifWindowKmerFreq")
}
} else if (is.list(motifDb) || any(grepl("GRanges",class(motifDb)))){
# user-supplied motif occurrences (list of GRanges or GRangesList)
logger.info("Provided motif occurrences as (GRanges) list")
motifGrl <- motifDb
} else {
# fallback: scan the genome with motifmatchr
logger.info("Using motifmatchr")
if (is.character(motifDb)){
motifObj <- prepareMotifmatchr(.object@genome, motifDb)$motifs # currently only used for motif logo plotting. Could be omitted if that is not desired
}
motifGrl <- getMotifOccurrences(motifNames, motifDb=motifDb, genome=.object@genome)
}
# median motif width per motif (type-stable vapply instead of sapply)
motifLens <- vapply(motifNames, FUN=function(mn){
as.integer(median(width(motifGrl[[mn]])))
}, integer(1L))
names(motifLens) <- motifNames
logger.completed()
obj_filt <- .object
if (!is.null(universeGr)){
# remove region aggregates to save time and space (we just need insertions)
for (rt in getRegionTypes(obj_filt)){
obj_filt <- removeRegionType(obj_filt, rt)
}
logger.start(c("Filtering insertions by region type:", type))
obj_filt <- filterByGRanges(obj_filt, universeGr, method="white")
logger.completed()
}
# per-sample totals used for count normalization downstream
logger.start("Computing sample coverage")
sampleCovg <- getCoverage(obj_filt, samples=samples)
logger.completed()
logger.start("Computing sample insertion k-mer frequencies")
sampleKmerFreqM <- getInsertionKmerFreq(obj_filt, samples=samples, k=6, normGenome=TRUE)
logger.completed()
res <- lapply(motifNames, FUN=function(mn){
logger.start(c("Motif:", mn))
motifCenGr <- motifGrl[[mn]]
if (!is.null(universeGr)){
idx <- overlapsAny(motifCenGr, universeGr, type="within", ignore.strand=TRUE)
logger.info(paste0("Retaining ", sum(idx), " of ", length(idx), " (", round(100*sum(idx)/length(idx), 2), "%) motif occurrences overlapping with query region type"))
motifCenGr <- motifCenGr[idx]
}
# extend each occurrence symmetrically by the flanking window; deduplicate
motifCenGr <- unique(trim(resize(motifCenGr, width=2*motifFlank+1, fix="center", ignore.strand=TRUE)))
regKmerFreqM <- NULL # region window k-mer frequencies for bias correction
if (length(motifKmerFreqML) > 0){
regKmerFreqM <- motifKmerFreqML[[mn]]
}
footprintDf <- aggregateRegionCounts(obj_filt, motifCenGr, samples=samples, sampleCovg=sampleCovg, sampleKmerFreqM=sampleKmerFreqM, regionKmerFreqM=regKmerFreqM)
footprintDf$pos <- footprintDf$pos - (motifFlank+1) # offset the position: aggregateRegionCounts returns positions in [0,regionWidth]
motifLen <- motifLens[mn]
# Definitions according to [Corces, Granja, et al. (2018). The chromatin accessibility landscape of primary human cancers. Science, 362(6413)]
absPos <- abs(footprintDf[,"pos"])
i_base <- ceiling(motifLen/2) + 5
x_base <- footprintDf[absPos <= i_base, "countNormBiasCor"]
# 10% trimmed mean
x_base[x_base > quantile(x_base, 0.9) | x_base < quantile(x_base, 0.1)] <- NA
fp_baseMean <- mean(x_base, na.rm=TRUE)
fp_flankMean <- mean(footprintDf[absPos > i_base & absPos <= 50, "countNormBiasCor"], na.rm=TRUE)
fp_bgMean <- mean(footprintDf[absPos >= 200, "countNormBiasCor"], na.rm=TRUE)
# plot
pp <- plotFootprint(footprintDf, pwm=motifObj[[mn]], colorMap=NULL)
# motifUp <- -ceiling(motifLen/2) + 1
# motifDown <- floor(motifLen/2)
# ppm <- ggplot(footprintDf, aes(x=pos, y=countNormBiasCor, color=sampleId, group=sampleId, fill=sampleId)) +
# annotate("rect", xmin=motifUp, xmax=motifDown, ymin=-Inf, ymax=Inf, fill="#d9d9d9") +
# geom_line() #+ geom_smooth(aes(group=cellType),size=2,alpha=0.15,method="gam",formula=y ~ s(x, bs = "cs"))
# ppb <- ggplot(footprintDf, aes(x=pos, y=Tn5biasNorm, color=sampleId, group=sampleId, fill=sampleId)) +
# annotate("rect", xmin=motifUp, xmax=motifDown, ymin=-Inf, ymax=Inf, fill="#d9d9d9") +
# geom_line()
# # add sequence logo
# logo <- hmSeqLogo(motifObj[[mn]], unit(0.5, "npc"), unit(0.5, "npc"), 1, 1, ic.scale=TRUE)
# logoHeight <- (max(footprintDf[,"countNormBiasCor"], na.rm=TRUE) - min(footprintDf[,"countNormBiasCor"], na.rm=TRUE)) * 0.2
# logoY <- max(footprintDf[,"countNormBiasCor"], na.rm=TRUE) - logoHeight
# ppm <- ppm + cowplot::draw_plot(logo, 100, logoY, 100, logoHeight)
# pp <- cowplot::plot_grid(ppm, ppb + theme(legend.position="none"), ncol=1, rel_heights=c(2, 1), align="v", axis="lr")
rr <- list(
footprintDf=footprintDf,
plot=pp,
motifLen=motifLen,
nMotifs=length(motifCenGr),
fpFlankAcc=log2(fp_flankMean/fp_bgMean),
fpDepth=log2(fp_baseMean/fp_flankMean)
)
logger.completed()
return(rr)
})
names(res) <- motifNames
return(res)
}
)
################################################################################
# Differential analysis
################################################################################
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("getDESeq2Dataset")) {
setGeneric(
"getDESeq2Dataset",
function(.object, ...) standardGeneric("getDESeq2Dataset"),
signature=c(".object")
)
}
#' getDESeq2Dataset-methods
#'
#' Retrieve a differential expression dataset computed with DESeq2
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regionType character string specifying the region type
#' @param designCols column names in the sample annotation potentially used to create the design matrix
#' @param compTab if design columns are not specified, you can specify a comparison table directly. These comparison tables can be obtained by \code{getComparisonTable(...)}
#' @param ... parameters passed on to \code{DESeq2::DESeq}
#' @return \code{DESeqDataSet} as returned by \code{DESeq2::DESeq}
#'
#' @rdname getDESeq2Dataset-DsATAC-method
#' @docType methods
#' @aliases getDESeq2Dataset
#' @aliases getDESeq2Dataset,DsATAC-method
#' @author Fabian Mueller
#' @export
#'
#' @examples
#' \dontrun{
#' dsa <- ChrAccRex::loadExample("dsAtac_ia_example")
#' dds <- getDESeq2Dataset(dsa, "IA_prog_peaks", designCols=c("donor", "stimulus", "cellType"))
#' dds
#' }
setMethod("getDESeq2Dataset",
signature(
.object="DsATAC"
),
function(
.object,
regionType,
designCols=NULL,
compTab=NULL,
...
) {
if (!requireNamespace("DESeq2")) logger.error(c("Could not load dependency: DESeq2"))
if (!is.element(regionType, getRegionTypes(.object))) logger.error(c("Unsupported region type:", regionType))
# annotation data
ph <- getSampleAnnot(.object)
if (!is.null(compTab) && nrow(compTab) > 0){
# comparisons against all other samples (".ALL") get dedicated indicator
# columns added to the sample annotation so they can enter the design
vsAllIdx <- compTab[,"grp1Name"]==".ALL" | compTab[,"grp2Name"]==".ALL"
vsAllCols <- unique(compTab[vsAllIdx, "compCol"])
designCols <- union(designCols, unique(compTab[!vsAllIdx,"compCol"]))
if (length(vsAllCols) > 0){
logger.status(c("Adding 1-vs-all comparisons"))
for (i in which(vsAllIdx)){
gn <- setdiff(c(compTab[i, "grp1Name"], compTab[i, "grp2Name"]), ".ALL")
if (length(gn)!=1) logger.error(c("Invalid comparison:", compTab[i,"compName"]))
cn <- compTab[i,"compCol"]
addCn <- paste0(".", cn, ".", gn, ".vsAll")
# binary membership column: group name vs ".ALL"
ph[, addCn] <- ifelse(as.character(ph[,cn])==gn, gn, ".ALL")
designCols <- union(designCols, addCn)
}
}
}
if (!all(designCols %in% colnames(ph))){
logger.error(c("Invalid design columns. The following column names are not contained in the sample annotation:", paste(setdiff(designCols, colnames(ph)), collapse=",")))
}
#count matrix
cm <- ChrAccR::getCounts(.object, regionType, asMatrix=TRUE)
#remove columns from the design that do not have multiple levels
idx <- vapply(designCols, FUN=function(cc){
length(unique(ph[,cc]))>1
}, logical(1L))
if (sum(idx) < length(designCols)){
logger.warning(c("The following design columns will not be considered because they do not have multiple levels:", paste(designCols[!idx], collapse=",")))
designCols <- designCols[idx]
}
#remove columns from the design that do not have replicates
idx <- vapply(designCols, FUN=function(cc){
# numeric covariates are always kept; factor columns require every
# non-empty level to contain more than one sample
grpCounts <- table(ph[,cc])
is.numeric(ph[,cc]) || all(grpCounts > 1 | grpCounts == 0)
}, logical(1L))
if (sum(idx) < length(designCols)){
logger.warning(c("The following design columns will not be considered because they do not have replicates:", paste(designCols[!idx], collapse=",")))
designCols <- designCols[idx]
}
# guard: an empty design would yield the invalid formula "~" and fail with
# a cryptic error in as.formula; fail early with an informative message
if (length(designCols) < 1){
logger.error("No valid design columns remain after filtering. Cannot construct a design formula for DESeq2.")
}
designF <- as.formula(paste0("~", paste(designCols,collapse="+")))
dds <- DESeq2::DESeqDataSetFromMatrix(countData=cm, colData=ph, design=designF)
rowRanges(dds) <- getCoord(.object, regionType)
dds <- DESeq2::DESeq(dds, ...)
return(dds)
}
)
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("getDiffAcc")) {
setGeneric(
"getDiffAcc",
function(.object, ...) standardGeneric("getDiffAcc"),
signature=c(".object")
)
}
#' getDiffAcc-methods
#'
#' Compute differential accessibility
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regionType character string specifying the region type
#' @param comparisonCol column name in the sample annotation table to base the comparison on
#' @param grp1Name name of the first group in the comparison. if not specified, it will be taken as the first factor level specified in the
#' sample annotation table in \code{'comparisonCol'}.
#' @param grp2Name name of the second group (reference) in the comparison. if not specified, it will be taken as the second factor level specified in the
#' sample annotation table in \code{'comparisonCol'}.
#' @param adjustCols column names in the sample annotation potentially used to create the design matrix
#' @param method Method for determining differential accessibility. Currently only \code{'DESeq2'} is supported
#' @param diffObj optional differential object to avoid computing it for each comparison and thus reduce runtime
#' @return a \code{data.frame} containing differential accessibility information
#'
#' @rdname getDiffAcc-DsATAC-method
#' @docType methods
#' @aliases getDiffAcc
#' @aliases getDiffAcc,DsATAC-method
#' @author Fabian Mueller
#' @export
#'
#' @examples
#' \dontrun{
#' dsa <- ChrAccRex::loadExample("dsAtac_ia_example")
#' daTab <- getDiffAcc(dsa, "IA_prog_peaks", "stimulus", grp1Name="S", grp2Name="U", adjustCols=c("cellType", "donor"))
#' }
setMethod("getDiffAcc",
signature(
.object="DsATAC"
),
function(
.object,
regionType,
comparisonCol,
grp1Name=NULL,
grp2Name=NULL,
adjustCols=character(0),
method='DESeq2',
diffObj=NULL
) {
# validate method and comparison column
if (!is.element(method, c("DESeq2"))) logger.error(c("Invalid method for calling differential accessibility:", method))
if (!is.element(comparisonCol, colnames(getSampleAnnot(.object)))) logger.error(c("Comparison column not found in sample annotation:", comparisonCol))
# remember the original column name: comparisonCol itself may be replaced
# by a 1-vs-all helper column name further down
comparisonCol0 <- comparisonCol
contrastF <- factor(getSampleAnnot(.object)[,comparisonCol])
if (length(levels(contrastF)) < 2) logger.error(c("Invalid comparison column. There should be at least 2 groups."))
# default group names: the first two factor levels of the comparison column
if (is.null(grp1Name)) grp1Name <- levels(contrastF)[1]
if (is.null(grp2Name)) grp2Name <- levels(contrastF)[2]
if (!is.element(grp1Name, c(levels(contrastF), ".ALL"))) logger.error(c("Invalid group name (1). Sample annotation has no samples associated with that group:", grp1Name))
if (!is.element(grp2Name, c(levels(contrastF), ".ALL"))) logger.error(c("Invalid group name (2). Sample annotation has no samples associated with that group:", grp2Name))
# sample indices per group; ".ALL" means all samples NOT in the other group
sidx.grp1 <- which(contrastF==grp1Name)
if (grp1Name==".ALL") sidx.grp1 <- which(contrastF!=grp2Name)
sidx.grp2 <- which(contrastF==grp2Name)
if (grp2Name==".ALL") sidx.grp2 <- which(contrastF!=grp1Name)
if (method=="DESeq2"){
is1vsAll <- grp1Name==".ALL" || grp2Name==".ALL"
if (is1vsAll){
if (grp1Name==grp2Name) logger.error(".ALL vs .ALL is not legal")
gn <- setdiff(c(grp1Name, grp2Name), ".ALL")
# use the helper column name created by getDESeq2Dataset for 1-vs-all comparisons
comparisonCol <- paste0(".", comparisonCol, ".", gn, ".vsAll")
}
if (!is.null(diffObj)){
# reuse a precomputed DESeq2 dataset to avoid refitting for each comparison
if (!is.element("DESeqDataSet", class(diffObj))) logger.error(c("Invalid 'diffObj' for method", method))
#check if the elements match
gr <- getCoord(.object, regionType)
if (length(gr)!=nrow(diffObj)) logger.error("dimensions of the 'diffObj' do not match.")
if (!all(colnames(diffObj)==getSamples(.object))) logger.error("Invalid 'diffObj'. Sample names must match the DsATAC object.")
dds <- diffObj
} else {
compTab <- NULL
designCs <- adjustCols
if (is1vsAll){
compTab <- getComparisonTable(.object, compNames=paste0(grp1Name, " vs ", grp2Name, " [", comparisonCol0, "]"))
} else {
designCs <- unique(c(designCs, comparisonCol))
}
dds <- getDESeq2Dataset(.object, regionType, designCs, compTab=compTab)
}
diffRes <- DESeq2::results(dds, contrast=c(comparisonCol, grp1Name, grp2Name))
dm <- data.frame(diffRes)
# combined rank: for each region take the worse (max) of its
# fold-change rank and its p-value rank
rankMat <- cbind(
# rank(-dm[,"baseMean"]), na.last="keep", ties.method="min"),
rank(-abs(dm[,"log2FoldChange"]), na.last="keep", ties.method="min"),
rank(dm[,"pvalue"], na.last="keep", ties.method="min")
)
dm[,"cRank"] <- matrixStats::rowMaxs(rankMat, na.rm=FALSE)
# dm[,"cRank"] <- rowMaxs(rankMat, na.rm=TRUE)
dm[!is.finite(dm[,"cRank"]),"cRank"] <- NA
dm[,"cRank_rerank"] <- rank(dm[,"cRank"], na.last="keep", ties.method="min")
# per-group mean accessibility: log10(FPKM+1) and variance-stabilized counts
l10fpkm <- log10(DESeq2::fpkm(dds, robust=TRUE)+1)
grp1.m.l10fpkm <- rowMeans(l10fpkm[, sidx.grp1, drop=FALSE], na.rm=TRUE)
grp2.m.l10fpkm <- rowMeans(l10fpkm[, sidx.grp2, drop=FALSE], na.rm=TRUE)
vstCounts <- assay(DESeq2::vst(dds, blind=FALSE))
grp1.m.vst <- rowMeans(vstCounts[, sidx.grp1, drop=FALSE], na.rm=TRUE)
grp2.m.vst <- rowMeans(vstCounts[, sidx.grp2, drop=FALSE], na.rm=TRUE)
res <- data.frame(
log2BaseMean=log2(dm[,"baseMean"]),
meanLog10FpkmGrp1=grp1.m.l10fpkm,
meanLog10FpkmGrp2=grp2.m.l10fpkm,
meanVstCountGrp1=grp1.m.vst,
meanVstCountGrp2=grp2.m.vst,
dm
)
# add group names to column names
for (cn in c("meanLog10FpkmGrp", "meanVstCountGrp")){
colnames(res)[colnames(res)==paste0(cn,"1")] <- paste0(cn, "1_", grp1Name)
colnames(res)[colnames(res)==paste0(cn,"2")] <- paste0(cn, "2_", grp2Name)
}
}
return(res)
}
)
################################################################################
# Export
################################################################################
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("exportCountTracks")) {
setGeneric(
"exportCountTracks",
function(.object, ...) standardGeneric("exportCountTracks"),
signature=c(".object")
)
}
#' exportCountTracks-methods
#'
#' export count data as genome tracks (e.g. for visualization in the browser)
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param type character string specifying the region type
#' @param outDir output directory. Must be existing.
#' @param formats character vector of browser formats. Currently only \code{"bed"} and \code{"igv"} are supported
#' @param groupBy a column in the sample annotation table to group by (the mean will be computed)
#' @return nothing of particular interest
#'
#' @rdname exportCountTracks-DsATAC-method
#' @docType methods
#' @aliases exportCountTracks
#' @aliases exportCountTracks,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("exportCountTracks",
signature(
.object="DsATAC"
),
function(
.object,
type,
outDir,
formats=c("bed", "igv"),
groupBy=NULL
) {
# validate the requested region type and the output location
if (!is.element(type, getRegionTypes(.object))) logger.error(c("Unsupported region type:", type))
if (!dir.exists(outDir)) logger.error(c("Output directory:", outDir, "does not exist."))
# retrieve the count matrix and the matching region coordinates
countMat <- ChrAccR::getCounts(.object, type, asMatrix=TRUE)
trackNames <- getSamples(.object)
regionGr <- getCoord(.object, type)
# optionally collapse samples into group-wise mean counts
groupingValid <- !is.null(groupBy) && is.character(groupBy) && is.element(groupBy, colnames(getSampleAnnot(.object)))
if (groupingValid){
grpF <- factor(getSampleAnnot(.object)[,groupBy])
perGrpMeans <- tapply(getSamples(.object), grpF, FUN=function(sids){
rowMeans(countMat[,sids, drop=FALSE], na.rm=TRUE)
}, simplify=FALSE)
countMat <- do.call("cbind", perGrpMeans)
colnames(countMat) <- levels(grpF)
trackNames <- levels(grpF)
}
# attach the (possibly grouped) counts as metadata columns of the coordinates
elementMetadata(regionGr) <- NULL
elementMetadata(regionGr) <- countMat
if (is.element("bed", formats)){
# one bed file per track; the (mean) count becomes the score column
for (tn in trackNames){
bedFn <- file.path(outDir, paste0(tn, "_counts.bed"))
granges2bed(regionGr, bedFn, score=elementMetadata(regionGr)[,tn], addAnnotCols=FALSE, colNames=FALSE, doSort=TRUE, bigBed=FALSE)
}
}
if (is.element("igv", formats)){
# one combined igv file containing all tracks
igvFn <- file.path(outDir, "DsATAC_counts.igv")
granges2igv(regionGr, igvFn, addStrand=FALSE, addAnnotCols=TRUE, doSort=TRUE, toTDF=TRUE)
}
#TODO: work in progress
invisible(NULL)
}
)
################################################################################
# Data processing / inference
################################################################################
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("callPeaks")) {
setGeneric(
"callPeaks",
function(.object, ...) standardGeneric("callPeaks"),
signature=c(".object")
)
}
#' callPeaks-methods
#'
#' Performs peak calling based on insertion sites
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param samples sample identifiers for which peak calling is performed
#' @param method peak calling method. Currently only \code{'macs2_summit_fw_no'} is supported. See details section.
#' @param methodOpts list of other options depending on the \code{'method'} parameter (see details section).
#' @return \code{GRangesList} of peak coordinates for each sample
#'
#' @details
#' The following methods are currently supported
#' \describe{
#' \item{\code{'macs2_summit_fw_no'}}{
#' Fixed-width, non-overlapping peaks based on MACS2 summit calls:
#' 1. Call peaks using system call to MACS2. You can specify the MACS2 executable in \code{methodOpts$macs2.exec}.
#' 2. Identify peak summits
#' 3. extend peak summits on each side by a number of basepairs (specified in \code{methodOpts$fixedWidth}; default: 250bp) to obtain unified peak widths
#' 4. Find non-overlapping peaks by taking the peak with the best MACS2 score from each set of partially overlapping peaks
#' }
#' }
#'
#' @rdname callPeaks-DsATAC-method
#' @docType methods
#' @aliases callPeaks
#' @aliases callPeaks,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("callPeaks",
signature(
.object="DsATAC"
),
function(
.object,
samples=getSamples(.object),
method='macs2_summit_fw_no',
methodOpts=list(
macs2.exec="macs2",
macs2.params=c(
"--shift", "-75",
"--extsize", "150",
"-p", "0.01"
),
fixedWidth=250,
genomeSizesFromObject=FALSE
)
) {
# validate method, sample ids, and availability of fragment data
if (!is.element(method, c("macs2_summit_fw_no"))) logger.error(c("Invalid 'method':", method))
if (!all(samples %in% getSamples(.object))) logger.error(c("Invalid samples:", paste(setdiff(samples, getSamples(.object)), collapse=", ")))
if (!all(samples %in% names(.object@fragments))) logger.error(c("Object does not contain insertion information for samples:", paste(setdiff(samples, names(.object@fragments)), collapse=", ")))
peakGrl <- NULL
if (method=="macs2_summit_fw_no"){
# make sure all options required by this method are present
if (!is.element("macs2.exec", names(methodOpts))) logger.error("Invalid 'methodOps' for method 'macs2_summit_fw_no' (missing 'macs2.exec')")
if (!is.element("macs2.params", names(methodOpts))) logger.error("Invalid 'methodOps' for method 'macs2_summit_fw_no' (missing 'macs2.params')")
if (!is.element("fixedWidth", names(methodOpts))) logger.error("Invalid 'methodOps' for method 'macs2_summit_fw_no' (missing 'fixedWidth')")
if (!is.element("genomeSizesFromObject", names(methodOpts))) logger.error("Invalid 'methodOps' for method 'macs2_summit_fw_no' (missing 'genomeSizesFromObject')")
# fixed MACS2 arguments, followed by the user-supplied parameters
argV <- c(
"--nomodel",
"--call-summits",
"--nolambda",
"--keep-dup", "all",
# "-B", "--SPMR",
methodOpts$macs2.params
)
# value passed to MACS2's -g (effective genome size) argument
genomeSizeArg <- ""
if (methodOpts$genomeSizesFromObject){
# # find the set of the largest tiling regions you can find in the object
# rts <- gtools::mixedsort(grep("^(tiling|t[0-9]+)", getRegionTypes(.object), value=TRUE), decreasing=TRUE)
# if (length(rts) < 1 || is.na(rts[1])) logger.error("No appropriate tiling region set found for determining genome size")
# ggr <- getCoord(.object, rts[1])
# genomeSizeArg <- as.character(sum(width(ggr)))
# logger.info(paste0("Using a genome size of ", genomeSizeArg, " bp (determined from region type: ", rts[1], ")"))
# estimate the effective genome size from the merged fragment coverage
fgr <- do.call("c", unname(getFragmentGrl(.object, samples, asGRangesList=FALSE)))
fgr <- reduce(fgr, min.gapwidth=9L, ignore.strand=TRUE) #adjust gapwidth to account for Tn5 offsetting
genomeSizeArg <- as.character(sum(width(fgr)))
logger.info(paste0("Using a genome size of ", genomeSizeArg, " bp (determined from aggregating fragments)"))
} else if (is.element(.object@genome, c("hg19", "hg38"))){
genomeSizeArg <- "hs"
} else if (is.element(.object@genome, c("mm9", "mm10"))){
genomeSizeArg <- "mm"
} else {
logger.error(c("Unsupported genome for peak calling:", .object@genome))
}
nSamples <- length(samples)
# hash-based file prefix per sample (presumably to avoid file name
# clashes in the shared temp directory -- TODO confirm getHashString semantics)
samplePrefixes <- sapply(samples, FUN=function(x){getHashString(pattern=x)})
genomeAss <- .object@genome
callDir <- tempdir()
# optional parallel execution via a muPipeR CommandR object from the config
cmdr <- getConfigElement("muPipeR_cmdr")
doCmdR <- !is.null(cmdr) && inherits(cmdr, "CommandR")
if (doCmdR){
# when running on a cluster, use the configured (shared) temp directory
callDir <- getConfigElement("tmpDir")
}
# write one bed file of insertion sites per sample (input for MACS2)
logger.start("Preparing insertion bed files")
insFns <- sapply(seq_along(samples), FUN=function(i){
sid <- samples[i]
logger.status(c("Sample:", sid, paste0("(", i, " of ", nSamples,")")))
fp <- samplePrefixes[i]
insFn <- file.path(callDir, paste0(fp, "_ins.bed"))
# logger.status(c("[DEBUG:] Retrieving insertion sites..."))
insGr <- getInsertionSites(.object, sid)[[1]]
# logger.status(c("[DEBUG:] Writing to temp file..."))
# omit the strand column when no insertion carries strand information
coordOnly <- all(as.character(strand(insGr)) %in% c("*", "."))
granges2bed(insGr, insFn, score=NULL, addAnnotCols=FALSE, colNames=FALSE, doSort=TRUE, coordOnly=coordOnly)
return(insFn)
})
names(insFns) <- samples
logger.completed()
# variable environment (for parallel computation using lapplyExec)
envList <- list(
samples=samples,
insFns=insFns,
callDir=callDir,
samplePrefixes=samplePrefixes,
genomeAss=genomeAss,
methodOpts=methodOpts,
argV=argV,
genomeSizeArg=genomeSizeArg
)
# function to be used for peak calling
# (takes a sample index; returns the sample's fixed-width, non-overlapping peak GRanges)
callPeakFun <- function(i){
sid <- samples[i]
logger.status(c("Calling peaks for sample:", sid, paste0("(", i, " of ", length(samples),")")))
# peakFn <- file.path(callDir, paste0(fp, "_summits.bed"))
peakFn <- file.path(callDir, paste0(samplePrefixes[i], "_peaks.narrowPeak"))
# logger.status(c("[DEBUG:] Calling MACS2..."))
aa <- c(
"callpeak",
"-g", genomeSizeArg,
"--name", samplePrefixes[i],
"--treatment", insFns[sid],
"--outdir", callDir,
"--format", "BED",
argV
)
system2(methodOpts$macs2.exec, aa, wait=TRUE, stdout="", stderr="") # MACS2 log messages to console
# system2(methodOpts$macs2.exec, aa, wait=TRUE, stdout=FALSE, stderr=FALSE) # suppress MACS2 log messages
# logger.status(c("[DEBUG:] Reading MACS2 output..."))
# peakGr <- rtracklayer::import(peakFn, format="BED")
peakGr <- readMACS2peakFile(peakFn)
# restrict to canonical/main chromosomes of the assembly
peakGr <- setGenomeProps(peakGr, genomeAss, onlyMainChrs=TRUE, silent=TRUE)
peakGr <- peakGr[isCanonicalChrom(as.character(seqnames(peakGr)))]
# keep the original peak boundaries, then collapse each peak to its summit
elementMetadata(peakGr)[,"calledPeakStart"] <- start(peakGr)
elementMetadata(peakGr)[,"calledPeakEnd"] <- end(peakGr)
start(peakGr) <- end(peakGr) <- elementMetadata(peakGr)[,"summit"]
# scale scores to their percentiles
# scs <- elementMetadata(peakGr)[,"score"]
scs <- elementMetadata(peakGr)[,"negLog10qval"]
elementMetadata(peakGr)[,"score_norm"] <- ecdf(scs)(scs)
elementMetadata(peakGr)[,"name"] <- gsub(paste0("^", samplePrefixes[i]), sid, elementMetadata(peakGr)[,"name"])#replace the hashstring in the name by just the sample id
# logger.status(c("[DEBUG:] Extending summits..."))
peakGr <- trim(promoters(peakGr, upstream=methodOpts$fixedWidth, downstream=methodOpts$fixedWidth+1)) #extend each summit
peakGr <- peakGr[width(peakGr)==median(width(peakGr))] #remove too short regions which might have been trimmed
# logger.status(c("[DEBUG:] Finding non-overlapping peaks..."))
peakGr <- getNonOverlappingByScore(peakGr, scoreCol="score_norm")
# peakGr <- ChrAccR:::getNonOverlappingByScore(peakGr, scoreCol="score_norm")
peakGr <- peakGr[order(as.integer(seqnames(peakGr)),start(peakGr), end(peakGr), as.integer(strand(peakGr)))] #sort
logger.info(c("Identified", length(peakGr), "peaks"))
return(peakGr)
}
logger.start("Calling peaks using MACS2")
if (doCmdR){
# distributed execution: one job per sample
peakGrl <- lapplyExec(
cmdr,
lapply(seq_along(samples), identity),
callPeakFun,
env=envList,
Rexec="Rscript",
name=paste0("chraccr_callPeaks")
)
} else {
# sequential execution in the current session
peakGrl <- lapply(seq_along(samples), callPeakFun)
}
names(peakGrl) <- samples
# cleanup
fns <- file.path(callDir, paste0(samplePrefixes, "*"))
unlink(fns)
logger.completed()
peakGrl <- GRangesList(peakGrl)
}
return(peakGrl)
}
)
################################################################################
# Plotting
################################################################################
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("plotInsertSizeDistribution")) {
setGeneric(
"plotInsertSizeDistribution",
function(.object, ...) standardGeneric("plotInsertSizeDistribution"),
signature=c(".object")
)
}
#' plotInsertSizeDistribution-methods
#'
#' Plot insert size distribution
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param sampleId sample to be plotted
#' @return \code{ggplot} object containing insert size distribution plot
#'
#' @rdname plotInsertSizeDistribution-DsATAC-method
#' @docType methods
#' @aliases plotInsertSizeDistribution
#' @aliases plotInsertSizeDistribution,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("plotInsertSizeDistribution",
signature(
.object="DsATAC"
),
function(
.object,
sampleId
) {
# fragment widths correspond to the sequencing insert sizes
fragWidths <- width(getFragmentGr(.object, sampleId))
plotDf <- data.frame(insertionSize=fragWidths)
# density of insert sizes, scaled to fragment counts, truncated at 700 bp
distPlot <- ggplot(plotDf, aes(x=insertionSize, y=..count..)) +
geom_density(alpha=0.2, fill="#8c1515") +
xlim(c(0, 700)) +
ylab("#fragments")
return(distPlot)
}
)
#-------------------------------------------------------------------------------
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("getTssEnrichment")) {
setGeneric(
"getTssEnrichment",
function(.object, ...) standardGeneric("getTssEnrichment"),
signature=c(".object")
)
}
#' getTssEnrichment-methods
#'
#' Get TSS enrichment data and plot
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param sampleId sample to be plotted
#' @param tssGr \code{GRanges} object containing TSS coordinates or NULL to get default set from annotation package
#' @param flank number of bases flanking each TSS that will be added on each side
#' @param normTailW number of bases on each side whose counts will be used to normalize the data
#' @param smoothW radius of the window (in bp) that will be used to smooth the data, i.e. the total width of the
#' smoothing window will be twice that number
#' @param silent limit log messages
#' @return a list containing TSS enrichment data and a \code{ggplot} object containing TSS enrichment plot
#'
#' @rdname getTssEnrichment-DsATAC-method
#' @docType methods
#' @aliases getTssEnrichment
#' @aliases getTssEnrichment,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getTssEnrichment",
signature(
.object="DsATAC"
),
function(
.object,
sampleId,
tssGr=NULL,
flank=2000L,
normTailW=100L,
smoothW=25L,
silent=FALSE
) {
# default TSS set: coding-gene TSS coordinates from the genome annotation package
if (is.null(tssGr)){
annoPkg <- getChrAccRAnnotationPackage(.object@genome)
if (is.null(annoPkg)) logger.error("Annotation package needed")
tssGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="tssGr")
}
if (!all(width(tssGr)==1)) logger.error("tssGr must be a GRanges object in which each element has width=1")
#extend the window by the flanking and smoothing lengths
tssGr <- unique(trim(resize(tssGr, width=2*(flank+smoothW)+1, fix="center", ignore.strand=TRUE)))
# get (normalized) count data
tssCountDf <- aggregateRegionCounts(.object, tssGr, samples=sampleId, countAggrFun="mean", kmerBiasAdj=FALSE, normTailW=normTailW+smoothW, silent=silent)
# offset the position: aggregateRegionCounts returns positions in [0,regionWidth]
tssCountDf$pos <- tssCountDf$pos - (flank+smoothW+1)
# TSS enrichment: max normalized count within window
idx.main <- abs(tssCountDf[,"pos"])<=(flank-normTailW)
tsse <- max(tssCountDf[idx.main,"countNorm"], na.rm=TRUE)
tsse.s <- as.numeric(NA)
countCol <- "countNorm"
if (smoothW > 1){
# smoothing: take the mean normalized count in the corresponding window
# (moving average of width 2*smoothW+1 via convolution)
smoothedCounts <- convolve(tssCountDf[,"countNorm"], rep(1, 2*smoothW+1), type="filter")/(smoothW*2+1)
# NOTE(review): idx.main was computed on the extended data.frame, while
# smoothedCounts is shorter by 2*smoothW elements -- verify the alignment
tsse.s <- max(smoothedCounts[idx.main], na.rm=TRUE)
tssCountDf <- tssCountDf[abs(tssCountDf[,"pos"])<=flank,] # revert the extension by the smoothing window
tssCountDf[,"countNormSmoothed"] <- smoothedCounts
countCol <- "countNormSmoothed"
}
# plot: raw normalized counts as grey points, (smoothed) profile as red line
pp <- ggplot(tssCountDf) + aes_string(x="pos", y="countNorm") + geom_vline(xintercept=c(0), color="#4d4f53") + geom_point(color="#969696") +
geom_line(aes_string(x="pos", y=countCol), color="#8c1515", size=2)
res <- list(
countDf=tssCountDf,
tssEnrichment=tsse,
tssEnrichment.smoothed=tsse.s,
plot=pp
)
return(res)
}
)
################################################################################
# Single-cell/ high-sample number analyses
################################################################################
# declare the S4 generic only if it has not already been defined (e.g. by another package)
if (!isGeneric("getTssEnrichmentBatch")) {
setGeneric(
"getTssEnrichmentBatch",
function(.object, ...) standardGeneric("getTssEnrichmentBatch"),
signature=c(".object")
)
}
#' getTssEnrichmentBatch-methods
#'
#' Get TSS enrichment data for multiple samples
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param tssGr \code{GRanges} object containing TSS coordinates or NULL to get default set from annotation package
#' @param sampleIds sampleIds for which TSS enrichment should be computed
#' @param tssW size of the core TSS window
#' @param flank number of bases flanking each TSS that will be added on each side
#' @param normTailW number of bases on each side whose counts will be used to normalize the data
#' @param smoothW diameter of the window (in bp) that will be used to smooth the data
#' @return a list containing TSS enrichment data
#'
#' @rdname getTssEnrichmentBatch-DsATAC-method
#' @docType methods
#' @aliases getTssEnrichmentBatch
#' @aliases getTssEnrichmentBatch,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getTssEnrichmentBatch",
signature(
.object="DsATAC"
),
function(
.object,
tssGr=NULL,
sampleIds=getSamples(.object),
tssW=201L,
flank=2000L,
normTailW=200L,
smoothW=51L
) {
if (is.null(tssGr)){
annoPkg <- getChrAccRAnnotationPackage(.object@genome)
if (is.null(annoPkg)) logger.error("Annotation package needed")
tssGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="tssGr")
}
if (!all(width(tssGr)==1)) logger.error("tssGr must be a GRanges object in which each element has width=1")
#extend the window by the flanking lengths
L <- 2*flank+1
tsswGr <- suppressWarnings(trim(resize(tssGr, width=L, fix="center", ignore.strand=TRUE)))
nRegs <- length(tssGr)
if (length(tsswGr) != nRegs) logger.error("length error: tss window set")
nSamples <- length(sampleIds)
logger.status("Retrieving fragment data ...")
fGr <- getFragmentGrl(.object, sampleIds, asGRangesList=FALSE)
# logger.status("[DEBUG] ...(1)...")
nFrags <- sapply(fGr, length)
# subsample fragments if they exceed the threshold (to avoid long computation and long vector errors)
# doSubsample <- .Machine$integer.max/2-1
doSubsample <- 1e9
if (sum(nFrags) > doSubsample){
prop <- doSubsample/sum(nFrags)
logger.warning(c("Subsampling to", round(prop*100, 2), "% of fragments"))
nFrags <- floor(nFrags*prop)
fGr <- lapply(1:length(fGr), FUN=function(i){
fGr[[i]][sort(sample(1:length(fGr[[i]]), nFrags[i]))]
})
}
# logger.status("[DEBUG] ...(2)...")
fGr <- do.call("c", unname(fGr))
# logger.status("[DEBUG] ...(3)...")
# fGr <- unlist(fGr, use.names=FALSE)
elementMetadata(fGr)[,".sampleIdx"] <- rep(seq_along(nFrags), nFrags)
logger.status("Retrieving joined insertion sites ...")
igr <- getInsertionSitesFromFragmentGr(fGr)
rm(fGr) # clean-up
cleanMem() # TODO: can this be removed?
logger.status("computing overlaps with TSS regions ...")
oo <- findOverlaps(tsswGr, igr, ignore.strand=TRUE)
tssGro <- tssGr[queryHits(oo)]
iGro <- igr[subjectHits(oo)]
rm(igr) # clean-up
cleanMem() # TODO: can this be removed?
# dd <- -grSignedDistance(tssGro, iGro)
dd <- start(iGro) - start(tssGro)
negIdx <- as.logical(strand(tssGro)=="-")
dd[negIdx] <- -dd[negIdx]
sampleIdx <- elementMetadata(iGro)[,".sampleIdx"]
# tssCut <- floor(tssW/2)
# nTss <- tapply(seq_along(iGro), sampleIdx, FUN=function(idx){
# idx <- idx[abs(dd[idx]) <= tssCut]
# length(unique(queryHits(oo)[idx]))
# })
# bgCut <- flank - normTailW
logger.status("computing TSS profiles ...")
countV <- rep(0L, 2*flank+1)
names(countV) <- as.character(-flank:flank)
profileMat <- do.call("rbind", tapply(dd, sampleIdx, FUN=function(x){
posCounts <- table(x)
res <- countV
res[names(posCounts)] <- posCounts
res
}))
if (!all(rownames(profileMat)==as.character(1:nSamples))) {
logger.warning("could not retrieve profiles for all samples. NAs will be returned")
missingIdxc <- setdiff(as.character(1:nSamples), rownames(profileMat))
missM <- matrix(as.integer(NA), nrow=length(missingIdxc), ncol=ncol(profileMat))
rownames(missM) <- missingIdxc
colnames(missM) <- colnames(profileMat)
profileMat <- rbind(profileMat, missM)
profileMat <- profileMat[as.character(1:nSamples),]
}
if (ncol(profileMat)!=L) logger.error("Computing profile matrix failed: incorrect dimensions")
rownames(profileMat) <- sampleIds
colnames(profileMat) <- NULL
# profileMat <- profileMat + 1 # pseudo-counts on aggregate matrix
bgIdx <- c(1:normTailW, (L-normTailW+1):L) # indices of the regions to use as background
tssWidx <- (flank-floor(tssW/2)):(flank+floor(tssW/2))+1 # indices corresponding to window around TSS
logger.status("normalizing TSS profiles ...")
bgMeans <- rowMeans(profileMat[,bgIdx], na.rm=TRUE)
normFactors <- pmax(bgMeans, 0.5) # low count samples/cells (no effect if pseudocounts are added)
profileMatNorm <- profileMat/normFactors # divide rows by row normalization factor
profileMatSmoothed <- t(apply(profileMatNorm, 1, FUN=function(x){
zoo::rollmean(x, smoothW, fill=1)
# convolve(c(fillV, x, fillV), rep(1, smoothW), type="filter")/smoothW
}))
profileMatNorm[!is.finite(profileMatNorm)] <- NA
profileMatSmoothed[!is.finite(profileMatSmoothed)] <- NA
tsse <- matrixStats::rowMaxs(profileMatNorm[,tssWidx], na.rm=TRUE)
tsse[!is.finite(tsse)] <- NA
tsse.s <- matrixStats::rowMaxs(profileMatSmoothed[,tssWidx], na.rm=TRUE)
tsse.s[!is.finite(tsse.s)] <- NA
res <- list(
profileMatNorm=t(profileMatNorm),
profileMatSmoothed=t(profileMatSmoothed),
pos=(-flank):flank,
tssEnrichment=tsse,
tssEnrichment.smoothed=tsse.s
)
return(res)
}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for getQuickTssEnrichment.
# Guarded so that re-sourcing this file does not clobber an already-registered
# generic (and the methods attached to it).
if (!isGeneric("getQuickTssEnrichment")) {
setGeneric(
"getQuickTssEnrichment",
function(.object, ...) standardGeneric("getQuickTssEnrichment"),
signature=c(".object")
)
}
#' getQuickTssEnrichment-methods
#'
#' [Experimental] Quick, heuristic version of TSS enrichment to just get scores for each
#' sample in the dataset. Useful, e.g. for single cells.
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param tssGr \code{GRanges} object containing TSS coordinates or NULL to get default set from annotation package
#' @param sampleIds sampleIds for which TSS enrichment should be computed
#' @param tssW width (in bp) of the window considered around the TSS
#' @param distBg distance (in bp) from the TSS at which the two background windows are centered
#' @return a vector of TSS enrichment values for each sample/cell in the dataset
#'
#' @details
#' Computes TSS enrichment
#' as the ratio of total insertion sites at a window (of width \code{tssW} bp) directly at the TSS and 2 background regions
#' symmetrically located (\code{distBg} bp) upstream and downstream of the TSS
#'
#' @rdname getQuickTssEnrichment-DsATAC-method
#' @docType methods
#' @aliases getQuickTssEnrichment
#' @aliases getQuickTssEnrichment,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getQuickTssEnrichment",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		tssGr=NULL,
		sampleIds=getSamples(.object),
		tssW=201L,
		distBg=1900L
	) {
		# default TSS set: coding-gene TSSs from the genome's annotation package
		if (is.null(tssGr)){
			annoPkg <- getChrAccRAnnotationPackage(.object@genome)
			if (is.null(annoPkg)) logger.error("Annotation package needed")
			tssGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="tssGr")
		}
		if (!all(width(tssGr)==1)) logger.error("tssGr must be a GRanges object in which each element has width=1")
		# background windows: tssW-wide windows centered distBg bp up-/downstream of each TSS
		bgLeftGr <- suppressWarnings(trim(resize(GenomicRanges::shift(tssGr, -distBg), width=tssW, fix="center", ignore.strand=TRUE)))
		bgRightGr <- suppressWarnings(trim(resize(GenomicRanges::shift(tssGr, distBg), width=tssW, fix="center", ignore.strand=TRUE)))
		# foreground window directly at the TSS
		tsswGr <- suppressWarnings(trim(resize(tssGr, width=tssW, fix="center", ignore.strand=TRUE)))
		nRegs <- length(tssGr)
		if (length(tsswGr) != nRegs) logger.error("length error: tss window set")
		if (length(bgLeftGr) != nRegs) logger.error("length error: left background set")
		if (length(bgRightGr) != nRegs) logger.error("length error: right background set")
		nSamples <- length(sampleIds)
		logger.status("Retrieving joined insertion sites ...")
		fGr <- getFragmentGrl(.object, sampleIds, asGRangesList=FALSE)
		nFrags <- vapply(fGr, length, integer(1)) # vapply instead of sapply: type-stable
		fGr <- do.call("c", unname(fGr))
		# keep track of which sample each fragment belongs to
		elementMetadata(fGr)[,".sampleIdx"] <- rep(seq_along(nFrags), nFrags)
		igr <- getInsertionSitesFromFragmentGr(fGr)
		logger.status("computing overlaps with TSS regions ...")
		oo <- findOverlaps(tsswGr, igr, ignore.strand=TRUE)
		# convenient: when constructing sparse matrices, if (i,j) indices are repeated, the x-values are summed up
		cm_tss <- Matrix::sparseMatrix(
			i=queryHits(oo),
			j=elementMetadata(igr)[subjectHits(oo), ".sampleIdx"],
			x=rep(1, length(queryHits(oo))),
			dims=c(nRegs, nSamples)
		)
		colnames(cm_tss) <- sampleIds
		logger.status("computing overlaps with background flanking regions ...")
		ool <- findOverlaps(bgLeftGr, igr, ignore.strand=TRUE)
		oor <- findOverlaps(bgRightGr, igr, ignore.strand=TRUE)
		cm_bg <- Matrix::sparseMatrix(
			i=c(queryHits(ool), queryHits(oor)),
			j=elementMetadata(igr)[c(subjectHits(ool), subjectHits(oor)), ".sampleIdx"],
			x=rep(1, length(queryHits(ool))+length(queryHits(oor))),
			dims=c(nRegs, nSamples)
		)
		colnames(cm_bg) <- sampleIds
		# normalize counts per basepair and take the foreground/background ratio
		normCounts_tss <- Matrix::colSums(cm_tss, na.rm=TRUE) / tssW
		normCounts_bg <- Matrix::colSums(cm_bg, na.rm=TRUE) / (2*tssW) # 2 background regions per TSS
		tsse <- normCounts_tss / pmax(normCounts_bg, 1) # floor the background to avoid inflated ratios for sparse cells
		return(tsse)
	}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for getMonocleCellDataSet.
# Guarded so that re-sourcing this file does not clobber an already-registered
# generic (and the methods attached to it).
if (!isGeneric("getMonocleCellDataSet")) {
setGeneric(
"getMonocleCellDataSet",
function(.object, ...) standardGeneric("getMonocleCellDataSet"),
signature=c(".object")
)
}
#' getMonocleCellDataSet-methods
#'
#' Obtain \code{cell_data_set} object for analysis using the \code{monocle3} package
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regionType name of the region type to be exported
#' @param binarize should the counts be binarized
#' @return a \code{cell_data_set} object containing the counts for the specified region type
#'
#' @rdname getMonocleCellDataSet-DsATAC-method
#' @docType methods
#' @aliases getMonocleCellDataSet
#' @aliases getMonocleCellDataSet,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getMonocleCellDataSet",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		regionType,
		binarize=TRUE
	) {
		if (!is.element(regionType, getRegionTypes(.object))) logger.error(c("Unsupported region type:", regionType))
		regGr <- getCoord(.object, regionType)
		# region annotation in the site_name/chr/bp1/bp2 format expected by monocle3/cicero
		regDf <- data.frame(
			row.names = paste(seqnames(regGr),start(regGr),end(regGr),sep="_"),
			site_name = paste(seqnames(regGr),start(regGr),end(regGr),sep="_"),
			chr = gsub("chr", "", as.character(seqnames(regGr))),
			bp1 = start(regGr),
			bp2 = end(regGr)
		)
		names(regGr) <- regDf$site_name
		cm <- ChrAccR::getCounts(.object, regionType, allowSparseMatrix=TRUE)
		if (binarize){
			# check the class of the returned matrix rather than the sparseCounts
			# slot (consistent with getRBFGeneActivities): cm@x is only valid on an
			# actual sparse matrix, and the slot flag need not match what
			# getCounts returned
			if (is(cm, "sparseMatrix")){
				cm@x[cm@x > 0] <- 1 # only touch stored entries: keeps the matrix sparse
			} else {
				cm[cm > 0] <- 1
			}
		}
		cdsObj <- suppressWarnings(monocle3::new_cell_data_set(
			cm,
			cell_metadata = getSampleAnnot(.object),
			gene_metadata = regDf
		))
		# coerce feature annotation columns to the types downstream tools expect
		monocle3::fData(cdsObj)$site_name <- as.character(monocle3::fData(cdsObj)$site_name)
		monocle3::fData(cdsObj)$chr <- as.character(monocle3::fData(cdsObj)$chr)
		monocle3::fData(cdsObj)$bp1 <- as.numeric(monocle3::fData(cdsObj)$bp1)
		monocle3::fData(cdsObj)$bp2 <- as.numeric(monocle3::fData(cdsObj)$bp2)
		cdsObj <- monocle3::detect_genes(cdsObj)
		cdsObj <- monocle3::estimate_size_factors(cdsObj)
		return(cdsObj)
	}
)
#-------------------------------------------------------------------------------
# Register the S4 generic for getCiceroGeneActivities.
# Guarded so that re-sourcing this file does not clobber an already-registered
# generic (and the methods attached to it).
if (!isGeneric("getCiceroGeneActivities")) {
setGeneric(
"getCiceroGeneActivities",
function(.object, ...) standardGeneric("getCiceroGeneActivities"),
signature=c(".object")
)
}
#' getCiceroGeneActivities-methods
#'
#' Obtain Cicero gene activities
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regionType region type of regions that will be linked to the promoter (typically some sort of peak annotation)
#' @param promoterGr \code{GRanges} object of promoter coordinates
#' @param maxDist maximum distance to consider for region-region interactions
#' @param corCutOff cutoff of correlation coefficients (Pearson) to consider for region-region interactions
#' @param dimRedCoord matrix of reduced dimension coordinates. must have coordinates for all samples/cells in the dataset
#' @param knn.k parameter k for Cicero's k-nearest-neighbor method
#' @return a \code{SummarizedExperiment} object containing gene activities for all cells/samples in the dataset
#'
#' @rdname getCiceroGeneActivities-DsATAC-method
#' @docType methods
#' @aliases getCiceroGeneActivities
#' @aliases getCiceroGeneActivities,DsATAC-method
#' @author Fabian Mueller
#' @export
setMethod("getCiceroGeneActivities",
signature(
.object="DsATAC"
),
function(
.object,
regionType,
promoterGr=NULL,
maxDist=250000L,
corCutOff=0.35,
dimRedCoord=NULL,
knn.k=50
) {
if (!is.element(regionType, getRegionTypes(.object))) logger.error(c("Unsupported region type:", regionType))
# default promoter set: coding-gene promoters from the genome's annotation package
if (is.null(promoterGr)){
annoPkg <- getChrAccRAnnotationPackage(.object@genome)
if (is.null(annoPkg)) logger.error("Annotation package needed")
promoterGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="promoterGr")
}
# promoter names serve as gene identifiers downstream; fall back to the
# 'gene_name' metadata column when no names are set
if (is.null(names(promoterGr))){
emd <- elementMetadata(promoterGr)
if (is.element("gene_name", colnames(emd))){
names(promoterGr) <- emd[,"gene_name"]
} else {
logger.error("promoterGr must have names")
}
}
logger.start("Creating Cicero object")
# binarized accessibility counts wrapped in a monocle3 cell_data_set
cdsObj <- getMonocleCellDataSet(.object, regionType, binarize=TRUE)
if (is.null(dimRedCoord)){
logger.error("Automatic dimension reduction not implemented yet. Must supply dimension reduction coordinates")
} else {
if (!all(colnames(cdsObj) %in% rownames(dimRedCoord))) logger.error("all samples/cells must be contained in dimRedCoord object (which must have valid rownames)")
# aggregate cells into k-nearest-neighbor groups (custom replacement for
# cicero::make_cicero_cds, see commented call below)
# ciceroObj <- cicero::make_cicero_cds(cdsObj, k=knn.k, reduced_coordinates=dimRedCoord[colnames(cdsObj),])
ciceroObj <- custom_cicero_cds(cdsObj, k=knn.k, reduced_coordinates=dimRedCoord[colnames(cdsObj),], silent=TRUE)
}
logger.completed()
logger.start("Computing grouped correlations")
gr <- getCoord(.object, regionType) # should be in the same order as fData(ciceroObj)
names(gr) <- monocle3::fData(ciceroObj)$site_name
# candidate region pairs: all pairs whose centers lie within maxDist of each other
oo <- suppressWarnings(as.matrix(findOverlaps(resize(resize(gr, 1, "center"), 2*maxDist + 1, "center"), resize(gr, 1, "center"), ignore.strand=TRUE)))
# canonicalize each pair as (i=min, j=max), then drop duplicates and self-pairs
oo <- data.frame(i=matrixStats::rowMins(oo), j=matrixStats::rowMaxs(oo))
oo <- oo[!duplicated(oo) & oo[,"i"]!=oo[,"j"],]
# fast(ish) correlations
X <- monocle3::exprs(ciceroObj)
rownames(X) <- NULL
colnames(X) <- NULL
mu <- safeMatrixStats(X, "rowMeans", na.rm=TRUE)
logger.start("Computing Pearson correlation coefficients")
# chunked processing: the dense centered submatrices below would be too large
# to materialize for all pairs at once
chkSize <- 1e5
chkStarts <- seq(1, nrow(oo), by=chkSize)
chkEnds <- nrow(oo)
nChunks <- length(chkStarts)
if (nChunks > 1) chkEnds <- c(chkStarts[2:length(chkStarts)]-1, chkEnds)
cc <- do.call("c", lapply(1:nChunks, FUN=function(i){
logger.status(c("Chunk", i, "of", nChunks))
oo_cur <- oo[chkStarts[i]:chkEnds[i], ]
# centered submatrices
Xc1 <- as.matrix(X[oo_cur[,1],,drop=FALSE] - mu[oo_cur[,1]])
Xc2 <- as.matrix(X[oo_cur[,2],,drop=FALSE] - mu[oo_cur[,2]])
# row-wise Pearson correlation: sum(xc*yc) / (||xc|| * ||yc||) on centered rows
return(rowSums(Xc1*Xc2, na.rm=TRUE) / (sqrt(rowSums(Xc1^2, na.rm=TRUE)) * sqrt(rowSums(Xc2^2, na.rm=TRUE))))
}))
logger.completed()
logger.completed()
logger.start("Computing gene activities")
# connection table in the Peak1/Peak2/coaccess format expected by cicero
conns <- data.frame(
Peak1 = names(gr[oo[,"i"]]),
Peak2 = names(gr[oo[,"j"]]),
coaccess = cc,
stringsAsFactors = FALSE
)
promoterDf <- data.frame(chromosome=seqnames(promoterGr),start=start(promoterGr),end=end(promoterGr), gene=names(promoterGr))
cdsObj <- cicero::annotate_cds_by_site(cdsObj, promoterDf)
# per-cell totals passed to cicero's normalization
nSites <- safeMatrixStats(monocle3::exprs(cdsObj), "colSums")
names(nSites) <- row.names(monocle3::pData(cdsObj))
unnorm_ga <- cicero::build_gene_activity_matrix(cdsObj, conns, coaccess_cutoff=corCutOff)
# cells with all-zero gene activity are excluded from normalization and
# filled with zeros in the result (all-zero sparse matrix scaffold below)
naCells <- safeMatrixStats(unnorm_ga, "colSums")==0
if (sum(naCells) > 0) {
logger.warning(c("detected", sum(naCells), "cells/samples with no gene activities"))
ciceroGA <- Matrix::sparseMatrix(i=c(), j=c(), x=1, dims=c(nrow(unnorm_ga),ncol(unnorm_ga)), dimnames=list(rownames(unnorm_ga), colnames(unnorm_ga)))
rr <- cicero::normalize_gene_activities(unnorm_ga[,!naCells], nSites[!naCells])
ciceroGA[,colnames(rr)] <- rr
} else {
ciceroGA <- cicero::normalize_gene_activities(unnorm_ga, nSites)
}
logger.completed()
# package results: gene activity matrix with promoter coordinates as row ranges
seCicero <- SummarizedExperiment::SummarizedExperiment(
assays = S4Vectors::SimpleList(gA = ciceroGA),
rowRanges = promoterGr[rownames(ciceroGA)],
colData = getSampleAnnot(.object)
)
S4Vectors::metadata(seCicero)$method <- "cicero"
S4Vectors::metadata(seCicero)$params <- list(
maxDist = maxDist,
corCutOff = corCutOff,
knn.k = knn.k
)
return(seCicero)
}
)
# Register the S4 generic for getRBFGeneActivities.
# Guarded so that re-sourcing this file does not clobber an already-registered
# generic (and the methods attached to it).
if (!isGeneric("getRBFGeneActivities")) {
setGeneric(
"getRBFGeneActivities",
function(.object, ...) standardGeneric("getRBFGeneActivities"),
signature=c(".object")
)
}
#' getRBFGeneActivities-methods
#'
#' [EXPERIMENTAL] Obtain gene activities by weighting counts using a Gaussian radial basis function (RBF)
#'
#' @param .object \code{\linkS4class{DsATAC}} object
#' @param regionType region type of regions that will be linked to the promoter (typically some sort of peak annotation)
#' @param tssGr \code{GRanges} object of TSS coordinates or gene body coordinates
#' @param maxDist maximum distance to consider for associating a region to a TSS
#' @param sigma decay parameter of the radial basis function (shape parameter (epsilon) of a gaussian RBF= 1/(sqrt(2)*sigma))
#' @param minWeight weight asymptote, i.e. the asymptotic minimum of the RBF. Must be in [0, 1).
#' @param binarize binarize counts before weighting
#' @param useGeneBody include gene bodies in the analysis.
#' I.e. counts within the gene body will receive full weight
#' instead of downweighting starting from the TSS
#' @param normCounts normalize gene activity "counts". Can be \code{"none"} (no normalization),
#' \code{"total"} (normalize by total counts per cell) or
#' \code{"weighted"} (default; normalize by distance-weighted counts per cell)
#' @return a \code{SummarizedExperiment} object containing gene activities for all cells/samples in the dataset
#'
#' @rdname getRBFGeneActivities-DsATAC-method
#' @docType methods
#' @aliases getRBFGeneActivities
#' @aliases getRBFGeneActivities,DsATAC-method
#' @author Fabian Mueller
#' @export
#' @noRd
setMethod("getRBFGeneActivities",
	signature(
		.object="DsATAC"
	),
	function(
		.object,
		regionType,
		tssGr=NULL,
		maxDist=100000L,
		sigma=10000,
		minWeight=0.25,
		binarize=FALSE,
		useGeneBody=FALSE,
		normCounts="weighted"
	) {
		if (!is.element(regionType, getRegionTypes(.object))) logger.error(c("Unsupported region type:", regionType))
		if (!is.element(normCounts, c("none", "total", "weighted"))) logger.error("Invalid normCounts argument")
		# log(1 - minWeight) below is undefined for minWeight >= 1
		if (minWeight < 0 || minWeight >= 1) logger.error("minWeight must be in [0, 1)")
		# default gene annotation: gene bodies or TSSs from the genome's annotation package
		if (is.null(tssGr)){
			annoPkg <- getChrAccRAnnotationPackage(.object@genome)
			if (is.null(annoPkg)) logger.error("Annotation package needed")
			if (useGeneBody){
				tssGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="geneGr")
			} else {
				tssGr <- get("getGeneAnnotation", asNamespace(annoPkg))(anno="gencode_coding", type="tssGr")
			}
		}
		# gene names are required as row identifiers; fall back to the
		# 'gene_name' metadata column when no names are set
		if (is.null(names(tssGr))){
			emd <- elementMetadata(tssGr)
			if (is.element("gene_name", colnames(emd))){
				names(tssGr) <- emd[,"gene_name"]
			} else {
				logger.error("tssGr must have names")
			}
		}
		logger.start("Computing RBF weights")
		rGr <- getCoord(.object, regionType) # region coords
		tz <- -1 # sentinel for true zero distances (explicit zeros would be dropped from the sparse vector)
		hassym <- 0
		if (minWeight > 0) hassym <- log(1 - minWeight) # rescale so that weight(0)=1 despite the minWeight offset
		# one sparse weight column per gene; seq_along avoids 1:length(tssGr)
		# which misbehaves (1:0 == c(1,0)) for an empty tssGr
		weightM <- lapply(seq_along(tssGr), FUN=function(i){
			rr <- GenomicRanges::distance(tssGr[i], rGr, ignore.strand=TRUE)
			rr[rr==0] <- tz # true zero distances (regions overlapping TSS)
			rr[is.na(rr) | abs(rr)>maxDist] <- 0 # 0 == "no association": dropped by the sparse representation
			rr <- as(rr, "sparseMatrix")
			idxTz <- rr@x==tz # which entries are true zeroes
			rr@x <- exp(-rr@x^2/(2*sigma^2) + hassym) + minWeight # RBF weights: (1-minWeight)*exp(-d^2/(2*sigma^2)) + minWeight
			rr@x[idxTz] <- 1 # reset true zero weights
			return(rr)
		})
		weightM <- t(do.call("cbind", weightM)) # genes X regions
		rownames(weightM) <- names(tssGr)
		logger.completed()
		logger.start("Computing activity scores")
		cm <- ChrAccR::getCounts(.object, regionType, allowSparseMatrix=TRUE)
		if (!is(cm, 'sparseMatrix')) cm <- as(cm, "sparseMatrix")
		if (binarize) cm@x[cm@x > 0] <- 1
		gaM <- as.matrix(weightM %*% cm) # multiply as sparse matrices: much faster
		scaleFac <- rep(1, ncol(gaM))
		if (normCounts == "total"){
			scaleFac <- 1/Matrix::colSums(cm, na.rm=TRUE) # normalize by total counts
		} else if (normCounts == "weighted"){
			scaleFac <- 1/colSums(gaM, na.rm=TRUE) # normalize by weighted counts
		}
		gaM <- t(t(gaM) * scaleFac) # R operates column-wise while reusing the scale factors
		se <- SummarizedExperiment::SummarizedExperiment(
			assays = S4Vectors::SimpleList(gA = gaM),
			rowRanges = tssGr,
			colData = getSampleAnnot(.object)
		)
		S4Vectors::metadata(se)$method <- "RBF"
		S4Vectors::metadata(se)$params <- list(
			maxDist = maxDist,
			sigma = sigma,
			minWeight = minWeight,
			binarize = binarize
		)
		logger.completed()
		return(se)
	}
)
# NOTE(review): removed stray website-embedding boilerplate ("Add the following
# code to your website ... Embedding Snippets.") -- scraping residue that was
# not valid R code and broke parsing of this file.