# PACKAGE DOCUMENTATION
#' Coordinate Covariation Analysis (COCOA)
#'
#'
#' COCOA is a method for understanding epigenetic variation among samples.
#' COCOA can be used with epigenetic data that includes
#' genomic coordinates and an epigenetic signal,
#' such as DNA methylation and chromatin accessibility
#' data.
#' To describe the method on a high level, COCOA quantifies
#' inter-sample variation with either a supervised or unsupervised
#' technique then uses a database of "region sets" to
#' annotate the variation among samples. A region set is a set of
#' genomic regions that share a biological annotation,
#' for instance transcription factor (TF) binding regions,
#' histone modification regions, or open chromatin regions.
#' COCOA can identify region sets that are associated with
#' epigenetic variation between samples and
#' increase understanding of variation in your data.
#
# In contrast to some other common techniques, COCOA offers both
# supervised (known groups/phenotype) and unsupervised (no known groups/
# phenotype) analyses. Also, COCOA focuses on continuous variation
# between samples instead of having cutoffs. Because of this, COCOA can
# be used as a complementary method alongside "differential" methods
# that find discrete differences between groups of samples and
# it can also be used in situations where there are no groups.
# COCOA can identify biologically meaningful
# sources of variation between samples and
#'
#' @docType package
#' @name COCOA
#' @author John Lawson
#' @author Nathan Sheffield
#'
#' @references \url{http://github.com/databio}
#' @importFrom ggplot2 ggplot aes facet_wrap geom_boxplot geom_jitter geom_line
#' theme_classic xlab ylab geom_hline ylim scale_color_discrete
#' scale_x_discrete scale_fill_brewer scale_color_manual
#' scale_color_brewer theme element_line element_text geom_point
#' @importFrom ComplexHeatmap Heatmap draw
#' @import BiocGenerics S4Vectors IRanges GenomicRanges simpleCache
#' @importFrom data.table ":=" setDT data.table setkey fread setnames
#' setcolorder rbindlist setattr setorder copy is.data.table
#' setorderv as.data.table
#' @importFrom Biobase sampleNames
#' @importFrom stats lm coefficients poly wilcox.test ecdf pgamma p.adjust
#' @importFrom methods is
#' @importFrom MIRA binRegion
#' @importFrom tidyr gather
#' @importFrom grid grid.newpage grid.grabExpr grid.draw popViewport
#' pushViewport unit viewport
#' @importFrom grDevices dev.off
#' @importFrom methods hasArg
#' @importFrom fitdistrplus fitdist
#' @importFrom simpleCache simpleCache
NULL
# @importFrom ppcor pcor.test
# now package lists GenomicRanges in "Depends" instead of "Imports" in
# DESCRIPTION, still import package with @import though
# @importFrom GenomicRanges GRanges GRangesList elementMetadata strand
# seqnames granges
# Because of some issues,
# (see here: http://stackoverflow.com/questions/9439256/)
# I have to register stuff used in data.table as non-standard evaluation,
# in order to pass some R CMD check NOTES.
if (getRversion() >= "2.15.1") {
utils::globalVariables(c(
".", "..calcCols", "bin", "binID", "chr", "id", "colsToAnnotate",
"coordinateDT",
"coverage", "Group", "pOlap", "regionGroupID", "regionID", "theme",
"meanRegionSize", "regionSetCoverage", "rowIndex", "rsIndex",
"rsRegionID", "totalRegionNumber", "signalCoverage", ".SD",
"sumProportionOverlap"))
}
#########################################################################
#' Score a region set using feature contribution scores
#'
#' First, this function identifies which epigenetic features
#' overlap the region set.
#' Then the region set is scored using the feature contribution scores
#' (`signal` input)
#' according to the `scoringMetric` parameter.
#'
#' @template signal
#' @template signalCoord
#' @template signalCoordType
#' @templateVar refGenomeWarning TRUE
#' @template regionSet
#' @template signalCol
#' @template scoringMetric
#' @template verbose
# Useful when using
# aggregateSignal with 'apply' to do many region sets at a time.
# @param wilcox.conf.int logical. Only applies when using "rankSum" scoring
# method. returns a 95% confidence interval from the Wilcoxon rank sum test
# instead of p value.
#' @template absVal
#' @template rsOL
#' @param pOlap Numeric vector. Only used if rsOL is given and scoringMetric
#' is "proportionWeightedMean". This vector should contain the proportion of
#' each regionSet region that is overlapped by a signalCoord region. The
#' order of pOlap should be the same as the overlaps in rsOL.
#' @template returnCovInfo
#' @template checkInput
#' @return a data.frame with various columns:
#' One column for each item of signalCol, representing scores for the region set for each signalCol.
#' Columns: signalCoverage (cytosine_coverage) shows the number of epigenetic features that overlapped with the region set.
#' regionSetCoverage: indicates the number of regions from regionSet that overlapped with any of the epigenetic features.
#' totalRegionNumber: represents the number of regions in regionSet.
#' meanRegionSize: gives the average size (in base pairs) of regions in regionSet.
#'
#' For "multiBase" data using the "proportionWeightedMean" scoring metric,
#' the output will also include a "sumProportionOverlap" column.
#' This column represents the sum of proportion overlaps between
#' each signalCoord region and overlapping regionSet region,
#' providing an additional measure of regionSet coverage.
#'
#' @examples
#' data("brcaATACCoord1")
#' data("brcaATACData1")
#' data("esr1_chr1")
#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation
#' rsScores <- aggregateSignal(signal=featureContributionScores,
#' signalCoord=brcaATACCoord1,
#' regionSet=esr1_chr1,
#' signalCol=c("PC1", "PC2"),
#' scoringMetric="default")
#' @export
aggregateSignal <- function(signal,
signalCoord,
regionSet,
signalCol = c("PC1", "PC2"),
signalCoordType = "default",
scoringMetric = "default",
verbose = FALSE,
absVal=TRUE,
rsOL=NULL, pOlap=NULL,
returnCovInfo=TRUE,
.checkInput=FALSE) {
################### checking inputs #################################
# if it was already checked outside this function, don't need to re-check
if (.checkInput) {
########## check that inputs are the correct class
checkConvertInputClasses(signal=signal,
signalCoord=signalCoord,
regionSet=regionSet,
signalCol=signalCol,
rsOL=rsOL)
########## check that dimensions of inputs are consistent
# length of signal coord = nrow of signal
if (length(signalCoord) != nrow(signal)) {
stop(cleanws("The number of coordinates in
signalCoord (length(signalCoord)) does not equal the number of
rows in `signal`"))
}
######### check that appropriate columns are present
# signalCol are column names of signal
if (!all(signalCol %in% colnames(signal))) {
missingCols = signalCol[!(signalCol %in% colnames(signal))]
stop(cleanws(paste0("Some signalCol are not
columns of signal: ", missingCols)))
}
######## check that scoringMetric is appropriate
if (!(scoringMetric %in% getScoringMethods("both"))) {
stop(cleanws("scoringMetric was not recognized.
Check spelling and available options."))
}
###### check that signalCoordType is appropriate
if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
#######
# what happens if there are NAs or Inf in `signal`?
# any NAs that overlap the regionSet will cause the score to be NA
if (is(signal, "data.table")) {
naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
} else {
naRows = apply(X = signal[, signalCol, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
}
if (any(naRows)) {
signal <- signal[!naRows, ]
signalCoord <- signalCoord[!naRows]
warning("Removing rows with NA from `signal`")
}
#################################################################
# detect signalCoordType
if (signalCoordType == "default") {
# when signalCoord is a GRanges object
if (any(start(signalCoord) != end(signalCoord))) {
signalCoordType <- "multiBase"
} else {
signalCoordType <- "singleBase"
}
}
# if "default" scoring method is given, choose based on signalCoordType
if (scoringMetric == "default") {
if (signalCoordType == "singleBase") {
scoringMetric <- "regionMean"
} else if (signalCoordType == "multiBase") {
scoringMetric <- "proportionWeightedMean"
} else {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
}
# make sure that scoringMetric is consistent with signalCoordType
if (signalCoordType == "singleBase") {
if (!(scoringMetric %in% getScoringMethods("singleBase"))) {
stop("The scoringMetric you selected is not available for
this data's signalCoordType")
}
} else if (signalCoordType == "multiBase") {
if (!(scoringMetric %in% getScoringMethods("multiBase"))) {
stop("The scoringMetric you selected is not available for
this data's signalCoordType")
}
}
}
# extreme positive or negative values both give important information
# take absolute value or not
if (absVal) {
signal <- abs(signal) # required for later code
}
# XX copies unnecessarily:reformat into data.table with chromosome location and weight
# make sure `signal` is the correct type for further steps
# (proportionWeightedMean method requires a matrix)
if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) {
signal <- as.data.table(signal)
} else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) {
signal <- as.matrix(signal)
}
# restricting to signalCol so unnecessary computations
# are not done
if (is(signal, "data.table")) {
loadingDT <- signal[, signalCol, with=FALSE]
# # naming does not work if only using one PC so add this line for that case
# setnames(loadiangDT, signalCol)
} else {
loadingDT <- signal[, signalCol, drop=FALSE]
}
###########################################################################
# scoring
# would rounding speed up aggregation?, potentially make a sparse matrix
# if a lot of entries became 0
# works for both singleBase and multiBase
if (scoringMetric == "simpleMean") {
loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT,
signalGR = signalCoord,
regionSet = regionSet,
calcCols= signalCol,
metric = "mean",
rsOL = rsOL,
returnCovInfo=returnCovInfo))
results <- .formatResults(loadAgMain, scoringMetric = scoringMetric,
regionSet=regionSet, signalCol = signalCol,
returnCovInfo=returnCovInfo)
} else if (scoringMetric == "simpleMedian") {
# scoring singleBase and multiBase both with this function for
# simpleMedian
loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT,
signalGR = signalCoord,
regionSet = regionSet,
calcCols= signalCol,
metric = "median",
rsOL=rsOL,
returnCovInfo = returnCovInfo))
results <- .formatResults(loadAgMain, scoringMetric = scoringMetric,
regionSet=regionSet, signalCol = signalCol,
returnCovInfo=returnCovInfo)
} else if (signalCoordType == "singleBase") {
# do the actual aggregation
if (scoringMetric == "regionMean") {
# specify aggregation operation
# will be done separately for each PC specified
aggrCommand <- buildJ(signalCol,
rep("mean", length(signalCol)))
# previously used BSAggregate from RGenomeUtils but now using local,
# modified copy
loadAgMain <- BSAggregate(BSDT = loadingDT,
regionsGRL = regionSet,
BSCoord = signalCoord,
jExpr = aggrCommand,
byRegionGroup = TRUE,
splitFactor = NULL,
returnOLInfo = returnCovInfo,
rsOL=rsOL)
results <- .formatResults(loadAgMain,
scoringMetric = scoringMetric,
regionSet=regionSet, signalCol = signalCol,
returnCovInfo=returnCovInfo)
} else if (scoringMetric == "regionMedian") {
aggrCommand <- buildJ(signalCol,
rep("median", length(signalCol)))
loadAgMain <- BSAggregate(BSDT = loadingDT,
regionsGRL = regionSet,
BSCoord = signalCoord,
jExpr = aggrCommand,
byRegionGroup = TRUE,
splitFactor = NULL,
returnOLInfo = returnCovInfo)
results <- .formatResults(loadAgMain,
scoringMetric = scoringMetric,
regionSet=regionSet, signalCol = signalCol,
returnCovInfo=returnCovInfo)
}
# } else if (scoringMetric == "meanDiff") {
# # if (is.null(pcLoadAv)) {
# # # calculate (should already be absolute)
# # pcLoadAv <- apply(X = loadingDT[, signalCol, with=FALSE],
# # MARGIN = 2, FUN = mean)
# # }
# loadMetrics <- signalOLMetrics(dataDT=loadingDT, regionSet=regionSet,
# signalCol = signalCol,
# metrics=c("mean", "sd"),
# alsoNonOLMet=TRUE)
# if (is.null(loadMetrics)) {
# results <- as.data.table(t(rep(NA, length(signalCol))))
# setnames(results, signalCol)
# results[, signalCoverage := 0]
# results[, regionSetCoverage := 0]
# results[, totalRegionNumber := numOfRegions]
# results[, meanRegionSize := round(mean(width(regionSet)), 1)]
# } else {
# # calculate mean difference
# # pooled standard deviation
# sdPool <- sqrt((loadMetrics$sd_OL^2 + loadMetrics$sd_nonOL^2) / 2)
#
# # mean difference
# # error if signalCoverage > (1/2) * totalCpGs
# meanDiff <- (loadMetrics$mean_OL - loadMetrics$mean_nonOL) /
# (sdPool * sqrt((1 / loadMetrics$signalCoverage) - (1 / (totalCpGs - loadMetrics$signalCoverage))))
# results <- as.data.table(t(meanDiff))
# colnames(results) <- loadMetrics$testCol
#
# # add information about degree of overlap
# results <- cbind(results, loadMetrics[1, .SD, .SDcols = c("signalCoverage", "regionSetCoverage", "totalRegionNumber", "meanRegionSize")])
# }
#
# } else if (scoringMetric == "rankSum") {
#
# # if (wilcox.conf.int) {
# # # returns confidence interval
# # wRes <- rsWilcox(dataDT = loadingDT, regionSet=regionSet,
# # signalCol = signalCol,
# # conf.int = wilcox.conf.int)
# #
# # if (is.null(wRes)) {
# # results <- as.data.table(t(rep(NA, length(signalCol) * 2)))
# # setnames(results, paste0(rep(signalCol, each=2),
# # c("_low", "_high")))
# # results[, signalCoverage := 0]
# # results[, regionSetCoverage := 0]
# # results[, totalRegionNumber := numOfRegions]
# # results[, meanRegionSize := round(mean(width(regionSet)), 1)]
# # } else {
# # results <- as.data.table(wRes)
# # }
# #
# # } else {
# # returns p value
# # one sided test since I took the absolute value of the loadings
# wRes <- rsWilcox(dataDT = loadingDT, regionSet=regionSet,
# signalCol = signalCol, alternative="greater")
#
# if (is.null(wRes)) {
# results <- as.data.table(t(rep(NA, length(signalCol))))
# setnames(results, signalCol)
# results[, signalCoverage := 0]
# results[, regionSetCoverage := 0]
# results[, totalRegionNumber := numOfRegions]
# results[, meanRegionSize := round(mean(width(regionSet)), 1)]
# } else {
# results <- as.data.table(wRes)
# }
# #}
} else {
# signalCoordType == "multiBase"
# for ATAC-seq
if (scoringMetric == "proportionWeightedMean") {
loadAgMain <- regionOLWeightedMean(signalMat = loadingDT,
signalGR = signalCoord,
regionSet = regionSet,
calcCols= signalCol,
rsOL = rsOL,
pOlap = pOlap,
returnCovInfo=returnCovInfo)
results <- .formatResults(as.data.table(loadAgMain),
scoringMetric = scoringMetric,
regionSet=regionSet, signalCol = signalCol,
returnCovInfo=returnCovInfo)
}
}
# signalOLMetrics() # make sure it works with no overlap
if (verbose) {
message(":", appendLF=FALSE)
}
return(as.data.frame(results))
}
# format results of scoring functions in aggregateSignal()
.formatResults <- function(loadAgMain, scoringMetric,
regionSet, signalCol, returnCovInfo=TRUE) {
numOfRegions <- length(regionSet)
# if no cytosines from loadings were included in regionSet, result is NA
if (is.null(loadAgMain)) {
results <- as.data.table(t(rep(NA, length(signalCol))))
setnames(results, signalCol)
if (returnCovInfo) {
results[, signalCoverage := 0]
results[, regionSetCoverage := 0]
results[, totalRegionNumber := numOfRegions]
results[, meanRegionSize := round(mean(width(regionSet)), 1)]
# this column is only added by this scoring method
if (scoringMetric == "proportionWeightedMean") {
results[, sumProportionOverlap := 0]
}
}
} else {
# regionMean, regionMedian, simpleMean for region data
results <- loadAgMain[, .SD, .SDcols = signalCol]
if (returnCovInfo) {
results[, signalCoverage := loadAgMain[, .SD, .SDcols = "signalCoverage"]]
results[, regionSetCoverage := loadAgMain[, .SD, .SDcols = "regionSetCoverage"]]
results[, totalRegionNumber := numOfRegions]
results[, meanRegionSize := round(mean(width(regionSet)), 1)]
################################
# proportionWeightedMean
# this column is only added by this scoring method
if (scoringMetric == "proportionWeightedMean") {
results[, sumProportionOverlap := loadAgMain[, .SD, .SDcols = "sumProportionOverlap"]]
}
}
##############################
# simple mean for base pair data
}
return(results)
}
#' Score many region sets
#'
#' This function will give each region set a score for each target variable
#' given by `signalCol` based on
#' the `scoringMetric` parameter. Based on these scores, you can determine
#' which region sets out of a region set database (given by `GRList`)
#' are most associated with the target variables. See the vignette "Introduction
#' to Coordinate Covariation Analysis" for help interpreting your
#' results.
#'
#' @template signal
#' @template signalCoord
#' @template signalCoordType
#' @template GRList
#' @template signalCol
#' @template scoringMetric
#' @template verbose
# @param wilcox.conf.int logical. Only applies when using "rankSum" scoring
# method. returns a 95% confidence interval from the Wilcoxon rank sum test
# instead of p value.
#' @template absVal
#' @template olList
#' @template pOlapList
#' @template returnCovInfo
#' @return Data.frame of results, one row for each region set.
#' It has the following columns:
#' one column for each item of signalCol with names given by signalCol.
#' These columns have scores for the region set for each signalCol.
#' Other columns: signalCoverage (formerly cytosine_coverage) which
#' has number of epigenetic features that overlapped at all with regionSet,
#' regionSetCoverage which has number of regions from regionSet
#' that overlapped any of the epigenetic features,
#' totalRegionNumber that has number of regions in regionSet,
#' meanRegionSize that has average size in base pairs of regions in regionSet,
#' the average is based on all regions in regionSet and not just ones that overlap.
#'
#' For "multiBase" data, if the "proportionWeightedMean" scoring metric
#' is used, then the output will also have a "sumProportionOverlap" column.
#' During this scoring method, the proportion overlap between each signalCoord
#' region and overlapping regionSet region is calculated. This column is
#' the sum of all those proportion overlaps and is another way to quantify
#' coverage of regionSet in addition to regionSetCoverage.
#'
#'
#' @examples
#' data("brcaATACCoord1")
#' data("brcaATACData1")
#' data("esr1_chr1")
#' data("nrf1_chr1")
#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation
#' GRList <- GRangesList(esr1_chr1, nrf1_chr1)
#' rsScores <- aggregateSignalGRList(signal=featureContributionScores,
#' signalCoord=brcaATACCoord1,
#' GRList= GRList,
#' signalCol=c("PC1", "PC2"),
#' scoringMetric="default")
#'
#' @export
aggregateSignalGRList <- function(signal,
signalCoord,
GRList,
signalCol = c("PC1", "PC2"),
signalCoordType = "default",
scoringMetric = "default",
verbose = TRUE,
absVal=TRUE, olList=NULL,
pOlapList=NULL, returnCovInfo=TRUE) {
################### checking inputs #################################
########## check that inputs are the correct class
checkConvertInputClasses(signal=signal,
signalCoord=signalCoord,
regionSet=NULL,
signalCol = signalCol,
GRList=GRList,
olList=olList)
########## check that dimensions of inputs are consistent
# length of signal coord = nrow of signal
if (length(signalCoord) != nrow(signal)) {
stop(cleanws("The number of coordinates in
signalCoord (length(signalCoord)) does not equal the number of
rows in `signal`"))
}
######### check that appropriate columns are present
# signalCol are column names of signal
if (!all(signalCol %in% colnames(signal))) {
missingCols = signalCol[!(signalCol %in% colnames(signal))]
stop(cleanws(paste0("Some signalCol are not
columns of signal: ", missingCols)))
}
######## check that scoringMetric is appropriate
if (!(scoringMetric %in% getScoringMethods("both"))) {
stop(cleanws("scoringMetric was not recognized.
Check spelling and available options."))
}
###### check that signalCoordType is appropriate
if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
#######
# what happens if there are NAs or Inf in `signal`?
# any NAs that overlap the regionSet will cause the score to be NA
if (is(signal, "data.table")) {
naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
} else {
naRows = apply(X = signal[, signalCol, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
}
if (any(naRows)) {
signal <- signal[!naRows, ]
signalCoord <- signalCoord[!naRows]
warning("Removing rows with NA from `signal`")
}
#################################################################
# detect signalCoordType
if (signalCoordType == "default") {
# when signalCoord is a GRanges object
if (any(start(signalCoord) != end(signalCoord))) {
signalCoordType <- "multiBase"
} else {
signalCoordType <- "singleBase"
}
}
# if "default" scoring method is given, choose based on signalCoordType
if (scoringMetric == "default") {
if (signalCoordType == "singleBase") {
scoringMetric <- "regionMean"
} else if (signalCoordType == "multiBase") {
scoringMetric <- "proportionWeightedMean"
} else {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
}
# convert object class outside aggregateSignal to extra prevent copying
# (one scoring method needs `signal` as a matrix though)
if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) {
signal <- as.data.table(signal)
} else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) {
signal <- as.matrix(signal)
}
# take absolute value outside aggregateSignal to prevent extra copying
if (absVal) {
if (is(signal, "data.table")) {
signal[, signalCol] <- abs(signal[, signalCol, with=FALSE])
} else {
signal[, signalCol] <- abs(signal[, signalCol])
}
absVal <- FALSE
}
if (is.null(olList)) {
# apply over the list of region sets
resultsList <- lapplyAlias(GRList,
function(x) aggregateSignal(
signal = signal,
signalCoord = signalCoord,
signalCoordType = signalCoordType,
regionSet = x,
signalCol = signalCol,
scoringMetric = scoringMetric,
verbose = verbose,
absVal = absVal, pOlap=NULL,
returnCovInfo=returnCovInfo))
# resultsList <- mapply(FUN = function(x) aggregateSignal(
# signal = signal,
# signalCoord = signalCoord,
# signalCoordType = signalCoordType,
# regionSet = x,
# signalCol = signalCol,
# scoringMetric = scoringMetric,
# verbose = verbose,
# absVal = absVal, pOlap=y,
# returnCovInfo=returnCovInfo), x=GRList, y=pOlapList)
#wilcox.conf.int = wilcox.conf.int,
} else {
# # apply over the list of region sets
# resultsList <- lapplyAlias(olList,
# function(x) aggregateSignal(
# signal = signal,
# signalCoord = signalCoord,
# signalCoordType = signalCoordType,
# regionSet = NULL,
# signalCol = signalCol,
# scoringMetric = scoringMetric,
# verbose = verbose,
# absVal = absVal,
# rsOL=x, pOlap=pOlapList, returnCovInfo=returnCovInfo))
# #wilcox.conf.int = wilcox.conf.int,
if (is.null(pOlapList)) {
# mapply does not work if pOlapList is null and other argument is not null
resultsList <- lapplyAlias(olList, FUN = function(x) aggregateSignal(
signal = signal,
signalCoord = signalCoord,
signalCoordType = signalCoordType,
regionSet = NULL,
signalCol = signalCol,
scoringMetric = scoringMetric,
verbose = verbose,
absVal = absVal,
rsOL=x,
returnCovInfo=returnCovInfo))
} else {
resultsList <- mapply(FUN = function(x, y) aggregateSignal(
signal = signal,
signalCoord = signalCoord,
signalCoordType = signalCoordType,
regionSet = NULL,
signalCol = signalCol,
scoringMetric = scoringMetric,
verbose = verbose,
absVal = absVal,
rsOL=x, pOlap=y,
returnCovInfo=returnCovInfo),
x=olList, y=pOlapList)
}
}
resultsDT <- do.call(rbind, resultsList)
# # add names if they are present
# if (!is.null(names(GRList))) {
# # resultsDT[, rsName := names(GRList)]
# row.names(resultsDT) <- names(GRList)
# }
# Convert the first column to numeric for proper sorting
resultsDT[, 1] <- as.numeric(resultsDT[, 1])
# rsName <- row.names(resultsDT)
resultsDF <- as.data.frame(resultsDT) #, row.names = rsName)
# Sort the results based on the first column
resultsDF <- resultsDF[order(-resultsDF[, 1]), ]
return(resultsDF)
}
# @param dataMat columns of dataMat should be samples/patients, rows should be genomic signal
# (each row corresponds to one genomic coordinate/range)
# @param featureMat Rows should be samples, columns should be "features"
# (whatever you want to get correlation with: eg PC scores),
# all columns in featureMat will be used (subset when passing to function
# in order to not use all columns)
# @param centerDataMat logical object. Should rows in dataMat be centered based on
# their means? (subtracting row mean from each row)
# @param centerFeatureMat logical. Should columns in featureMat be centered based
# on their means? (subtract column mean from each column)
# @param testType character object. Can be "cor" (Pearson correlation),
# "spearmanCor (Spearman correlation)
# "pcor" (partial correlation), "cov" (covariance (Pearson)),
# @param covariate
#
# If a row in dataMat has 0 stand. deviation, correlation will be set to 0
# instead of NA as would be done by cor()
#
# @return returns a matrix where rows are the genomic signal (eg a CpG or region) and
# columns are the columns of featureMat
# @examples dataMat = matrix(rnorm(50), 5, 10)
# featureMat = matrix(rnorm(20), 10, 2)
createCorFeatureMat <- function(dataMat, featureMat,
centerDataMat=TRUE, centerFeatureMat = TRUE,
testType="cor", covariate=NULL) {
featureMat <- as.matrix(featureMat)
featureNames <- colnames(featureMat)
nFeatures <- ncol(featureMat)
nDataDims <- nrow(dataMat)
if (centerDataMat) {
cpgMeans <- rowMeans(dataMat, na.rm = TRUE)
# centering before calculating correlation
dataMat <- apply(X = dataMat, MARGIN = 2, function(x) x - cpgMeans)
}
if (centerFeatureMat) {
featureMeans <- colMeans(featureMat, na.rm = TRUE)
# centering before calculating correlation(also, t() converts to matrix)
featureMat <- t(apply(X = t(featureMat), MARGIN = 2, function(x) x - featureMeans))
if (dim(featureMat)[1] == 1) {
featureMat <- t(featureMat)
}
}
# avoid this copy and/or delay transpose until after calculating correlation?
dataMat <- as.data.frame(t(dataMat))
if (testType == "cor") {
# create feature correlation matrix with PCs (rows: features/CpGs, columns:PCs)
# how much do features correlate with each PC?
# put epigenetic data first in cor()
featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="pearson")
} else if (testType == "spearmanCor") {
# xtfrm(x) ranking
featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="spearman")
# } else if (testType == "pcor") {
# # partial correlation (account for covariates), ppcor package
#
# featurePCCor <- apply(X = featureMat, MARGIN = 2, function(y) apply(X = dataMat, 2,
# FUN = function(x) pcor.test(x = x, y=y,
# z=covariate,
# method="pearson")$estimate))
#
} else if (testType == "cov") {
featurePCCor <- cov(dataMat, featureMat, use="pairwise.complete.obs")
} else {
stop("invalid testType")
}
# if standard deviation of the data was zero, NA will be produced
# set to 0 because no standard deviation means no correlation with attribute of interest
featurePCCor[is.na(featurePCCor)] <- 0
colnames(featurePCCor) <- featureNames
return(featurePCCor)
# corLoadRatio <- signal[, signalCol] / featurePCCor
# hist(corLoadRatio[, "PC10"])
}
#' Create a "meta-region" profile
#'
#' This profile can show enrichment
#' of genomic signals with high feature contribution scores
#' in the region set but not in the
#' surrounding genome, suggesting that variation is linked specifically
#' to that region set.
#'
#' All regions in a given region set
#' are combined into a single aggregate profile. Regions in `regionSet`
#' should be expanded on each side to include a wider area of the genome around
#' the regions of interest (see example and vignettes).
#' To make the profile, first we optionally take
#' the absolute value of `signal` (`absVal` parameter).
#' Then each expanded regionSet region is
#' split into `binNum` bins. The corresponding
#' bins from each region (e.g. all bin1's, all bin2's, etc.) are grouped.
#' All overlapping values from `signal` are aggregated in each bin group
#' according to the `aggrMethod` parameter to get a meta-region profile.
#' Since DNA strand information is not considered,
#' the profile is averaged symmetrically around the center.
#' A peak in the middle of this profile suggests that variability is specific
#' to the region set of interest and is not a product of the surrounding genome.
#' A region set can still be significant even if it does not have a peak.
#' For example, some histone modification region sets may be in large genomic blocks
#' and not show a peak, despite having variation across samples.
#'
#' @template signal
#' @template signalCoord
#' @template regionSet
#' @template signalCol
#' @template signalCoordType
#' @param binNum Number of bins to split each region into when
#' making the aggregate profile. More bins will
#' give a higher resolution but perhaps more noisy profile.
#' @template verbose
#' @templateVar usesAggrMethod TRUE
#' @template scoringMetric
#' @template absVal
#' @return A data.frame with the binned meta-region profile,
#' one row per bin. columns: binID and one column for each target variable
#' in signalCol. The function will return NULL if there
#' is no overlap between signalCoord and any of the bin groups that come
#' from regionSet (e.g. none of the bin1's overlapped signalCoord,
#' NULL returned).
#'
#' @examples
#' data("brcaATACCoord1")
#' data("brcaATACData1")
#' data("esr1_chr1")
#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation
#' esr1_chr1_expanded <- resize(esr1_chr1, 12000, fix="center")
#' mrProfile <- getMetaRegionProfile(signal=featureContributionScores,
#' signalCoord=brcaATACCoord1,
#' regionSet=esr1_chr1_expanded,
#' signalCol=c("PC1", "PC2"),
#' binNum=21)
#' @export
getMetaRegionProfile <- function(signal, signalCoord, regionSet,
signalCol = c("PC1", "PC2"),
signalCoordType = "default",
binNum = 21,
verbose=TRUE,
aggrMethod = "default", absVal=TRUE) {
################### checking inputs #################################
########## check that inputs are the correct class, convert
checkConvertInputClasses(signal=signal,
signalCoord=signalCoord,
regionSet=regionSet,
signalCol = signalCol)
########## check that dimensions of inputs are consistent
# length of signal coord = nrow of signal
if (length(signalCoord) != nrow(signal)) {
stop(cleanws("The number of coordinates in
signalCoord (length(signalCoord)) does not equal the number of
rows in `signal`"))
}
######### check that appropriate columns are present
# signalCol are column names of `signal`
if (!all(signalCol %in% colnames(signal))) {
missingCols = signalCol[!(signalCol %in% colnames(signal))]
stop(cleanws(paste0("Some signalCol are not
columns of signal: ", missingCols)))
}
######## check that aggregation method is appropriate
if (!(aggrMethod %in% getScoringMethods("metaRegionProfile"))) {
stop(cleanws("scoringMetric was not recognized.
Check spelling and available options."))
}
###### check that signalCoordType is appropriate
if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
#######
# what happens if there are NAs or Inf in `signal`?
#################################################################
# detect signalCoordType
if (signalCoordType == "default") {
# when signalCoord is a GRanges object
if (any(start(signalCoord) != end(signalCoord))) {
signalCoordType <- "multiBase"
} else {
signalCoordType <- "singleBase"
}
}
# if "default" aggregation method is given, choose based on signalCoordType
if (aggrMethod == "default") {
if (signalCoordType == "singleBase") {
aggrMethod <- "regionMean"
} else if (signalCoordType == "multiBase") {
aggrMethod <- "proportionWeightedMean"
} else {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
}
##################################################################
# take absolute value or not
if (absVal) {
loadingDT <- as.data.table(abs(signal))
} else {
loadingDT <- as.data.table(signal)
}
GRDT <- grToDt(regionSet)
loadProf <- BSBinAggregate(BSDT = loadingDT,
rangeDT = GRDT,
binCount = binNum,
BSCoord = signalCoord,
byRegionGroup = TRUE,
splitFactor = NULL,
signalCol = signalCol,
aggrMethod = aggrMethod)
# if loadProf is NULL, return NULL from function, otherwise make symmetrical
# it will be NULL when there was no overlap between data and any of the bins
if (!is.null(loadProf)) {
loadProf <- makeSymmetric(loadProf)
loadProf[, regionGroupID := seq_len(binNum)][]
setnames(loadProf, old = "regionGroupID", new="binID")
loadProf <- as.data.frame(loadProf)
} else {
warning("Insufficient overlap between regionSet and signalCoord")
}
return(loadProf)
}
makeSymmetric <- function(prof) {
symProf <- apply(prof, 2, .makeSymmetric)
symProf <- as.data.table(symProf)
return(symProf)
}
.makeSymmetric <- function(vec) {
symVec <- (vec + rev(vec)) / 2
return(symVec)
}
# Produced originally for binning Ewing RRBS data across various region sets
#
# @param BSDT A data.table. For COCOA, a data.table of loading values
# with the PCs to be annotated. One column for the signal of each
# target variable
# and also has columns with the coordinates for the epigenetic features:
# chr (chromosome) and start column (also possibly end)
# @param rangeDT A data.table with the sets of regions to be binned,
# with columns named start, end
# @param binCount Number of bins across the region
# @param BSCoord GRanges. Coordinates for BSDT. If NULL, then "chr" and "start"
# columns must be in BSDT.
# @param byRegionGroup Pass along to binCount (see ?binCount)
# @template signalCol
# @param verbose A "logical" object. Whether progress
# of the function should be shown, one
# bar indicates the region set is completed.
# useful when using BSBinAggregate with 'apply' to do many
# region sets at a time.
# @param aggrMethod see ?getMetaRegionProfile()
BSBinAggregate <- function(BSDT, rangeDT, binCount,
BSCoord=NULL,
byRegionGroup = TRUE,
splitFactor = NULL,
signalCol,
verbose = FALSE,
aggrMethod) {
if (!is(rangeDT, "data.table")) {
stop("rangeDT must be a data.table")
}
if (is.null(BSCoord)) {
BSCoord <- BSdtToGRanges(list(BSDT))
}
seqnamesColName <- "seqnames" # default column name
if (! "seqnames" %in% colnames(rangeDT)) {
if ("chr" %in% colnames(rangeDT)) {
# message("seqnames column name set to: chr")
seqnamesColName <- "chr"
} else {
# Got neither.
stop("rangeDT must have a seqnames column")
}
}
# message("Binning...")
binnedDT <- rangeDT[, MIRA::binRegion(start, end,
binCount, get(seqnamesColName))]
# output is a list of GRanges objects, does not play well with vapply
# one GRanges object for each bin, containing a segment of each original rangeDT region
binnedGR <- sapply(split(binnedDT, binnedDT$binID), dtToGr)
# message("Aggregating...")
if (aggrMethod == "proportionWeightedMean") {
binMeansList <- lapply(X = binnedGR,
FUN = function(x) regionOLWeightedMean(signalMat = BSDT,
# signalGR = dtToGr(BSDT[, .(chr, start, end)]),
signalGR = BSCoord,
regionSet = x,
calcCols = signalCol))
# any bins that had no overlap with data will be NULL
# if any bins had no data, return NULL
if (any(vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE))) {
return(NULL)
}
# rbindlist of all NULL's will still return an (empty) data.table
# otherwise any NULL items will just be skipped and other rows will
# be concatenated
binnedBSDT <- rbindlist(binMeansList)
regionGroupID = 1:length(binMeansList)
# regionGroupID = regionGroupID[!vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE)]
binnedBSDT[, regionGroupID := regionGroupID]
} else if (aggrMethod == "simpleMean") {
binMeansList <- lapply(X = binnedGR,
FUN = function(x) regionOLMean(signalDT = BSDT,
# signalGR = dtToGr(BSDT[, .(chr, start, end)]),
signalGR = BSCoord,
regionSet = x,
calcCols = signalCol))
# any bins that had no overlap with data will be NULL
# if any bins had no data, return NULL
if (any(vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE))) {
return(NULL)
}
binnedBSDT <- rbindlist(binMeansList)
regionGroupID = 1:length(binMeansList)
# regionGroupID = regionGroupID[!vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE)]
binnedBSDT[, regionGroupID := regionGroupID]
} else if (aggrMethod == "regionMean") { # aggrMethod == "regionMean"
# what is output if a region set has no overlap?
binnedBSDT <- BSAggregate(BSDT,
regionsGRL=GRangesList(binnedGR),
BSCoord = BSCoord,
jExpr=buildJ(signalCol,
rep("mean", length(signalCol))),
byRegionGroup = byRegionGroup,
splitFactor = splitFactor)
# if any bins had no data
if (nrow(binnedBSDT) < binCount) {
return(NULL)
}
} else if (aggrMethod == "regionMedian") {
# what is output if a region set has no overlap?
binnedBSDT <- BSAggregate(BSDT,
regionsGRL=GRangesList(binnedGR),
BSCoord = BSCoord,
jExpr=buildJ(signalCol,
rep("median", length(signalCol))),
byRegionGroup = byRegionGroup,
splitFactor = splitFactor)
# if any bins had no data
if (nrow(binnedBSDT) < binCount) {
return(NULL)
}
}
# RGenomeUtils::BSAggregate
# # If we aren't aggregating by bin, then don't restrict to min reads!
# if (byRegionGroup) {
# binnedBSDT <- binnedBSDT[readCount > minReads,]
# }
if (verbose) {
message(".", appendLF=FALSE)
}
return(binnedBSDT)
}
# modification of BSAggregate to just return mean per region
#
# @template signal
# @template signalCoord
# @template regionSet
# Must be from the same reference genome
# as the coordinates for the actual data/samples (signalCoord).
# @template signalCol
# @param returnQuantile "logical" object. If FALSE, return region averages. If TRUE,
# for each region, return the quantile of that region's average value
# based on the distribution of individual genomic signal/feature values
# @template absVal
# @return a data.table with region coordinates and average loading
# values for each region. Has columns chr, start, end, and a column for each
# target variable in signalCol.
# Regions are not in order along the rows of the data.table.
#
# @example averagePerRegion(BSDT = BSDT, regionsGRL,
# jCommand = MIRA:::buildJ(cols = "methylProp", "mean"))
# Devel note: I could add a column for how many cytosines are in each region
averagePerRegion <- function(signal,
signalCoord,
regionSet,
signalCol = c("PC1", "PC2"),
returnQuantile = FALSE,
absVal=TRUE) {
################### checking inputs #################################
########## check that inputs are the correct class and converts
checkConvertInputClasses(signal=signal,
signalCoord=signalCoord,
regionSet=regionSet,
signalCol = signalCol)
########## check that dimensions of inputs are consistent
# length of signal coord = nrow of signal
if (length(signalCoord) != nrow(signal)) {
stop(cleanws("The number of coordinates in
signalCoord (length(signalCoord)) does not equal the number of
rows in `signal`"))
}
######### check that appropriate columns are present
# signalCol are column names of signal
if (!all(signalCol %in% colnames(signal))) {
missingCols = signalCol[!(signalCol %in% colnames(signal))]
stop(cleanws(paste0("Some signalCol are not
columns of signal: ", missingCols)))
}
#######
# what happens if there are NAs or Inf in `signal`?
# any NAs that overlap the regionSet will cause the score to be NA
if (is(signal, "data.table")) {
naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
} else {
naRows = apply(X = signal[, signalCol, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
}
if (any(naRows)) {
signal <- signal[!naRows, ]
signalCoord <- signalCoord[!naRows]
warning("Removing rows with NA from `signal`")
}
#################################################################
#################################################################
# determine whether coordinates are single base or a range
# if (!("end" %in% colnames(coordinateDT))) {
# dataCoordType <- "singleBase"
# } else {
if (any(start(signalCoord) != end(signalCoord))) {
dataCoordType <- "multiBase"
} else {
dataCoordType <- "singleBase"
}
# }
# take absolute value or not
if (absVal) {
signalDT <- as.data.table(abs(signal))
} else {
signalDT <- as.data.table(signal)
}
# use different function for single base data and for region data
if (dataCoordType == "singleBase") {
# linking coordinates to loading values, has columns chr start, signalCol
BSDT <- signalDT[, .SD, .SDcols = signalCol]
jExpr <- buildJ(signalCol, rep("mean", length(signalCol)))
avPerRegion <- BSAggregate(BSDT = BSDT,
regionsGRL = regionSet,
BSCoord = signalCoord,
excludeGR = NULL,
regionsGRL.length = NULL,
splitFactor = NULL,
keepCols = NULL,
sumCols = NULL,
jExpr = jExpr,
byRegionGroup = FALSE,
keep.na = FALSE,
returnSD = FALSE,
returnOLInfo = FALSE,
meanPerRegion = TRUE,
returnQuantile = returnQuantile)
} else if (dataCoordType == "multiBase") {
avPerRegion <- weightedAvePerRegion(signalDT = signalDT,
signalCoord=signalCoord,
regionSet=regionSet,
calcCols = signalCol,
returnQuantile = returnQuantile)
} else {
stop("dataCoordType not recognized.")
}
return(avPerRegion)
}
# signalDT
# average by region for region-based data (eg ATAC-seq)
# @return data.table. chr, start, end columns. One column for each calcCols
# that has the average value in each region for that col
weightedAvePerRegion <- function(signalDT,
signalCoord,
regionSet,
calcCols = c("PC1", "PC2"),
returnQuantile = FALSE) {
if (is(signalCoord, "data.frame")) {
signalCoord <- dtToGr(signalCoord)
}
hits <- findOverlaps(query = signalCoord, subject = regionSet)
# if no overlap, return NULL
if (length(hits) == 0) {
return(NULL)
}
olap <- pintersect(signalCoord[queryHits(hits)],
regionSet[subjectHits(hits)])
polap <- width(olap) / width(regionSet[subjectHits(hits)])
# get total proportion overlap per region
# aggregate polap by region
pOlapDT <- data.table(signalDT[queryHits(hits), calcCols, with=FALSE],
rsRegionID = subjectHits(hits),
pOlap = polap)
pOlapByRegionDT <- pOlapDT[, .(regionPOlap = sum(pOlap)), by=rsRegionID]
# specify aggregation operation
# will be done separately for each PC specified
aggrCommand <- paste("list(", paste(paste0(calcCols, "=", "sum", "(",
calcCols, " * pOlap)"), collapse = ", "), ")")
weightedSumByRegionDT <- pOlapDT[, eval(parse(text=aggrCommand)), by=rsRegionID]
# weightedSumByRegionDT and pOlapByRegionDT should be in the same order
regionInd <- weightedSumByRegionDT$rsRegionID
olCoord <- grToDt(regionSet)[regionInd, .(chr, start, end)]
# divide by total proportion overlap to get mean value
jCommand <- paste("list(", paste(paste0(calcCols, "=", calcCols, " / regionPOlap"), collapse = ", "), ")")
meanPerRegion <- cbind(pOlapByRegionDT, weightedSumByRegionDT)
meanPerRegion <- meanPerRegion[, eval(parse(text=jCommand))]
meanPerRegion <- cbind(olCoord, meanPerRegion)
if (returnQuantile) {
for (i in seq_along(calcCols)) {
# perhaps this could be more efficient with mapply
# ecdf example: ecdf(distributionData)(getPercentileOfThis)
meanPerRegion[, c(calcCols[i]) := ecdf(signalDT[[calcCols[i]]])(meanPerRegion[[calcCols[i]]])]
# I tried set() to improve performance but it took about the same time
}
}
# meanPerRegion <- pOlapDT[, .(regionMean = sum(score * (pOlap/sum(pOlap))), by=rsRegionID]
return(meanPerRegion)
}
#' Get regions that are most associated with target variable
#'
#' Get a GRanges with top regions from the region set based on
#' average feature contribution scores
#' for the regions or the quantile of the region's average
#' feature contribution score based on the
#' distribution of all feature contribution scores for the target variable.
#' Returns average feature contribution score or quantile as GRanges metadata.
#'
#' @template signal
#' @template signalCoord
#' @template regionSet
#' @template signalCol
#' @param cutoff Numeric. Only regions with at least this value will be
#' returned (either above this average `signal` value or above this quantile
#' if returnQuantile=TRUE).
#' @param returnQuantile Logical. If FALSE, return region averages. If TRUE,
#' for each region, return the quantile of that region's average value
#' based on the distribution of individual feature values in `signal` for
#' that `signalCol`.
#' @return A GRanges object with region coordinates for regions with
#' scores/quantiles above "cutoff" for any target variable in signalCol.
#' The scores/quantiles
#' for signalCol are given as metadata in the GRanges.
# Are regions in order along the rows of the data.table?
#
#' @examples
#' data("brcaATACCoord1")
#' data("brcaATACData1")
#' data("esr1_chr1")
#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation
#' topRegions <- getTopRegions(signal=featureContributionScores,
#' signalCoord=brcaATACCoord1,
#' regionSet=esr1_chr1,
#' returnQuantile = TRUE)
#' @export
getTopRegions <- function(signal,
signalCoord,
regionSet,
signalCol = c("PC1", "PC2"),
cutoff = 0.8,
returnQuantile=TRUE) {
regionLoadDT <- averagePerRegion(signal=signal,
signalCoord=signalCoord, regionSet=regionSet,
signalCol = signalCol,
returnQuantile = returnQuantile)[]
keepInd <- regionLoadDT[, signalCol, with=FALSE] >= cutoff
# keep region if it is above cutoff in any of the PCs in signalCol
keepInd <- apply(X = keepInd, MARGIN = 1, FUN = any)
highGR <- dtToGr(regionLoadDT[keepInd, ])
values(highGR) <- as.data.frame(regionLoadDT[keepInd, signalCol, with=FALSE])
return(highGR)
}
# different scoring metrics
# remniscent of LOLA:
# support (number of regions, for us regions that have at least 1 cytosine and can be scored)
# mean loading value (or could do ratio of peak to surrounding regions)
# signal to noise ratio, how big is peak compared to noise of surrounding area
# with SNR, even a small peak could have a high SNR
# @param loadingProf
# loadingProfileSNR = function() {
# # magnitude of the peak divided by standard deviation of
# # noise (signal in surrounding areas)
# }
# support is just number of regions from each input region set that overlap at all with
# the cytosines that have loading values
# findOverlaps(regionSet, dtToGr(coordinateDT))
###########################################################################
# BSaggregate -- Aggregate a BSDT across regions or region groups,
# for multiple samples at a time.
# This function is as BScombineByRegion, but can handle not only multiple
# samples in BSDT, but also simultaneously multiple region sets by passing
# a regionsGRL (GRangesList object).
# you can use jExpr to do other functions.
# Given a bisulfite data table as input, with an identifier column for
# different samples; plus a GRanges objects with regions to aggregate.
#
# @param BSDT The bisulfite data.table (output from one of the parsing
# functions for methylation calls) that you wish to aggregate. It can
# be a combined table, with individual samples identified by column passed
# to splitFactor.
# @param regionsGRL Regions across which you want to aggregate. Should be
# from a single region set. eg GRangesList(regionSet)
# @param BSCoord GRanges. Coordinates for BSDT. If NULL, then "chr" and "start"
# columns must be in BSDT.
# @param excludeGR A GRanges object with regions you want to
# exclude from the aggregation function. These regions will be eliminated
# from the input table and not counted.
# @param jExpr You can pass a custom command in the j slot to data.table
# specifying which columns to aggregate, and which functions to use. You
# can use buildJ() to build a jExpr argument easily.
# @param byRegionGroup You can aggregate by regionID or by regionGroupID;
# this reflects the regionsGRL that you pass; by default, BSAggregate will
# aggregate each region individually -- scores will then be contiguous, and
# the output is 1 row per region.
# Turn on this flag to aggregate across all region groups, making the result
# uncontiguous, and resulting in 1 row per *region group*.
# @param returnSD Whether the standard deviation of the columns of interest
# should be returned. Standard deviation for rows that overlap with region set
# and also for rows that do not overlap with region set. Not currently functional.
# @param returnOLInfo if true, include number of overlapping cpgs and number
# of overlapping regions in the result
# @param meanPerRegion Will return the mean value in each region in a data.table
# instead of averaging all regions into a single value, returnOLInfo does
# nothing if meanPerRegion=TRUE (function exits before that code, which
# expects a different data structure)
# @param returnQuantile Only used if meanPerRegion=TRUE, instead of mean
# return the quantile/percentile of the mean of each region
# in relation to the distribution of original values in BSDT
# @template rsOL
#
BSAggregate <- function(BSDT, regionsGRL, BSCoord=NULL, excludeGR=NULL,
regionsGRL.length = NULL, splitFactor=NULL,
keepCols=NULL, sumCols=NULL,
jExpr=NULL, byRegionGroup=FALSE,
keep.na=FALSE, returnSD=FALSE,
returnOLInfo=FALSE, meanPerRegion=FALSE,
returnQuantile=FALSE, rsOL=NULL) {
# Assert that regionsGRL is a GRL.
# If regionsGRL is given as a GRanges, we convert to GRL
if(is(regionsGRL, "GRanges")) {
regionsGRL <- GRangesList(regionsGRL)
} else if (!is(regionsGRL, "GRangesList") && is.null(rsOL)) {
stop("regionsGRL is not a GRanges or GRangesList object")
}
# will cause error if BSDT is only a data.frame
if (is(BSDT, "data.frame") & !is(BSDT, "data.table")) {
BSDT <- as.data.table(BSDT)
}
if (!is(BSDT, "data.table")) {
stop("BSDT must be a data.table")
}
if(! is.null(excludeGR)) {
BSDT <- BSFilter(BSDT, minReads=0, excludeGR)
}
if (returnQuantile) {
# keep all data so quantiles can be calculated
# later code will change BSDT
origBSDT <- data.table::copy(BSDT)
}
# TODO: BSdtToGRanges needs to include end coordinate!!!
if (is.null(BSCoord)) {
bsgr <- BSdtToGRanges(list(BSDT))
} else {
bsgr <- BSCoord
}
additionalColNames <- setdiff(colnames(BSDT),
c("chr","start", "end",
"hitCount","readCount", splitFactor))
# specify that "character" outcome is expected from mode by
# supplying "a" as last vapply argument (any character object would work)
colModes <- vapply(BSDT, mode, "a")
if (is.null(sumCols)) {
sumCols <- setdiff(colnames(BSDT),c("chr", "start", "end",
"strand", splitFactor, keepCols))
# Restrict to numeric columns.
sumCols <- intersect(sumCols,
names(colModes[which(colModes == "numeric")]))
}
if (is.null(rsOL)) {
# It's required to do a findoverlaps on each region individually,
# Not on a GRL, because of the way overlaps with GRLs work. So,
# we must convert the GRL to a GR, but we must keep track of which
# regions came from which group.
regionsGR <- unlist(regionsGRL)
if(is.null(regionsGRL.length)) {
# if (length(regionsGRL) > 100) {
# message("BSAggregate: Calculating sizes. You can speed this up by supplying a regionsGRL.length vector...", appendLF=FALSE)
# }
# specify that output should be numeric with vapply
# (any number would work instead of 1)
regionsGRL.length <- vapply(regionsGRL, length, 1)
# message("Done counting regionsGRL lengths.")
}
# Build a table to keep track of which regions belong to which group
# BIOC note: sapply returns a list where each item is of different length
# therefore, I'm not using vapply
# one regionID per region of "regionsGR", withinGroupID: for when regionsGRL
# has multiple GRanges, Gives the index of each region within its corresponding
# GRanges object. regionGroupID: for when regionsGRL has multiple GRanges,
# indicates which regionsGRL GRanges object the region has come from (
# index of the GRanges object in regionsGRL)
region2group <- data.table(
regionID=seq_along(regionsGR),
chr=as.vector(seqnames(regionsGR)),
start=as.vector(start(regionsGR)),
end=as.vector(end(regionsGR)),
# withinGroupID= as.vector(unlist(sapply(regionsGRL.length, seq))),
regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length)) # repeats each number regionGRL.length times
setkey(region2group, regionID)
#TODO: if overlap method is single, but "end" is present, find center
# and set that to start!
# if (all(start(bsgr[[1]]) != end(bsgr[[length(unique(queryHits(hits)))1]]))) {
if (all(start(bsgr) != end(bsgr))) {
stop("BSDT start and end coordinates are not the same. Choose a different aggrMethod.")
} else {
# fo <- findOverlaps(query = bsgr[[1]], subject = regionsGR)
fo <- findOverlaps(query = bsgr, subject = regionsGR)
}
if (length(subjectHits(fo)) < 1) {
warning("Insufficient overlap between signalCoord and the region set.")
return(NULL)
}
} else {
# Build a table to keep track of which regions belong to which group
# BIOC note: sapply returns a list where each item is of different length
# therefore, I'm not using vapply
# region2group <- data.table(
# regionID=seq_len(regionsGR),
# withinGroupID= as.vector(unlist(sapply(regionsGRL.length, seq))),
# regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length))
if (length(subjectHits(rsOL)) < 1) {
warning("Insufficient overlap between signalCoord and the region set.")
return(NULL)
}
# only works for when one region set is given in regionsGRL (ie does
# not work for metaregion profiles)
region2group <- data.table(
regionID=seq_len(max(subjectHits(rsOL))),
regionGroupID=rep(1, max(subjectHits(rsOL)))) # assumes only 1 region set in regionsGRL
setkey(region2group, regionID)
fo <- rsOL
}
### use info from findOverlaps to see how many individual
# cytosines (or input regions) and how many regions (in region sets)
# overlap with one another
if (returnOLInfo) {
signalCoverage <- length(unique(queryHits(fo)))
regionSetCoverage <- length(unique(subjectHits(fo)))
}
# if("end" %in% colnames(BSDT)){
# setkey(BSDT, chr, start, end)
# } else{
# setkey(BSDT, chr, start)
# }
# Gut check:
# stopifnot(all(elementMetadata(bsgr[[1]])$readCount == BSDT$readCount))
# message("Setting regionIDs...")
BSDT <- BSDT[queryHits(fo),] #restrict the table to CpGs (or input region) in any region set.
BSDT[,regionID:=subjectHits(fo)] #record which region they overlapped.
#BSDT[queryHits(fo),regionID:=subjectHits(fo)]
#if (!keep.na) {
# BSDT = BSDT[queryHits(fo),]
#}
if (is.null(jExpr)) {
cols=c(sumCols, keepCols)
funcs <- c(rep("sum", length(sumCols)), rep("unique", length(keepCols)))
jExpr <- buildJ(cols, funcs)
}
# message("jExpr: ", jExpr)
# Define aggregation column. aggregate by region or by region group?
if (byRegionGroup) {
agCol <- "regionGroupID"
} else {
agCol <- "regionID" # Default
}
# Build the by string
if (is.null(splitFactor)) {
byString <- paste0("list(regionID)")
} else {
byString <- paste0("list(", paste("regionID",
paste0(splitFactor, ""),
collapse=", ", sep=", "), ")")
}
# Now actually do the aggregate: (aggregate within each region)
# message("Combining...")
bsCombined <- BSDT[,eval(parse(text=jExpr)), by=eval(parse(text=byString))]
setkey(bsCombined, regionID)
if (meanPerRegion) {
setkey(region2group, regionID)
avPerRegion <- merge(bsCombined, region2group)
avPerRegion[, c("regionID",
# "withinGroupID",
"regionGroupID") := NULL]
if (returnQuantile) {
for (i in seq_along(sumCols)) {
# perhaps this could be more efficient with mapply
# ecdf example: ecdf(distributionData)(getPercentileOfThis)
avPerRegion[, c(sumCols[i]) := ecdf(origBSDT[[sumCols[i]]])(avPerRegion[[sumCols[i]]])]
# I tried set() to improve performance but it took about the same time
}
}
return(avPerRegion)
}
# Now aggregate across groups.
# I do this in 2 steps to avoid assigning regions to groups,
# which takes awhile. I think this preserves memory and is faster.
# Define aggregation column. aggregate by region or by region group?
# byRegionGroup=TRUE means to aggregate within each individiual GRanges from
# regions.GRL
if (byRegionGroup) {
# must set allow=TRUE here in case there are multiple IDs (splitCol)
bsCombined[region2group, regionGroupID:=regionGroupID, allow=TRUE]
if (! is.null(splitFactor) ) {
byStringGroup <- paste0("list(", paste("regionGroupID",
paste0(splitFactor,
collapse=", "),
sep=", "), ")")
} else {
byStringGroup <- "list(regionGroupID)"
}
# aggregate by regionGroupID (region set/GRanges in regionsGRL)
bsCombined=bsCombined[,eval(parse(text=jExpr)),
by=eval(parse(text=byStringGroup))]
if (returnOLInfo) {
bsCombined[, signalCoverage := signalCoverage]
bsCombined[, regionSetCoverage := regionSetCoverage]
}
return(bsCombined)
} else {
e <- region2group[bsCombined,]
setkey(e, regionID)
return(e)
}
# WARNING: There are now 2^2 ways to aggregate, sum vs mean
# at each level: across regions, then across region sets. This
# doesn't give you a choice at this point.
}
# Given a BSDT (bisulfite data.table), remove any entries that overlap
# regions given in the excludeGR argument and/or filter out sites
# that have lower than a minimum number of reads.
# @param BSDT Bisulfite data.table to filter
# @param minReads Require at least this level of coverage at a cpg.
# @param excludeGR GRanges object with regions to filter.
#
# @return The BSDT with appropriate regions removed.
BSFilter <- function(BSDT, minReads = 10, excludeGR = NULL) {
# First, filter for minimum reads.
if (minReads > 0) {
BSDT <- BSDT[coverage >= minReads, ]
}
if (NROW(BSDT) == 0) { return(data.table(NULL)) }
# Now, filter entries overlapping a region in excludeGR.
if (!is.null(excludeGR)) {
gr <- dtToGr(BSDT)
fo <- findOverlaps(gr, excludeGR)
qh <- unique(queryHits(fo))
length(qh)
nrow(BSDT)
BSDT <- BSDT[-qh, ]
}
return(BSDT)
}
#' Get indices for top scored region sets
#'
#' For each target variable, get index of original region sets
#' but ordered by rsScores
#' ranking for each target variable.
#' The original index refers to that region set's position
#' in the `GRList` param given to `aggregateSignalGRList` which is
#' also that region set's
#' row index in the COCOA output. The first number in a given column
#' of this function's output will be the
#' original index of the region set ranked first for that target variable.
#' The second row for a
#' column will be the original index of the region set that ranked second
#' for that target variable, etc. You can use this function to make it easier
#' when you want to select the top region sets for further analysis or
#' just for sorting the results. Region set scores are sorted in decreasing
#' or increasing order according to the `decreasing` parameter.
#'
#' @template rsScores
#' @templateVar isRSRankingIndex TRUE
#' @template signalCol
#' @param decreasing Logical. Whether to sort rsScores in decreasing
#' or increasing order.
#' @param newColName Character. The names of the columns of the output data.frame.
#' The order should correspond to the order of the
#' input columns given by signalCol.
#' @return A data.frame with one column for each `signalCol`.
#' Column names are given by `signalCol` or `newColName` (if used).
#' Each column has been
#' sorted by score for region sets for that target variable
#' (order given by `decreasing`
#' param).
#' Original indices for region sets that were used to create rsScores
#' are given. Region sets with a score of NA are counted as having the
#' lowest scores and indices for these region sets will be at the bottom of the
#' returned data.frame (na.last=TRUE in sorting)
#' @examples data("rsScores")
#' rsRankInd = rsRankingIndex(rsScores=rsScores,
#' signalCol=c("PC1", "PC2"))
#' # region sets sorted by score for PC1
#' rsScores[rsRankInd$PC1, ]
#' # region sets sorted by score for PC2
#' rsScores[rsRankInd$PC2, ]
#'
#' @export
#'
rsRankingIndex <- function(rsScores, signalCol,
decreasing=TRUE, newColName = signalCol) {
if (!(is(rsScores, "data.frame") || is(rsScores, "matrix"))) {
stop("rsScores should be a data.frame. Check object class.")
}
rsScores <- as.data.table(rsScores)
if (!(is(signalCol, "character") | is(signalCol, "list"))) {
stop("signalCol should be a character vector or list of character vectors.")
}
if (is(signalCol, "list")) {
if (!all(vapply(X = signalCol, FUN = class, FUN.VALUE = "a") %in% "character")) {
stop("Items of signalCol should be character vectors.")
}
if (!length(unique(vapply(signalCol, FUN = length, FUN.VALUE = 2)))) {
stop("Items of signalCol should be the same length as each other.")
}
if (!all(unlist(signalCol) %in% colnames(rsScores))) {
stop("Some column names in signalCol are not present in rsScores.")
}
# the first item in the list is taken for newColName
if (is(newColName, "list")) {
newColName <- signalCol[[1]]
}
} else { # signalCol is character
if (!all(signalCol %in% colnames(rsScores))) {
stop("Some column names in signalCol are not present in rsScores.")
}
}
dtOrder <- rep(-99L, length(decreasing))
# how to sort scores
# -1 for decreasing order of scores
dtOrder[decreasing] <- -1L
# +1 for increasing order of scores
dtOrder[!decreasing] <- 1L
dtOrder <- rep(-99L, length(decreasing))
# how to sort scores
# -1 for decreasing order of scores
dtOrder[decreasing] <- -1L
# +1 for increasing order of scores
dtOrder[!decreasing] <- 1L
# so by references changes will not be a problem
rsScores <- copy(rsScores)
rsScores[, rsIndex := seq_len(nrow(rsScores))]
if (is(signalCol, "list")) {
if (length(newColName) != length(signalCol[[1]])) {
stop("newColName is not the same length as columns given in signalCol.")
}
rsEnSortedInd <- subset(rsScores, select= signalCol[[1]])
setnames(rsEnSortedInd, newColName)
colNameMat <- do.call(rbind, signalCol)
# then scores by each PC and make a column with the original index for sorted region sets
# this object will be used to pull out region sets that were top hits for each PC
for (i in seq_along(signalCol[[1]])) {
theseOrderCols <- colNameMat[, i]
setorderv(rsScores, cols = theseOrderCols, order=dtOrder, na.last=TRUE)
rsEnSortedInd[, newColName[i] := rsScores[, rsIndex]]
}
} else if (is(signalCol, "character")) {
if (length(newColName) != length(signalCol)) {
stop("newColName is not the same length as columns given in signalCol.")
}
signalCol <- signalCol[signalCol %in% colnames(rsScores)]
rsEnSortedInd <- subset(rsScores, select= signalCol)
setnames(rsEnSortedInd, newColName)
# then scores by each PC and make a column with the original index for sorted region sets
# this object will be used to pull out region sets that were top hits for each PC
for (i in seq_along(signalCol)) {
setorderv(rsScores, cols = signalCol[i], order=dtOrder, na.last=TRUE)
rsEnSortedInd[, newColName[i] := rsScores[, rsIndex]]
}
} else {
stop("signalCol should be a character vector or list of character vectors.")
}
# reset order
# setorderv(rsScores, cols = "rsIndex", order=1L)
return(as.data.frame(rsEnSortedInd))
}
#################### Metric functions ########################################
# scores, metrics, or statistical tests
# Instead of averaging within regions first as BSAggregate does,
# this function does a simple average and standard deviation
# for all CpGs that overlap
# with regions of a region set, also does average and
# standard deviation for non overlapping CpGs. Created to
# get metrics of loading values for each PC.
#
# Faster if given total average for each column of interest
#
# @param dataDT a data.table with
# columns to get metrics of eg (PC1, PC2). All columns
# will be considered
# columns to get the metrics from so no unnecessary columns should be
# included.
# @param dataGR GRanges. Coordinates for dataDT.
# @template regionSet
# Metrics will be calculated on
# only coordinates within this region set (and optionally separately
# on those outside this region set with alsoNonOLMet parameter)
# @param signalCol the columns to calculate the metrics on. The
# metrics will be calculated on each one of these columns separately.
# @param metrics character vector with the name of a function or functions
# to calculate on selected cytosines. Function should only require one
# input which should be the values of the cytosines.
# @param alsoNonOLMet also include same metrics
# for non overlapping CpGs (still also returns metrics for overlapping CpGs)
# param columnMeans Not functional/is deprecated. The idea was to use
# the mean of the column to speed up calculations (eg when calculating
# mean of overlapping CpGs, use that info and column mean to get
# mean for non overlapping CpGs without manually calculating it)
#
signalOLMetrics <- function(dataDT,
dataGR,
regionSet,
signalCol = colnames(dataDT)[!(colnames(dataDT) %in% c("chr", "start", "end"))],
metrics=c("mean", "sd"),
alsoNonOLMet=TRUE, rsOL=NULL) {
# convert DT to GR for finding overlaps
# dataGR <- BSdtToGRanges(list(dataDT))[[1]]
if (is.null(rsOL)) {
OL <- findOverlaps(query = dataGR, subject = regionSet)
# region set info
totalRegionNumber <- length(regionSet)
meanRegionSize <- round(mean(width(regionSet)), 1)
} else {
OL <- rsOL
}
# if no overlap, exit
if (length(OL) == 0) {
return(NULL)
}
# get indices for overlapping and non overlapping CpGs
olCpG <- queryHits(OL)
# get info on degree of overlap
# number of CpGs that overlap
signalCoverage <- length(unique(olCpG))
# number of regions that overlap
regionSetCoverage <- length(unique(subjectHits(OL)))
# gets metrics for all columns except chr, start, end
jExpr <- buildJ(cols=rep(signalCol, each=length(metrics)),
funcs=rep(metrics, length(signalCol)),
newColNames = paste0(rep(signalCol,
each=length(metrics)),
"_", metrics))
# getting the metrics
olMetrics <- as.data.frame(dataDT[olCpG, eval(parse(text=jExpr))])
# calculate average of nonOLCpGs based on columnMean if given
# if (!is.null())
#
# formatting so there is one row per PC/testCol
# output is a matrix with ncol = length(metrics)
# for vapply, FUN.VALUE should have length equal to a single output of FUN
olResults <- vapply(X = metrics,
FUN = function(x) as.numeric(olMetrics[, grepl(pattern = x, colnames(olMetrics))]),
as.numeric(seq_along(signalCol)))
olResults <- as.data.table(olResults)
setnames(olResults, old = colnames(olResults), new = paste0(colnames(olResults), "_OL"))
if (alsoNonOLMet) {
nonOLCpG <- (seq_len(nrow(dataDT)))[-olCpG]
# if no OL for region set, don't calculate for non region set
# TODO make conditional on region set having any overlap
nonOLMetrics <- as.data.frame(dataDT[nonOLCpG, eval(parse(text=jExpr))])
nonOLResults <- vapply(X = metrics,
FUN = function(x) as.numeric(nonOLMetrics[, grepl(pattern = x, colnames(nonOLMetrics))]),
as.numeric(seq_along(signalCol)))
nonOLResults <- as.data.table(nonOLResults)
setnames(nonOLResults, old = colnames(nonOLResults), new = paste0(colnames(nonOLResults), "_nonOL"))
if (is.null(rsOL)) {
metricDT <- cbind(data.table(testCol=signalCol),
olResults,
nonOLResults,
data.table(signalCoverage,
regionSetCoverage,
totalRegionNumber,
meanRegionSize))
} else {
metricDT <- cbind(data.table(testCol=signalCol),
olResults,
nonOLResults,
data.table(signalCoverage,
regionSetCoverage))
}
} else {
if (is.null(rsOL)) {
metricDT <- cbind(data.table(testCol=signalCol),
olResults,
data.table(signalCoverage,
regionSetCoverage,
totalRegionNumber,
meanRegionSize))
} else {
metricDT <- cbind(data.table(testCol=signalCol),
olResults,
data.table(signalCoverage,
regionSetCoverage))
}
}
return(metricDT)
}
# Wilcoxon rank sum test for a region set
# @param dataDT a data.table with chr, start, end columns as well
# as columns to get metrics of eg (PC1, PC2). All columns
# except chr, start, and end will be considered
# columns to get the metrics from so no unnecessary columns should be
# included.
# @template regionSet
# @param signalCol the columns of interest. You will do ranksum test separately
# on each of these columns (given test only uses info in one column)
# @param ... Additional parameters of wilcox.test function. See ?wilcox.test.
# For instance specify alternative hypothesis: alternative = "greater".
# @return A vector with a p value for each column other than chr, start or end.
# @examples data("brcaLoadings1")
# data("brcaMCoord1")
# data("nrf1_chr1")
# dataDT = as.data.table(cbind(brcaMCoord1, brcaLoadings1))
# rsWilcox(dataDT = dataDT, regionSet = nrf1_chr1, conf.int=TRUE)
rsWilcox <- function(dataDT,
regionSet,
signalCol = colnames(dataDT)[!(colnames(dataDT) %in% c("chr", "start", "end"))],
conf.int=FALSE,
...) {
# region set info
totalRegionNumber <- length(regionSet)
meanRegionSize <- round(mean(width(regionSet)), 1)
# convert DT to GR for finding overlaps
dataGR <- BSdtToGRanges(list(dataDT))[[1]]
OL <- findOverlaps(query = regionSet, subject = dataGR)
# if no overlap, exit
if (length(OL) == 0) {
return(NULL)
}
# get indices for overlapping and non overlapping CpGs
olCpG <- unique(subjectHits(OL))
nonOLCpG <- (seq_len(nrow(dataDT)))[-olCpG]
# get info on degree of overlap
# number of CpGs that overlap
signalCoverage <- length(unique(olCpG))
# number of regions that overlap
regionSetCoverage <- length(unique(queryHits(OL)))
# each confidence interval has length of 2: [low, high]
if (conf.int) {
# calculate Wilcoxon rank sum test for each column
# additional parameters given with ...
# confIntervals will be [low1, high1, low2, high2, etc.]
confIntervals <- as.numeric(vapply(X = signalCol, FUN = function(x) wilcox.test(x = as.numeric(as.matrix(dataDT[olCpG, x, with=FALSE])),
y = as.numeric(as.matrix(dataDT[nonOLCpG, x, with=FALSE])),
conf.int = conf.int, ...)$conf.int, c(1, 1)))
names(confIntervals) <- paste0(rep(signalCol, each=2), c("_low", "_high"))
wRes <- data.frame(t(confIntervals),
signalCoverage,
regionSetCoverage,
totalRegionNumber,
meanRegionSize)
} else {
# calculate Wilcoxon rank sum test for each column
# additional parameters given with ...
pVals <- vapply(X = signalCol, FUN = function(x) wilcox.test(x = as.numeric(as.matrix(dataDT[olCpG, x, with=FALSE])),
y = as.numeric(as.matrix(dataDT[nonOLCpG, x, with=FALSE])), ...)$p.value, 1)
wRes <- data.frame(t(pVals),
signalCoverage,
regionSetCoverage,
totalRegionNumber,
meanRegionSize)
}
return(wRes)
}
# I optimized upstream so that a matrix would be given to this function
# if this function is rewritten and no longer requires a matrix input,
# then in order to prevent unnecessary object copying,
# rewrite upstream code that converts signalDT to matrix class
# @param signalMat Data to be aggregated (e.g. raw data: ATAC-seq,
# region based DNA methylation or loading values)
# @param signalGR GRanges object with coordinates for signalMat
# @template regionSet
# The region set to score.
# @param calcCols character object. Column names. A weighted sum will be done
# for each of these columns (columns should be numeric).
# @template rsOL
# @param pOlap see "?aggregateSignal"
# @template returnCovInfo
# @value Returns data.frame with columns 'calcCols', signalCoverage col has
# number of signalGR regions that overlapped with any regionSet regions,
# regionSetCoverage has the sum of all proportion overlaps of regions from
# signalGR with regionSet (regionSet region is denominator)
# containing weighted mean for each col.
# Returns NULL if there is no overlap between signalGR and regionSet
regionOLWeightedMean <- function(signalMat, signalGR,
regionSet, calcCols, rsOL=NULL,
pOlap=NULL, returnCovInfo=TRUE) {
if (!is(signalMat, "matrix")) {
signalMat <- as.matrix(signalMat)
}
if (is.null(rsOL)) {
hits <- findOverlaps(query = signalGR, subject = regionSet)
} else {
hits <- rsOL
}
# if no overlap, return NULL
if (length(hits) == 0) {
return(NULL)
}
if (is.null(pOlap)) {
olap <- pintersect(signalGR[queryHits(hits)],
regionSet[subjectHits(hits)])
pOlap <- width(olap) / width(regionSet[subjectHits(hits)])
}
# some rows may be duplicated if a signalMat region overlapped multiple
# regions from signalGR but that is ok
# signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying
# weight the signalMat values by the proportion overlap (weighted average)
weightedSum <- t(pOlap) %*% signalMat[queryHits(hits), calcCols]
# weighted average
denom <- sum(pOlap)
weightedAve <- as.data.frame(weightedSum / denom)
colnames(weightedAve) <- calcCols
# add columns for coverage info
if (returnCovInfo) {
weightedAve$signalCoverage = length(unique(queryHits(hits)))
weightedAve$regionSetCoverage = length(unique(subjectHits(hits)))
weightedAve$sumProportionOverlap = denom
}
return(weightedAve)
}
# @param signalDT Data to be aggregated (e.g. raw data: ATAC-seq,
# region based DNA methylation or loading values)
# @param signalGR GRanges object with coordinates for signalDT
# @template regionSet
# The region set to score.
# @param calcCols character object. Column names. A mean will be calculated for
# each of these columns (columns should be numeric).
# @param metric character. "mean" or "median"
# @template rsOL
# @template returnCovInfo
# @value Returns data.frame with columns 'calcCols', signalCoverage col has
# number of signalGR regions that overlapped with any regionSet regions,
# regionSetCoverage has the number of regions from
# signalGR that overlapped with regionSet
# Returns NULL if there is no overlap between signalGR and regionSet
regionOLMean <- function(signalDT, signalGR, regionSet,
calcCols, metric="mean", rsOL=NULL, returnCovInfo=TRUE) {
if (is.null(rsOL)) {
hits <- findOverlaps(query = signalGR, subject = regionSet)
} else {
hits <- rsOL
}
# if no overlap, return NULL
if (length(hits) == 0) {
return(NULL)
}
# some rows may be duplicated if a signalDT region overlapped multiple
# regions from signalGR but that is ok
if (metric == "mean") {
# mean of the overlapping signalDT values
signalAve <- as.data.frame(t(colMeans(signalDT[queryHits(hits),..calcCols])))
} else if (metric == "median") {
# median of the overlapping signalDT values
signalAve <- as.data.frame(t(apply(X = signalDT[queryHits(hits),..calcCols], 2, median)))
} else {
stop("Error in regionOLMean function. Invalid metric specified.")
}
if (returnCovInfo) {
# add columns for coverage info
signalAve$signalCoverage <- length(unique(queryHits(hits)))
signalAve$regionSetCoverage <- length(unique(subjectHits(hits)))
}
return(signalAve)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.