#' @title Annotate regions.
#'
#' @description Annotate a \code{\link[GenomicRanges]{GRanges}} object with
#' sets of reference \code{\link[GenomicRanges]{GRanges}} or
#' \code{\link[GenomicRanges]{GRangesList}} objects, with
#' respect to overlaps and nearest neighbors.
#'
#' @author Michael Stadler
#'
#' @param x The \code{\link[GenomicRanges]{GRanges}} object to annotate.
#' @param hasOverlap Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, a logical vector "X.hasOverlap" will be added to the
#' \code{mcols} of the result, with \code{TRUE} for each tile that overlaps
#' any region in that element. "X" is obtained from \code{names(hasOverlap)}.
#' @param fracOverlap Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, a numeric vector "X.fracOverlap" will be added to the
#' \code{mcols} of the result, with a value between 0 and 1 giving the fraction
#' of bases in a tile that overlaps with any region in that element. "X" is
#' obtained from \code{names(fracOverlap)}.
#' @param numOverlap Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, two numeric vectors "X.numOverlapWithin" and
#' "X.numOverlapAny" will be added to the \code{mcols} of the result, giving
#' the number of ranges in that element that are fully contained within
#' a tile, or that overlap with a tile in any way, respectively. "X" is
#' obtained from \code{names(numOverlap)}.
#' @param nearest Named \code{list} with \code{\link[GenomicRanges]{GRanges}}
#' or \code{\link[GenomicRanges]{GRangesList}} object(s).
#' For each list element, two numeric vectors "X.nearestName" and
#' "X.nearestDistance" will be added to the \code{mcols} of the result, giving
#' the name and distance of the nearest range in that element for each tile. "X" is
#' obtained from \code{names(nearest)}, and the values of "X.nearestName" from
#' \code{names(nearest$X)}. If multiple nearest ranges are at the same
#' distance from a tile, an arbitrary one is reported in "X.nearestName".
#' @param ignore.strand Logical scalar passed to
#' \code{\link[GenomicRanges]{findOverlaps}} when searching for overlaps
#' between \code{x} and reference regions.
#'
#' @return A \code{\link[GenomicRanges]{GRanges}} similar to \code{x}, with
#' annotations added to its metadata columns (\code{mcols}).
#'
#' @examples
#' library(GenomicRanges)
#'
#' x <- GRanges("chr1", IRanges(c(1, 12), width = 10))
#' tss <- GRanges("chr1", IRanges(c(1, 10, 30), width = 1,
#' names = paste0("t", 1:3)))
#' blacklist <- GRanges("chr1", IRanges(20, width = 5))
#' annotateRegions(x, hasOverlap = list(Blacklist = blacklist),
#' fracOverlap = list(Blacklist = blacklist),
#' numOverlap = list(TSS = tss),
#' nearest = list(TSS = tss))
#'
#' @seealso \code{\link{getGenomicTiles}} that uses this function,
#' \code{\link[GenomicRanges]{findOverlaps}} and
#' \code{\link[GenomicRanges]{nearest}} in package \pkg{GenomicRanges} used
#' internally.
#'
#' @importFrom GenomicRanges GRanges width findOverlaps nearest distance
#' @importFrom methods is
#' @importFrom IRanges overlapsAny
#' @importFrom S4Vectors queryHits subjectHits
#' @importFrom BiocGenerics unlist
#'
#' @export
annotateRegions <- function(x,
hasOverlap = list(),
fracOverlap = list(),
numOverlap = list(),
nearest = list(),
ignore.strand = TRUE) {
## check arguments
stopifnot(exprs = {
# x
is(x, "GRanges")
# hasOverlap
is.list(hasOverlap)
length(hasOverlap) == 0L || !is.null(names(hasOverlap))
all(unlist(lapply(hasOverlap, is, "GRanges"))) || all(unlist(lapply(hasOverlap, is, "GRangesList")))
# fracOverlap
is.list(fracOverlap)
length(fracOverlap) == 0L || !is.null(names(fracOverlap))
all(unlist(lapply(fracOverlap, is, "GRanges"))) || all(unlist(lapply(fracOverlap, is, "GRangesList")))
# numOverlap
is.list(numOverlap)
length(numOverlap) == 0L || !is.null(names(numOverlap))
all(unlist(lapply(numOverlap, is, "GRanges"))) || all(unlist(lapply(numOverlap, is, "GRangesList")))
# nearest
is.list(nearest)
length(nearest) == 0L || !is.null(names(nearest))
all(unlist(lapply(nearest, is, "GRanges"))) || all(unlist(lapply(nearest, is, "GRangesList")))
all(unlist(lapply(nearest, function(x) !is.null(names(x)))))
# ignore.strand
is.logical(ignore.strand)
length(ignore.strand) == 1L
})
## annotate tiles
df <- S4Vectors::mcols(x)
## ... hasOverlap
for (i in seq_along(hasOverlap)) {
nm <- paste0(names(hasOverlap)[i], ".hasOverlap")
df[[nm]] <- IRanges::overlapsAny(query = x, subject = hasOverlap[[i]],
ignore.strand = ignore.strand)
}
## ... fracOverlap
for (i in seq_along(fracOverlap)) {
gr2 <- fracOverlap[[i]]
if (is(gr2, "GRangesList")) {
gr2 <- BiocGenerics::unlist(gr2)
}
ov <- GenomicRanges::findOverlaps(query = x, subject = gr2,
ignore.strand = ignore.strand)
tilesov <- pmin(GenomicRanges::end(x[S4Vectors::queryHits(ov)]),
GenomicRanges::end(gr2[S4Vectors::subjectHits(ov)])) -
pmax(GenomicRanges::start(x[S4Vectors::queryHits(ov)]),
GenomicRanges::start(gr2[S4Vectors::subjectHits(ov)])) + 1
tilesovSum <- tapply(tilesov, S4Vectors::queryHits(ov), sum)
posPerTile <- rep(0, length(x))
posPerTile[as.numeric(names(tilesovSum))] <- tilesovSum
nm <- paste0(names(fracOverlap)[i], ".fracOverlap")
df[[nm]] <- posPerTile / width(x)
}
## ... numOverlap
for (i in seq_along(numOverlap)) {
gr2 <- numOverlap[[i]]
nm1 <- paste0(names(numOverlap)[i], ".numOverlapWithin")
nm2 <- paste0(names(numOverlap)[i], ".numOverlapAny")
# remark: findOverlaps(..., type = "within") refers to the query
ov1 <- GenomicRanges::findOverlaps(query = gr2, subject = x,
type = "within",
ignore.strand = ignore.strand)
ov1perGr <- table(S4Vectors::subjectHits(ov1))
nwithin <- rep(0L, length(x))
nwithin[as.numeric(names(ov1perGr))] <- as.integer(ov1perGr)
df[[nm1]] <- nwithin
df[[nm2]] <- GenomicRanges::countOverlaps(query = x, subject = gr2,
type = "any",
ignore.strand = ignore.strand)
}
## ... nearest
for (i in seq_along(nearest)) {
gr2 <- nearest[[i]]
if (is(gr2, "GRangesList")) {
gr2 <- BiocGenerics::unlist(gr2)
}
nm1 <- paste0(names(nearest)[i], ".nearestName")
nm2 <- paste0(names(nearest)[i], ".nearestDistance")
j <- GenomicRanges::nearest(x = x, subject = gr2, select = "arbitrary",
ignore.strand = ignore.strand)
df[[nm1]] <- names(gr2)[j]
dists <- rep(NA_integer_, length(x))
ok <- !is.na(j)
dists[ok] <- GenomicRanges::distance(x = x[ok], y = gr2[j[ok]],
ignore.strand = ignore.strand)
df[[nm2]] <- dists
}
## return result
S4Vectors::mcols(x) <- df
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.