#' Calculate metadata profiles across GRanges or GRangesList
#'
#' This function calculates coverage profiles across genomic regions.
#' @param query A single stranded \code{\link[GenomicRanges:GenomicRanges-class]{GenomicRanges}} or \code{\link[GenomicRanges:GRangesList-class]{GenomicRanges}} objects, possibly named.
#' @param subject A \code{\link[IRanges:SimpleRleList-class]{IRanges}}, usually generated by the \code{\link[IRanges:coverage-methods]{coverage}} function.
#' @param nbin A single integer. It determines the number of bins in which each region will be divided.
#' @return A \code{\link[data.table:data.table-class]{data.table}} of the normalised binned coverage across the genomic regions.
#'
#' @import IRanges
#' @import BiocGenerics
#' @import GenomicRanges
#' @import GenomicFeatures
#' @importFrom data.table ":=" ".N" ".GRP" ".I" data.table as.data.table setkey setkeyv setnames rbindlist
#' @export
#' @examples
#' library("TxDb.Dmelanogaster.UCSC.dm3.ensGene")
#' TxDb = TxDb.Dmelanogaster.UCSC.dm3.ensGene
#'
#' query = keepSeqlevels(exonsBy(TxDb, by="tx", use.names=TRUE), "chr4", pruning.mode="coarse")
#' query = query[strand(query)=="+"]
#' query = query[sum(width(query))>0]
#'
#' library("pasillaBamSubset")
#' library("GenomicAlignments")
#'
#' fl1 <- untreated1_chr4()
#' subject = coverage(keepSeqlevels(readGAlignments(fl1), "chr4"))
#'
#' profile = profileStrandedRegions(query, subject)
#' profile = profile[, list(value = mean(value)), by="bin"]
#'
#' library("ggplot2")
#'
#' ggplot(profile, aes(x=bin, y=value)) +
#' geom_line() +
#' scale_x_continuous("Position") +
#' scale_y_continuous("Average normalised coverage") +
#' theme_bw()
profileStrandedRegions <- function(query, subject, nbin=100) {
# If needed, convert the GRangesList to a GRanges object
if( class(query)[1]%in%c("CompressedGRangesList", "GRangesList") ){
if( is.null(names(query)) ){
names(query) = paste("region", seq_along(query), sep="_")
}
subset_stranded = unlist(query, use.names=FALSE)
if( is.null(names(subset_stranded)) ){
subset_stranded = unlist(query, use.names=TRUE)
}else{
# Note: Although is only a check, the line below slows down a lot the execution
names(subset_stranded) = rep(names(query), sapply(query, length))
}
}else if( class(query)[1]%in%c("GenomicRanges", "GRanges") ){
if( is.null(names(query)) )
paste("region", seq_along(query), sep="_")
subset_stranded = query
}else{
stop("Query is not a GRanges or GRangesList object")
}
# Add Region identifiers
subset_stranded$region_id = names(subset_stranded)
# Add information to trace back each region and fragment
subset_dt = data.table(region_id = subset_stranded$region_id)
subset_dt[, fragment_id := .I]
subset_dt[, region_index := .GRP, by="region_id"]
setkey(subset_dt, fragment_id)
# Calculate the coverage and correct orientation
results = as.data.table(subject[subset_stranded], key="group")
if( unique(unlist(strand(subset_stranded), use.names=FALSE))=="+" ){
results[, value := as.numeric(value)][, pos := 1:.N, by="group"]
}else{
results[, value := as.numeric(value)][, pos := .N:1, by="group"]
}
setkey(results, group)
# Order regions and fragments and add relative position
results = results[subset_dt, nomatch=0]
results = results[order(region_index, group, pos)]
results[, pos := 1:.N, by="region_index"]
# Create binning and normalise over region bin width
results[, bin := findInterval(pos, seq(0.5, max(pos)+0.5, length.out=nbin+1)), by="region_index"]
results[, value := value/(max(pos)/nbin), by="region_id"]
results = results[, list(value = sum(value)), by=c("region_id", "bin")]
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.