# R/profileStrandedRegions.R

Defines functions profileStrandedRegions

Documented in profileStrandedRegions

#' Calculate coverage profiles across GRanges or GRangesList regions
#'
#' This function calculates binned, strand-aware coverage profiles across genomic regions.
#' @param query A single-stranded \code{\link[GenomicRanges:GenomicRanges-class]{GRanges}} or \code{\link[GenomicRanges:GRangesList-class]{GRangesList}} object, possibly named. All regions must lie on the same strand.
#' @param subject An \code{\link[IRanges:SimpleRleList-class]{RleList}} of coverage, usually generated by the \code{\link[IRanges:coverage-methods]{coverage}} function.
#' @param nbin A single integer giving the number of bins into which each region is divided (default 100).
#' @return A \code{\link[data.table:data.table-class]{data.table}} of the normalised binned coverage across the genomic regions, with columns \code{region_id}, \code{bin} and \code{value}.
#'
#' @import IRanges
#' @import BiocGenerics
#' @import GenomicRanges
#' @import GenomicFeatures
#' @importFrom data.table ":=" ".N" ".GRP" ".I" data.table as.data.table setkey setkeyv setnames rbindlist
#' @export
#' @examples
#' library("TxDb.Dmelanogaster.UCSC.dm3.ensGene")
#' TxDb = TxDb.Dmelanogaster.UCSC.dm3.ensGene
#' 
#' query = keepSeqlevels(exonsBy(TxDb, by="tx", use.names=TRUE), "chr4", pruning.mode="coarse")
#' query = query[strand(query)=="+"]
#' query = query[sum(width(query))>0]
#'
#' library("pasillaBamSubset")
#' library("GenomicAlignments")
#' 
#' fl1 <- untreated1_chr4()
#' subject = coverage(keepSeqlevels(readGAlignments(fl1), "chr4"))
#'
#' profile = profileStrandedRegions(query, subject)
#' profile = profile[, list(value = mean(value)), by="bin"]
#'
#' library("ggplot2")
#'
#' ggplot(profile, aes(x=bin, y=value)) + 
#'   geom_line() +
#'   scale_x_continuous("Position") +
#'   scale_y_continuous("Average normalised coverage") +
#'   theme_bw()
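#'
#' # A further sketch, not part of the original example: the same preparation
#' # for minus-strand transcripts; the function reverses positions internally,
#' # so the resulting profile is still reported 5' to 3'
#' query_minus = keepSeqlevels(exonsBy(TxDb, by="tx", use.names=TRUE), "chr4", pruning.mode="coarse")
#' query_minus = query_minus[strand(query_minus)=="-"]
#' query_minus = query_minus[sum(width(query_minus))>0]
#' profile_minus = profileStrandedRegions(query_minus, subject)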
   
profileStrandedRegions <- function(query, subject, nbin=100) {

      # If needed, convert the GRangesList to a GRanges object
      if( class(query)[1]%in%c("CompressedGRangesList", "GRangesList") ){
         if( is.null(names(query)) ){
            names(query) = paste("region", seq_along(query), sep="_")
         }
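         # Unlist the GRangesList to a GRanges, making sure each range ends up
         # named after the region it comes from so it can be traced back later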
         subset_stranded = unlist(query, use.names=FALSE)
         if( is.null(names(subset_stranded)) ){
            subset_stranded = unlist(query, use.names=TRUE)
         }else{
            # Note: although this is only a check, the line below considerably slows down execution
            names(subset_stranded) = rep(names(query), sapply(query, length))
         }
      }else if( class(query)[1]%in%c("GenomicRanges", "GRanges") ){
         if( is.null(names(query)) )
            names(query) = paste("region", seq_along(query), sep="_")
         subset_stranded = query
      }else{
         stop("Query is not a GRanges or GRangesList object")
      }
      # Add Region identifiers
      subset_stranded$region_id = names(subset_stranded)

      # Add information to trace back each region and fragment
      subset_dt = data.table(region_id = subset_stranded$region_id)
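      # .I assigns a sequential id to every unlisted range (fragment), matching
      # the order of the 'group' column in the coverage table built below;
      # .GRP assigns one index per region, in order of first appearance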
      subset_dt[, fragment_id := .I]
      subset_dt[, region_index := .GRP, by="region_id"]
      setkey(subset_dt, fragment_id)

      # Calculate the per-position coverage and orient it 5' to 3'
      # ('group' indexes the ranges in the same order as fragment_id)
      results = as.data.table(subject[subset_stranded], key="group")
      region_strand = unique(as.character(strand(subset_stranded)))
      if( length(region_strand)!=1 )
         stop("Query regions must all be on the same strand")
      if( region_strand=="+" ){
            results[, value := as.numeric(value)][, pos := 1:.N, by="group"]
      }else{
            results[, value := as.numeric(value)][, pos := .N:1, by="group"]
      }
      setkey(results, group)

      # Order regions and fragments and add relative position
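      # The join matches results' key (group) with subset_dt's key (fragment_id),
      # attaching region_id and region_index to every coverage position; pos is
      # then recomputed so that it runs 5' to 3' across the whole region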
      results = results[subset_dt, nomatch=0]
      results = results[order(region_index, group, pos)]
      results[, pos := 1:.N, by="region_index"]

      # Create binning and normalise over region bin width
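      # findInterval() splits positions 1..max(pos) into nbin equal-width bins;
      # dividing by the bin width (max(pos)/nbin) before summing per bin yields
      # the average coverage per bin, comparable across regions of any length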
      results[, bin := findInterval(pos, seq(0.5, max(pos)+0.5, length.out=nbin+1)), by="region_index"]
      results[, value := value/(max(pos)/nbin), by="region_id"]
      results = results[, list(value = sum(value)), by=c("region_id", "bin")]
      
      return(results)
}