R/getWinOverlapGRanges.R

Defines functions getWinOverlapGRanges

Documented in getWinOverlapGRanges

#' @title Get the sliding windows that overlap a GRanges object
#' 
#' @description Get the sliding windows that overlap a GRanges object. 
#' 
#' @details 
#' This finds the windows that overlaps the positive/negative strand of a 
#' GRanges object. The GRanges object, which is \code{mustKeepRanges} in the 
#' \code{filterDNA} method, defines the coordinates of the ranges in the 
#' reference genome that all reads mapped to those ranges must be kept by the 
#' filtering method \code{filterDNA}. 
#' This method makes use of the method \code{getWinOverlapEachIRange} by 
#' pretending each given range as the range of a read. Since the widths of 
#' \code{x} are not necessarily the same (as normal read lengths), we
#' use \code{nbOverlapBases} to specify the minimum number of bases that a 
#' window should overlap with a range of \code{x}, instead of using proprotion 
#' as \code{readProp} in \code{getWinOverlapEachIRange}.
#' 
#' @param x a GRanges object, which defines the coordinates of 
#' the ranges in the reference genome that all reads mapped to those ranges
#' must be kept by the filtering method \code{filterDNA}.
#' @param seqInfo a data frame that contains some key information of the 
#' sequences
#' @param winWidth the length of the sliding window, 1000 by default.
#' @param winStep the step length to sliding the window, 100 by default.
#' @param nbOverlapBases a window is considered to overlap with a range of 
#' \code{x} if it overlaps with at least \code{nbOverlapBases} bases.
#' @return A list of two logical vectors (for positive and negative strand) 
#' defining which windows that overlap with the given GRanges object.
#' @export
#' @importFrom GenomicRanges start<-
#' @importFrom GenomicRanges end<-
#' @importFrom GenomicRanges ranges<-
#' @importFrom BiocGenerics strand
#' @importFrom GenomeInfoDb seqlevels
#' @examples 
#' library(GenomicRanges)
#' x <- GRanges(seqnames = "10",ranges = IRanges(start = c(10000,15000),
#' end=c(20000,30000)),strand = c("+","-"))
#' seqInfo <- data.frame("Sequence"=10,"FirstBaseInPart"=1)
#' getWinOverlapGRanges(x,seqInfo)
#' seqInfo <- data.frame("Sequence"=10,"FirstBaseInPart"=10000000)
#' getWinOverlapGRanges(x,seqInfo)


getWinOverlapGRanges <- function(
    x, seqInfo, winWidth = 1000L, winStep = 100L, nbOverlapBases = 1
) 
{   
    # Check the correct columns are in the seqInfo df
    reqCols <- c("Sequence","FirstBaseInPart")
    if (!all(reqCols %in% names(seqInfo))) {
        stop("seqInfo must contain the column ", reqCols)
    }
    stopifnot(is.numeric(winWidth) || is.numeric(winStep))
    
    # Calculate start/end position of each sequence in the partition
    for (i in seq_along(seqInfo$Sequence)) {
        r <- which(as.vector(seqnames(x)) == seqInfo$Sequence[i])
        if (length(r) > 0) {
            end(ranges(x)[r]) <- 
                end(ranges(x)[r]) + seqInfo$FirstBaseInPart[i] - 1
            start(ranges(x)[r]) <- 
                start(ranges(x)[r]) + seqInfo$FirstBaseInPart[i] - 1
        }
    }
    
    # Calculate the windows that overlap the '+' ranges of x
    mustKeepPos <- getWinOverlapEachIRange(
        ranges(x)[strand(x) != "-", ], winWidth, winStep, 
        readProp = nbOverlapBases/width(x)
        )
    mustKeepPos <- coverage(mustKeepPos) > 0
    
    # Calculate the windows that overlap the '-' ranges of x
    mustKeepNeg <- getWinOverlapEachIRange(
        ranges(x)[strand(x) != "+", ], winWidth, winStep, 
        readProp = nbOverlapBases/width(x)
        )
    mustKeepNeg <- coverage(mustKeepNeg) > 0
    
    list(Positive = mustKeepPos, Negative = mustKeepNeg)
}
UofABioinformaticsHub/rnaCleanR documentation built on Aug. 11, 2021, 11:51 p.m.