R/getGRegionsStat2.R

Defines functions getGRegionsStat2

Documented in getGRegionsStat2

#' @rdname getGRegionsStat2
#' @title Statistic of Genomic Regions
#' @description A function to estimate the summarized measures of a specified
#'     variable given in a GRanges object (a column from the metacolums of the
#'     GRanges object) after split the GRanges object into intervals.
#' @details This function split a Grange object into intervals genomic regions
#'     (GRs) of fixed size A summarized statistic (mean, median, geometric mean
#'     or sum) is calculated for the specified variable values from each region.
#'     Notice that if win.size == step.size, then non-overlapping windows are
#'     obtained.
#' @param GR A GRange object carying the variables of interest in the 
#'     GRanges metacolumn.
#' @param win.size An integer for the size of the windows/regions size of the
#'     intervals of genomics regions.
#' @param step.size Interval at which the regions/windows must be defined
#' @param grfeatures A GRanges object corresponding to an annotated genomic
#'     feature. For example, gene region, transposable elements, exons,
#'     intergenic region, etc. If provided, then parameters 'win.size' and
#'     step.size are ignored and the statistics are estimated for 'grfeatures'.
#' @param stat Statistic used to estimate the summarized value of the variable
#'     of interest in each interval/window. Posible options are: "mean",
#'     geometric mean ("gmean"), "median", "density", "count", "denCount" and
#'     "sum" (default). Here, we define "density" as the sum of values from the
#'     variable of interest in the given region devided by the length of the
#'     region. The option 'count' compute the number/count of positions in the
#'     specified regions with values greater than zero in the selected 'column'.
#'     The statistic "denCount" is defined as the "count" in the given region 
#'     devided by the length of the region.
#' @param absolute Optional. Logic (default: FALSE). Whether to use the absolute
#'     values of the variable provided. For example, the difference of
#'     methylation levels could take negative values (TV) and we would be
#'     interested on the sum of abs(TV), which is sum of the total variation
#'     distance.
#' @param select.strand Optional. If provided,"+" or "-", then the summarized
#'     statistic is computed only for the specified DNA chain.
#' @param maxgap,minoverlap,type See 
#'     \code{\link[IRanges]{findOverlaps-methods}} in the
#'     \strong{IRanges} package for a description of these arguments.
#' @param ignore.strand When set to TRUE, the strand information is ignored in
#'     the overlap calculations.
#' @param scaling integer (default 1). Scaling factor to be used when
#'     stat = "density". For example, if scaling = 1000, then density * scaling
#'     denotes the sum of values in 1000 bp.
#' @param logbase A positive number: the base with respect to which logarithms
#'     are computed when parameter 'entropy = TRUE' (default: logbase = 2).
#' @param missings Whether to write '0' or 'NA' on regions where there is not
#'     data to compute the statistic.
#' @param naming Logical value. If TRUE, the rows GRanges object will be 
#'     given the names(grfeatures). Default is FALSE.
#' @param na.rm Logical value. If TRUE, the NA values will be removed.
#' @param verbose Logical. Default is TRUE. If TRUE, then the progress of the
#'     computational tasks is given.
#' @return A GRanges object with the new genomic regions and their corresponding
#'     summarized statistic.
#' @examples
#' library(GenomicRanges)
#' set.seed(1)
#' gr <- GRanges(seqnames = Rle( c("chr1", "chr2", "chr3", "chr4"),
#'             c(5, 5, 5, 5)),
#'             ranges = IRanges(start = 1:20, end = 1:20),
#'             strand = rep(c("+", "-"), 10),
#'             A = seq(1, 0, length = 20))
#' gr$B <- runif(20)
#' grs <- getGRegionsStat2(gr, win.size = 4, step.size = 4)
#' grs
#' 
#' ## Selecting the positive strand
#' grs <- getGRegionsStat2(gr, win.size = 4, step.size = 4, select.strand = "+")
#' grs
#' 
#' ## Selecting the negative strand
#' grs <- getGRegionsStat2(gr, win.size = 4, step.size = 4, select.strand = "-")
#' grs
#' 
#' @importFrom GenomeInfoDb seqnames seqlengths seqlevels
#' @importFrom GenomicRanges GRanges findOverlaps
#' @importFrom IRanges IRanges width
#' @importFrom stats median
#' @importFrom dplyr group_by summarise_all '%>%'
#' @importFrom S4Vectors subjectHits queryHits DataFrame mcols 
#' @importFrom S4Vectors mcols<-
#' @importFrom BiocGenerics strand start end
#' @importFrom MethylIT uniqueGRanges sortBySeqnameAndStart unlist
#' @importFrom utils txtProgressBar
#' @seealso \code{\link{getGRegionsStat}}
#' @author Robersy Sanchez (\url{https://github.com/genomaths}).
#' @export
getGRegionsStat2 <- function(GR, win.size=350, step.size=350, grfeatures=NULL,
            stat = c("sum", "mean", "gmean", "median", "density", "count",
                   "denCount"),
            columns = NULL, absolute = FALSE, select.strand = NULL, maxgap =-1L, 
            minoverlap = 0L, select = "all", ignore.strand = FALSE, 
            type = c("any", "start", "end", "within", "equal"), scaling = 1000L, 
            logbase = 2, missings = 0, naming = FALSE, na.rm = TRUE, 
            verbose = TRUE, ...) {
   
   ## These NULL quiet: no visible binding for global variable 'x2'
   if (class( GR ) != "GRanges") stop( "GR object must be a GRanges object!")
   if (!is.null(grfeatures) && !inherits(grfeatures,"GRanges")) {
       stop("* 'grfeatures', if provided, must be a GRanges object")
   }
   stat <- match.arg(stat, c("sum", "mean", "gmean", "median", "density", 
                               "count", "denCount"))
       
   if (!is.element(missings, c(0, NA))) missings <- NA
       
   type <- match.arg(type, c("any", "start", "end", "within", "equal"))

   ## === Some functions to use ===
   stats <- function(x, stat = c(), absolute, na.rm) {
           if (absolute) x = abs(x)
           x <- switch(stat,
                       count = sum(x != 0, na.rm = na.rm),
                       sum = sum(x, na.rm = na.rm),
                       mean = mean(x, na.rm = na.rm),
                       gmean = exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)),
                       median = median(x, na.rm = na.rm),
                       density = sum(x, na.rm = na.rm),
                       denCount = sum(x != 0, na.rm = na.rm))
   }
   fn <- function(x) stats(x, stat = stat, absolute = absolute, na.rm = na.rm)

   ## =============================== ##
   if (!is.null(select.strand)) {
       ## possible values "-", "+", NULL
       if (!is.element(select.strand, unique(strand(GR)))) {
           stop("* The GRanges object does not have strand named ", "'",
               select.strand, "'")
       }
       idx <- which(as.character(strand(GR)) == select.strand)
       GR <- GR[idx]
   }
   chrs <- as.character(unique(seqnames(GR)))
   
   ## Progress bar
   if(verbose) {
      # setup progress bar
      pb <- txtProgressBar(max = 100, style = 3) 
      on.exit(close(pb)) # on exit, close progress bar
   }
   
   ## === If genomic features are not specified ===
   if (is.null(grfeatures)) {
       if (length(GR) < win.size || length(GR) < step.size) 
           stop("* 'GR'length is lesser of 'win.size' or 'step.size'")
       if(verbose) setTxtProgressBar(pb, 1) # update progress bar
      
       all.wins <- sapply(seq_len(length(chrs)), function(k) {
                   ## get max length of chromosome
                   max.length <- max(IRanges::end(GR[seqnames(GR) == chrs[k],]))
                   ## get sliding windows
                   numTiles <- floor((max.length -
                                   (win.size - step.size)) / step.size) + 1
                   ranges <- IRanges(start=(1 + 0:(numTiles - 1) * step.size),
                                   width=rep(win.size, numTiles))
                   temp.wins <- GRanges(seqnames = rep(chrs[k], numTiles),
                                   ranges = ranges)
                   return(temp.wins)} 
       )
           
       all.wins <- MethylIT::unlist(all.wins)
               
       ## sites of interest inside of the windows
       Hits <- findOverlaps(GR, all.wins, maxgap = maxgap,
                           minoverlap = minoverlap, select = select,
                           ignore.strand = ignore.strand, type = type)
       if (length(Hits) > 0) {
           m <- ncol(mcols(GR))
           if (m  > 1) {
               mcols(all.wins) <- matrix(missings, nrow = length(all.wins),
                                           ncol = m)
           } else mcols(all.wins) <- missings
           all.wins <- all.wins[subjectHits(Hits)]
           GR <- GR[queryHits(Hits)]
           mcols(all.wins) <- mcols(GR)
           chr <- seqnames(all.wins)

           ## Variable to mark the limits of each GR
           all.wins$cluster.id <- paste(chr, start(all.wins),
                                        end(all.wins), sep = "_")
           GR <- all.wins; rm(all.wins); gc()
           GR <- as.data.frame(GR)
           GR <- GR[, -c(1:5)]
           if(verbose) setTxtProgressBar(pb, 25) # update progress bar
           
           GR <- GR %>% group_by(cluster.id) %>% summarise_all(list(fn))
           
           if(verbose) setTxtProgressBar(pb, 75) # update progress bar
           
           strands <- matrix(unlist(strsplit(GR$cluster.id, "_")), 
                               ncol = 3, byrow = TRUE)
           GR <- data.frame(GR[, -1], chr = strands[,1], 
                           start = as.numeric(strands[, 2]),
                           end = as.numeric(strands[, 3]), strand =  "*")
           GR <- makeGRangesFromDataFrame(GR, keep.extra.columns = TRUE)
           if (stat == "density" || stat == "denCount") {
              widths <- width(GR)
              mcols(GR) <- (scaling * as.matrix(mcols(GR))/widths)
           }
       } else {
               m <- ncol(mcols(GR))
               if (m  > 1) {
                   mcols(all.wins) <- matrix(missings, nrow = length(all.wins), 
                                           ncol = m)
               } else mcols(all.wins) <- missings
               colnames(mcols(all.wins)) <- colnames(mcols(GR))
               GR <- all.wins
               warnings("There is not overlap between the 'GR' & the regions")
       }
   } else {
       ## sites of interest inside of the windows
       if(verbose) setTxtProgressBar(pb, 1) # update progress bar
      
       Hits <- findOverlaps(GR, grfeatures, maxgap=maxgap, select = select,
                           minoverlap=minoverlap, ignore.strand=ignore.strand,
                           type=type)
       if (length(Hits) > 0) {
           m <- ncol(mcols(GR))
           if (m > 1) {
               mcols(grfeatures) <- matrix(missings, nrow = length(grfeatures), 
                                           ncol = m)
           }
           else mcols(grfeatures) <- missings
           grfeatures <- grfeatures[subjectHits(Hits)]
           GR <- GR[queryHits(Hits)]
           mcols(grfeatures) <- mcols(GR)
           chr <- seqnames(grfeatures)
               
           if (class(names(grfeatures)) == "character" && naming) 
               grfeatures$cluster.id <- paste(chr, start(grfeatures),
                                               end(grfeatures), 
                                               names(grfeatures), sep = "_")
           else grfeatures$cluster.id  <- paste(chr, start(grfeatures),
                                               end(grfeatures), sep = "_")
               
           GR <- grfeatures; rm(grfeatures); gc()
           GR <- as.data.frame(GR)
           GR <- GR[, -c(1:5)]
           if(verbose) setTxtProgressBar(pb, 25) # update progress bar
           
           GR <- GR %>% group_by(cluster.id) %>% summarise_all(list(fn))
           
           if(verbose) setTxtProgressBar(pb, 75) # update progress bar
           if (naming) strands <- matrix(unlist(strsplit(GR$cluster.id, "_")), 
                                           ncol = 4, byrow = TRUE)
           else strands <- matrix(unlist(strsplit(GR$cluster.id, "_")), 
                                  ncol = 3, byrow = TRUE)
           GR <- data.frame(GR[, -1], chr = strands[,1], 
                           start = as.numeric(strands[, 2]),
                           end = as.numeric(strands[, 3]), strand =  "*")
           GR <- makeGRangesFromDataFrame(GR, keep.extra.columns = TRUE)
           if (naming) names(GR) <- strands[, 4]
           if (stat == "density") {
              widths <- width(GR)
              mcols(GR) <- (scaling * as.matrix(mcols(GR))/widths)
           }
       } else {
           m <- ncol(mcols(GR))
           if (m > 1) {
               mcols(grfeatures) <- matrix(missings, nrow = length(grfeatures), 
                                           ncol = m)
           } else mcols(grfeatures) <- missings
           colnames(mcols(grfeatures)) <- colnames(mcols(GR))
           GR <- grfeatures
           warnings("There is not overlap between the 'GR' & the regions")
       }
   }
   GR <- sortBySeqnameAndStart(GR)
   if(verbose) setTxtProgressBar(pb, 100) # update progress bar
   return(GR)
}
genomaths/MethylIT.utils documentation built on Nov. 26, 2019, 2:10 a.m.