R/binOverFeature.R

Defines functions binOverFeature

Documented in binOverFeature

#' Aggregate peaks over bins from the TSS
#' 
#' Aggregate peaks over bins from the feature sites.
#' 
#' 
#' @param \dots Objects of GRanges to be analyzed
#' @param annotationData An object of
#' \link[GenomicRanges:GRanges-class]{GRanges} or \link{annoGR} for annotation
#' @param select Logical: annotate the peaks to all features or the nearest one
#' @param radius The radius of the longest distance to feature site
#' @param nbins The number of bins
#' @param minGeneLen The minimal gene length
#' @param aroundGene Logical: count peaks around features or a given site of
#' the features.  Default = FALSE
#' @param mbins if aroundGene set as TRUE, the number of bins intra-feature.
#' The value will be normalized by value * (radius/genelen) * (mbins/nbins)
#' @param featureSite which site of features should be used for distance
#' calculation
#' @param PeakLocForDistance which site of peaks should be used for distance
#' calculation
#' @param FUN the function to be used for score calculation
#' @param errFun the function to be used for errorbar calculation or values for
#' the errorbar.
#' @param xlab titles for each x axis
#' @param ylab titles for each y axis
#' @param main overall titles for each plot
#' @return A data.frame with bin values.
#' @author Jianhong Ou
#' @keywords misc
#' @export
#' @import GenomicRanges
#' @importFrom BiocGenerics strand start end width score
#' @importFrom graphics segments axis abline
#' @examples
#' 
#' bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
#' gr1 <- toGRanges(bed, format="BED", header=FALSE)
#' data(TSS.human.GRCh37)
#' binOverFeature(gr1, annotationData=TSS.human.GRCh37,
#'                radius=5000, nbins=10, FUN=length, errFun=0)
#' 
binOverFeature <- function(..., annotationData=GRanges(),
                           select=c("all", "nearest"),
                           radius=5000L, nbins=50L,
                           minGeneLen=1L, aroundGene=FALSE, mbins=nbins, 
                           featureSite=c("FeatureStart", "FeatureEnd", 
                                         "bothEnd"),
                           PeakLocForDistance=c("all", "end", 
                                                "start", "middle"), 
                           FUN=sum, errFun=sd, xlab, ylab, main)
{
    ###check inputs
    PeaksList <- list(...)
    isGRangesList <- FALSE
    if(length(PeaksList)==1){
        if(is(PeaksList[[1]], "GRangesList")){
            PeaksList <- PeaksList[[1]]
            names <- names(PeaksList)
            isGRangesList <- TRUE
        }
    }
    ##save dots arguments names
    if(!isGRangesList){
        dots <- substitute(list(...))[-1]
        names <- unlist(sapply(dots, deparse))
    }
    
    n <- length(PeaksList)
    if(!n){
        stop("Missing required argument Peaks!")
    }else{
        nr <- ceiling(sqrt(n))
        nc <- ceiling(n/nr)
        op <- par(mfrow=c(nr, nc))
        on.exit(par(op))
    }
    if (missing(annotationData)) {
        stop("No AnnotationData as GRanges or annoGR is passed in.")
    }
    if(!inherits(annotationData, c("GRanges", "annoGR")) || 
           length(annotationData)<1){
        stop("No AnnotationData as GRanges or annoGR is passed in.")
    }
    if(is(annotationData, "annoGR"))
        annotationData <- as(annotationData, "GRanges")
    annotationData <- unique(annotationData)
    if (!all(as.character(strand(annotationData)) %in% c("+", "-", "*")))
        stop("strands of annotationData must be +, - or *")
    select <- match.arg(select)
    featureSite <- match.arg(featureSite)
    PeakLocForDistance <- match.arg(PeakLocForDistance)
    if(!is.list(FUN)) FUN <- list(FUN)
    if(!is.list(errFun)) errFun <- list(errFun)
    lapply(FUN, function(fun){
        if(mode(fun)!="function")
            stop("The mode of FUN must be function. 
                 The FUN could be any function such as 
                 median, mean, sum, length, ...")
    })
    lapply(errFun, function(fun){
        if(mode(fun)!="function" & !is.numeric(fun))
            stop("The mode of errFun must be function. 
                 The errFun could be any function such as sd")
    })
    
    annotatedPeaksList<-lapply(PeaksList, function(Peaks){
        if (!inherits(Peaks, "GRanges")) {
            stop("No valid Peaks passed in. It needs to be GRanges object")
        }
        if(is.null(score(Peaks))){
            message("score of GRanges object is required for calculation. 
                    It will be the input of FUN. Setting score as 1.")
            Peaks$score <- 1
        }
        ## annotate the peaks
        if(select=="all"){
            annotatedPeaks <- 
                annotatePeakInBatch(Peaks, AnnotationData=annotationData,
                                    output="overlapping", maxgap = radius, 
                                    select="all") 
        }else{
            annotatedPeaks <- 
                annotatePeakInBatch(Peaks, AnnotationData=annotationData,
                                    output="nearestLocation", select="all")
        }
        ## filter the annotation
        annotatedPeaks <- annotatedPeaks[!is.na(annotatedPeaks$feature_strand)]
        annotatedPeaks <- 
            annotatedPeaks[
                as.numeric(as.character(annotatedPeaks$end_position))-
                    as.numeric(as.character(annotatedPeaks$start_position))+1>=
                    minGeneLen]
        
        ###if insideFeature==inside or includeFeature, 
        ###featureSite=="bothEnd.intergenic", the annotation should be removed
        #    if(featureSite=="bothEnd.intergenic") 
        #      annotatedPeaks <- 
        #        annotatedPeaks[!annotatedPeaks$insideFeature %in% 
        #                c("inside", "includeFeature"),]
        annotatedPeaks
    })
    
    plotErrBar <- function(x, y, err){
        yplus <- y+err
        yminus <- y-err
        segments(x, yminus, x, yplus)
        xcoord <- par()$usr[1:2]
        smidge <- 0.015*(xcoord[2]-xcoord[1])/2
        segments(x-smidge, yminus, x+smidge, yminus)
        segments(x-smidge, yplus, x+smidge, yplus)
    }
    
    if(missing(xlab)) {
        xlab <- if(aroundGene){
            paste("Bins from", featureSite)
        }else{
            paste("distance from", featureSite)
        }
    }
    if(missing(ylab)) ylab <- "Score" 
    if(missing(main)) main <- paste(names, "binding over", featureSite)
    
    binValue <- mapply(function(annotatedPeaks, fun, errfun,
                                xlab.ele, ylab.ele, main.ele){
        ##genelength
        genelen <- 
            annotatedPeaks$end_position - annotatedPeaks$start_position + 1
        ## change the function if it is length
        if(identical(fun, length)){
            fun <- sum
            annotatedPeaks$score <- 1
        }
        ###step 1 calculate the distance,
        strand <- annotatedPeaks$feature_strand=="-"
        if(PeakLocForDistance=="all"){
            ##dist, the start distance to feature loc, dist2, 
            ##the end distance to feature loc
            PeakLoc.start <- start(annotatedPeaks)
            PeakLoc.end <- end(annotatedPeaks)
            if(featureSite=="bothEnd"){
                ##bothEnd, TODO
                stop("Sorry, can not handle the combination of 
                     PeakLocForDistance=='all' AND featureSite=='bothEnd'")
            }else{
                FeatureLoc<-
                    switch(featureSite,
                           FeatureStart=ifelse(strand, 
                                               annotatedPeaks$end_position, 
                                               annotatedPeaks$start_position),
                           FeatureEnd=ifelse(strand, 
                                             annotatedPeaks$start_position, 
                                             annotatedPeaks$end_position),
                           0)
                dist1 <- ifelse(strand, 
                                FeatureLoc-PeakLoc.end, 
                                PeakLoc.start-FeatureLoc)
                dist2 <- ifelse(strand, 
                                FeatureLoc-PeakLoc.start, 
                                PeakLoc.end-FeatureLoc)
                score <- score(annotatedPeaks)
                weight <- (radius * mbins)/(genelen * nbins)
                if(aroundGene){
                    if(featureSite=="FeatureStart"){
                        ibin1 <- 
                            ifelse(dist1<0, 
                                   nbins+floor(dist1*nbins/radius),
                                   ifelse(dist1<genelen, 
                                          nbins+floor(dist1*mbins/genelen), 
                                          nbins+mbins+
                                              floor((dist1-genelen)*
                                                        nbins/radius)))
                        ibin2 <- 
                            ifelse(dist2<0, 
                                   nbins+floor(dist2*nbins/radius),
                                   ifelse(dist2<genelen, 
                                          nbins+floor(dist2*mbins/genelen), 
                                          nbins+mbins+
                                              floor((dist2-genelen)*
                                                        nbins/radius)))
                        b <- c(-1*c(nbins:1),0:(mbins+nbins-1))
                        distance2 <- -1*radius
                        distance1 <- genelen+radius
                    }else{##featureSite=="FeatureEnd"
                        ibin1 <- 
                            ifelse(dist1>=0, 
                                   nbins+mbins+floor(dist1*nbins/radius),
                                   ifelse(dist1>-1*genelen, 
                                          nbins+mbins+
                                              floor(dist1*mbins/genelen), 
                                          nbins+
                                              floor((dist1+genelen)*
                                                        nbins/radius)))
                        ibin2 <- 
                            ifelse(dist2>=0, 
                                   nbins+mbins+floor(dist2*nbins/radius),
                                   ifelse(dist2>-1*genelen, 
                                          nbins+mbins+
                                              floor(dist2*mbins/genelen), 
                                          nbins+
                                              floor((dist2+genelen)*
                                                        nbins/radius)))
                        b <- c(-1*c((mbins+nbins):1), 0:(nbins-1))
                        distance2 <- -1*(radius+genelen)
                        distance1 <- radius
                    }
                    numbins <- 2*nbins + mbins
                    b.type <- c(rep(FALSE, nbins), 
                                rep(TRUE, mbins), 
                                rep(FALSE, nbins))
                }else{
                    ibin1 <- round(nbins+floor(dist1*nbins/radius))
                    ibin2 <- round(nbins+floor(dist2*nbins/radius))
                    b <- c(-1*c(nbins:1), 0:(nbins-1))
                    numbins <- 2*nbins
                    distance2 <- -1*radius
                    distance1 <- radius
                    genelen <- 0
                    minGeneLen <- -1
                    b.type <- rep(FALSE, 2*nbins)
                }
                ids <- dist2>=distance2 & 
                    dist1<distance1 & 
                    genelen > minGeneLen
                score <- score[ids]
                weight <- weight[ids]
                ibin1 <- ibin1[ids]
                ibin2 <- ibin2[ids]
                ibin1[ibin1<0] <- 0
                ibin2[ibin2>numbins] <- numbins
                gps <- lapply(0:(numbins-1), function(.id){
                    ##split the scores
                    .idx <- ibin1<=.id & ibin2>=.id
                    if(b.type[.id+1]){
                        score[.idx] * weight[.idx]
                    }else{
                        score[.idx]
                    }
                })
                names(gps) <- formatC(1:numbins, 
                                      width=nchar(numbins), 
                                      flag="0")
            }
        }else{##PeakLocForDistance %in% start, middle, end
            PeakLoc <- 
                switch(PeakLocForDistance,
                       middle=round(rowMeans(cbind(start(annotatedPeaks), 
                                                   end(annotatedPeaks)))),
                       start=start(annotatedPeaks),
                       end=end(annotatedPeaks),
                       0)
            if(featureSite=="bothEnd"){##only consider outside of bothEnd
                FeatureStart=ifelse(strand, 
                                    annotatedPeaks$end_position, 
                                    annotatedPeaks$start_position)
                FeatureEnd=ifelse(strand, 
                                  annotatedPeaks$start_position, 
                                  annotatedPeaks$end_position)
                dist1 <- ifelse(strand, 
                                FeatureStart-PeakLoc, 
                                PeakLoc-FeatureStart) ##frome feature start
                dist2 <- ifelse(strand, 
                                FeatureEnd-PeakLoc, 
                                PeakLoc-FeatureEnd) ##from feature end
                score <- score(annotatedPeaks)
                if(aroundGene){
                    stop("Sorry, can not handle the combination of 
                         aroundGene==TRUE AND featureSite=='bothEnd'")
                }else{
                    ibin1 <- round(nbins+floor(dist1*nbins/radius))
                    ibin2 <- round(nbins+floor(dist2*nbins/radius))
                    score1 <- score[ibin1<nbins & ibin1>=0]
                    ibin1 <- ibin1[ibin1<nbins & ibin1>=0]
                    score2 <- score[ibin2>=nbins & ibin2<2*nbins]
                    ibin2 <- ibin2[ibin2>=nbins & ibin2<2*nbins]
                    numbins <- 2*nbins
                    b <- c(-1*c(nbins:1), 0:(nbins-1))
                    b.type <- rep(FALSE, length(b))
                    ##insert the empty bins
                    ibin1 <- formatC(ibin1, width=nchar(numbins), flag="0")
                    ibin2 <- formatC(ibin2, width=nchar(numbins), flag="0")
                    gps1 <- split(score1, ibin1)
                    gps2 <- split(score2, ibin2)
                    gps1 <- gps1[formatC(0:numbins, 
                                         width=nchar(numbins), 
                                         flag="0")]
                    gps2 <- gps2[formatC(0:numbins, 
                                         width=nchar(numbins), 
                                         flag="0")]
                    names(gps1) <- formatC(0:numbins, 
                                           width=nchar(numbins), 
                                           flag="0")
                    names(gps2) <- formatC(0:numbins, 
                                           width=nchar(numbins), 
                                           flag="0")
                    gps <- c(gps1[1:nbins], gps2[(nbins+1):(2*nbins)])
                }
            }else{##featureSite %in% FeatureStart, FeatureEnd
                FeatureLoc <-
                    switch(featureSite,
                           FeatureStart=ifelse(strand, 
                                               annotatedPeaks$end_position, 
                                               annotatedPeaks$start_position),
                           FeatureEnd=ifelse(strand, 
                                             annotatedPeaks$start_position, 
                                             annotatedPeaks$end_position),
                           0)
                dist1 <- ifelse(strand, 
                                FeatureLoc - PeakLoc, 
                                PeakLoc - FeatureLoc)
                weight <- (radius * mbins)/(genelen * nbins)
                if(aroundGene){
                    if(featureSite=="FeatureStart"){
                        ibin1 <- 
                            ifelse(dist1<0, 
                                   nbins+floor(dist1*nbins/radius),
                                   ifelse(dist1<genelen, 
                                          nbins+floor(dist1*mbins/genelen), 
                                          nbins+mbins+
                                              floor((dist1-genelen)*
                                                        nbins/radius)))
                        b <- c(-1*c(nbins:1),0:(mbins+nbins-1))
                        distance1 <- radius+genelen
                        distance2 <- -1*radius
                    }else{##featureSite=="FeatureEnd"
                        ibin1 <- 
                            ifelse(dist1>=0, 
                                   nbins+mbins+floor(dist1*nbins/radius),
                                   ifelse(dist1>-1*genelen, 
                                          nbins+mbins+
                                              floor(dist1*mbins/genelen), 
                                          nbins+
                                              floor((dist1+genelen)*
                                                        nbins/radius)))
                        b <- c(-1*c((mbins+nbins):1), 0:(nbins-1))
                        distance1 <- radius
                        distance2 <- -1*(radius+genelen)
                    }
                    numbins <- 2*nbins + mbins
                    b.type <- c(rep(FALSE, nbins), 
                                rep(TRUE, mbins), 
                                rep(FALSE, nbins))
                }else{
                    ibin1 <- round(nbins+floor(dist1*nbins/radius))
                    numbins <- 2*nbins
                    b <- c(-1*c(nbins:1), 0:(nbins-1))
                    genelen <- 0
                    minGeneLen <- -1
                    distance1 <- radius
                    distance2 <- -1*radius
                    b.type <- rep(FALSE, 2 * nbins)
                }
                ids <- dist1>=distance2 & 
                    dist1<distance1 & 
                    genelen > minGeneLen
                score <- score(annotatedPeaks)[ids]
                weight <- weight[ids]
                ibin1 <- ibin1[ids]
                ibin1[ibin1<0] <- 0
                ibin1[ibin1>numbins] <- numbins
                score[b.type[ibin1+1]] <- 
                    score[b.type[ibin1+1]] * weight[b.type[ibin1+1]]
                ibin1 <- formatC(ibin1, 
                                 width=nchar(numbins), 
                                 flag="0")
                gps <- split(score, ibin1)
                gps <- gps[formatC(0:numbins, 
                                   width=nchar(numbins),
                                   flag="0")]
                names(gps) <- formatC(0:numbins, 
                                      width=nchar(numbins), 
                                      flag="0")
                ##insert the empty bins
                gps <- gps[1:numbins]
            }
        }
        gps <- lapply(gps, function(.ele){
            if(is.null(.ele[1])){
                    0
                }else{
                    .ele
                }
            })
        value <- unlist(lapply(gps, fun))
        std <- if(mode(errfun)=="function") unlist(lapply(gps, errfun)) else 
            rep(errfun, length(gps))
        std[is.na(std)] <- 0
        ##plot the figure
        ylim.min <- min(value[!is.na(value)] - std[!is.na(value)])
        ylim.max <- max(value[!is.na(value)] + std[!is.na(value)])
        ylim.dis <- (ylim.max - ylim.min)/20
        blabel <- if(aroundGene){
            c(seq.int(-radius, -radius/nbins, length.out=nbins)+radius/nbins/2,
              1:mbins,
              seq.int(0, radius-radius/nbins, length.out=nbins)+radius/nbins/2)
        }else{
            seq.int(-radius, radius-radius/nbins, length.out=2*nbins) +
                radius/nbins/2
        }
        if(aroundGene){
            plot(b, value, 
                 ylim=c(ylim.min-ylim.dis, ylim.max+ylim.dis),
                 xlab=xlab.ele, 
                 ylab=ylab.ele, 
                 main=main.ele,
                 xaxt="n")
            b.type.at <- b[which(b.type)]
            b.type.at <- 
                b.type.at[c(1, length(b.type.at))] + c(-.5, .5)
            abline(v=b.type.at, lty=2)
            b.at <- c(b[1]-.5, b[nbins]+.5, b[nbins+mbins]+.5, b[length(b)]+.5)
            b.label <- c(-1 * radius, "Feature Start", "Feature End", radius)
            axis(1, at=b.at, labels=b.label)
            if(!all(std==0)) plotErrBar(b, value, std)
        }else{
            plot(blabel, value, 
                 ylim=c(ylim.min-ylim.dis, ylim.max+ylim.dis),
                 xlab=xlab.ele, 
                 ylab=ylab.ele, 
                 main=main.ele)
            if(!all(std==0)) plotErrBar(blabel, value, std)
        }
        
        names(value) <- blabel
        value
    }, annotatedPeaksList, FUN, errFun, xlab, ylab, main)
    
    colnames(binValue) <- names
    ###output statistics
    return(invisible(binValue))
}
LihuaJulieZhu/ChIPpeakAnno documentation built on Aug. 5, 2020, 12:02 a.m.