R/binOverGene.R
In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

Documented in binOverGene

#' coverage of gene body
#' 
#' calculate the coverage of gene body per gene per bin.
#' 
#' 
#' @param cvglists A list of \link[IRanges:AtomicList-class]{SimpleRleList} or
#' \link[IRanges:AtomicList-class]{RleList}. It represents the coverage for
#' samples.
#' @param TxDb An object of \code{\link[GenomicFeatures:TxDb-class]{TxDb}}.  It
#' is used for extracting the annotations.
#' @param upstream.cutoff,downstream.cutoff cutoff length for upstream or
#' downstream of transcript.
#' @param nbinsGene,nbinsUpstream,nbinsDownstream The number of bins for gene,
#' upstream and downstream.
#' @param includeIntron A logical value which indicates including intron or
#' not.
#' @param minGeneLen,maxGeneLen minimal or maximal length of gene.
#' @author Jianhong Ou
#' @seealso \link{binOverRegions}, \link{plotBinOverRegions}
#' @export
#' @import IRanges
#' @import GenomicRanges
#' @importFrom S4Vectors mcols
#' @importFrom BiocGenerics start end width strand do.call
#' @importFrom GenomeInfoDb seqinfo seqnames
#' @examples
#' 
#' if(Sys.getenv("USER")=="jianhongou"){
#' path <- system.file("extdata", package="ChIPpeakAnno")
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(rtracklayer)
#' files <- dir(path, "bigWig")
#' if(.Platform$OS.type != "windows"){
#' cvglists <- lapply(file.path(path, files), import,
#'                    format="BigWig", as="RleList")
#' names(cvglists) <- sub(".bigWig", "", files)
#' d <- binOverGene(cvglists, TxDb.Hsapiens.UCSC.hg19.knownGene)
#' plotBinOverRegions(d)
#' }
#' }
#' 
binOverGene <- function(cvglists, TxDb, 
                        upstream.cutoff=0L, 
                        downstream.cutoff=upstream.cutoff, 
                        nbinsGene=100L,
                        nbinsUpstream=20L,
                        nbinsDownstream=nbinsUpstream,
                        includeIntron=FALSE,
                        minGeneLen=nbinsGene,
                        maxGeneLen=Inf){
  if(inherits(cvglists, c("SimpleRleList", "RleList", "CompressedRleList"))){
    cvglistsName <- substitute(deparse(cvglists))
    cvglists <- list(cvglists)
    names(cvglists) <- cvglistsName
  }
  if(!is(cvglists, "list")){
    stop("cvglists must be a list of SimpleRleList or RleList")
  }
  cls <- sapply(cvglists, inherits, 
                what = c("SimpleRleList", "RleList", "CompressedRleList"))
  if(any(!cls))
    stop("cvglists must be a list of SimpleRleList or RleList")
  stopifnot(is(TxDb, "TxDb"))
  stopifnot(maxGeneLen>minGeneLen)
  stopifnot(minGeneLen>=nbinsGene)
  if(includeIntron){
    features <- toGRanges(TxDb, feature="transcript")
  }else{
    features <- toGRanges(TxDb, feature="exon")
  }
  ## reduce by gene
  GRapply <- function(X, FUN, by, ...){
    stopifnot(by!="by_id")
    stopifnot(is(X, "GRanges"))
    x <- GRanges(seqnames = mcols(X)[, by], 
                 ranges = ranges(X),
                 strand = strand(X))
    x <- FUN(x, ...)
    x$by_id <- as.character(seqnames(x))
    x <- 
      GRanges(seqnames = 
                as.character(seqnames(X[match(x$by_id, 
                                              mcols(X)[, by])])),
              ranges = ranges(x),
              strand = strand(x),
              by_id = x$by_id)
    colnames(mcols(x)) <- by
    seqinfo(x) <- seqinfo(X)[seqlevels(x)]
    x
  }
  features <- features[lengths(features$gene_id)>0]
  features$gene_id <- sapply(features$gene_id, `[`, 1)
  features <- GRapply(features, reduce, "gene_id")
  
  ## filter by minGeneLen, maxGeneLen
  features <- trim(features)
  f.width <- rowsum(width(features), features$gene_id, reorder = FALSE)
  features <- features[features$gene_id %in% 
                         rownames(f.width)[f.width[, 1]>=minGeneLen & 
                                             f.width[, 1]<maxGeneLen]]
  
  if(length(features)<2){
    stop("less than 2 gene left.")
  }
  ## add upstream and downstream
  if(upstream.cutoff>0 && downstream.cutoff>0){
    genes <- GRapply(features, range, "gene_id")
    suppressWarnings({
      upstream <- promoters(genes,
                            upstream = upstream.cutoff,
                            downstream = 0)})
    upstream <- trim(upstream)
    upstream <- upstream[width(upstream)>=nbinsUpstream]
    revert.strand <- function(gr){
      l <- levels(strand(gr))
      l.plus <- which(l=="+")
      l.minus <- which(l=="-")
      if(length(l.plus)==1){
        l[l.plus] <- "-"
      }
      if(length(l.minus)==1){
        l[l.minus] <- "+"
      }
      levels(strand(gr)) <- l
      gr
    }
    genes.rev.strand <- revert.strand(genes)
    suppressWarnings({downstream <- 
      promoters(genes.rev.strand,
                upstream = downstream.cutoff,
                downstream = 0)})
    downstream <- revert.strand(downstream)
    downstream <- trim(downstream)
    downstream <- downstream[width(downstream)>=nbinsDownstream]
    upstream$feature_type <- "upstream"
    downstream$feature_type <- "downstream"
    features$feature_type <- "gene"
    gene_id <- unique(features$gene_id)
    features <- c(features, upstream, downstream)
    features <- 
      features[order(as.numeric(factor(features$gene_id, 
                                       levels = gene_id)),
                     as.numeric(factor(features$feature_type,
                                       levels = c("upstream", 
                                                  "gene", 
                                                  "downstream"))),
                     start(features))]
  }else{
    ## make sure features are sorted by position
    gene_id <- unique(features$gene_id)
    features$feature_type <- "gene"
    features <- 
      features[order(as.numeric(factor(features$gene_id, 
                                       levels = gene_id)),
                     start(features))]
  }
  ## split the features by seqnames and generate Views for cvglist
  seqn <- Reduce(intersect, lapply(cvglists, names))
  seqn <- intersect(seqn, seqlevels(features))
  if(length(seqn)<1){
    stop("Please check the names of cvglist. ",
         "None of them in the seqlevels of TxDb.")
  }
  features <- features[seqnames(features) %in% seqn]
  features <- 
    features[!features$gene_id %in% 
               features$gene_id[start(features)<1]] ## remove the start < 1
  len <- length(unique(features$gene_id))
  features.s <- split(features, seqnames(features))
  features.s <- features.s[seqn]
  cvglists <- lapply(cvglists, function(.ele) .ele[seqn])
  features.l <- as(features.s, "IntegerRangesList")
  
  bins=c("upstream"=nbinsUpstream,
         "gene"=nbinsGene,
         "downstream"=nbinsDownstream)
  
  features.view <- lapply(cvglists, function(.ele) {
    vw <- Views(.ele, features.l)
    cntByChr <- lapply(vw, function(.vw){
      .df <- elementMetadata(.vw)
      if(length(.df$feature_type) == 0) .df$feature_type <- "gene"
      .gr <- GRanges(paste(.df$feature_type, .df$gene_id), ranges(.vw), 
                     feature_type=.df$feature_type, gene_id=.df$gene_id)
      .gr.s <- split(.gr, seqnames(.gr))
      .cvg <- subject(.vw)
      .cvgs <- rep(list(.cvg), length(.gr.s))
      .gr.l <- as(.gr.s, "IntegerRangesList")
      names(.cvgs) <- names(.gr.l)
      .cvgs <- as(.cvgs, "SimpleRleList")
      .cvg.sub <- .cvgs[.gr.l]
      ### split the RleList into bins
      .ir <- IRanges(1, lengths(.cvg.sub))
      .ir <- 
        IRanges::tile(.ir, n=bins[sub("^(.*?) .*$", "\\1", names(.cvg.sub))])
      names(.ir) <- names(.cvg.sub)
      .cnt <- viewMeans(Views(.cvg.sub, .ir))
      .cnt <- split(.cnt, sub("^(.*?) .*$", "\\1", names(.cnt)))
      ## make sure 5'->3'
      .cnt <- lapply(.cnt, function(x){
        strd <- 
          as.character(strand(features))[match(sub("^(.*?) (.*$)", 
                                                   "\\2", names(x)), 
                                               features$gene_id)]
        x <- split(x, strd)
        x <- lapply(x, BiocGenerics::do.call, what=rbind)
        if("-" %in% names(x)){
          x[["-"]] <- x[["-"]][, ncol(x[["-"]]):1, drop=FALSE]
        }
        do.call(rbind, x)
      })
      lapply(.cnt, colSums)
    })
    cntByFeature <- swapList(cntByChr)
    cntByFeature <- lapply(cntByFeature, function(.ele){
      colSums(do.call(rbind, .ele))/len
    })
  })
  d <- swapList(features.view)
  feature_type <- c("upstream", "gene", "downstream")
  d <- d[feature_type[feature_type %in% names(d)]]
  d <- lapply(d, do.call, what=cbind)
  return(d)
}

Any scripts or data that you put into this service are public.

ChIPpeakAnno documentation built on April 1, 2021, 6:01 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ChIPpeakAnno
Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

R/binOverGene.R
In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

Defines functions binOverGene

Documented in binOverGene

Try the ChIPpeakAnno package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ChIPpeakAnno Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

R/binOverGene.R In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

Defines functions binOverGene

Documented in binOverGene

Try the ChIPpeakAnno package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ChIPpeakAnno
Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

R/binOverGene.R
In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges