R/regionReads.R

#' @title Count ChIP-Seq Reads for Genomic Regions
#'
#' @description Count reads for customized regions or genome wide bins for
#' multiple ChIP-Seq samples in BAM (.bam) or BIGWIG (.bw or .bigwig) format.
#' When no customized peaks provided, this function will generate bin-level
#' counts based on genome information embedded in BAM or BIGWIG files. For BAM
#' files, counts will be generated using Rsubread package; for BIGWIG files,
#' counts will be generated by function in rtracklayer package.
#'
#' @param samples A vector of paths to ChIP-seq files in Bam or Bigwig format.
#' @param colData A DataFrame object contains at lease one column representing
#' the conditions of samples for comparison in differential binding analysis.
#' This object includes the meta-data describing \code{samples}. It can
#' store any number of descriptive columns for each sample. See detail in
#' \code{SummarizedExperiment::colData}. The default here uses sample names to
#' represent conditions.
#' @param peaks GRanges or \code{NULL}. If \code{NULL}, genome-wide
#' bins with length \code{binsize} will be pre-selected as potential
#' peaks; otherwise, ChIP-seq peaks should be provided as a GRanges object,
#' e.g. a union of peaks identified by peak caller from all samples. (Default:
#' NULL)
#' @param binsize A number specify the bin size. This parameter is only
#' appliable when \code{peaks} is NULL. (Default: 300)
#' @param bincut A numeric cut off on read counts to nominate bins as high
#' coverage bins. This parameter is only appliable when \code{peaks} is
#' NULL. (Default: 0)
#' @param readlen Sample fragment length with which Bigwig files are created.
#' The signal in Bigwig file is \code{readlen} times of original reads number
#' when created by software as Bedtools. One way to count original reads is
#' to privde \code{readlen} here. This parameter
#' is only appliable when \code{samples} are Bigwig files. If \code{NULL}, it
#' is recommended to multiply \code{bincut} by a scale factor (normally
#' fragment length). (Default: NULL)
#' @param read2pos Parameter for function \code{featureCounts} in package
#' Rsubread. This parameter is only appliable when \code{samples} are bam
#' files. (Default: '5')
#' @param ignoreDup Parameter for function \code{featureCounts} in package
#' Rsubread. This parameter is only appliable when \code{samples} are bam
#' files. (Default: TRUE)
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomeInfoDb
#' @import GenomicRanges
#' @import SummarizedExperiment
#' @importFrom matrixStats rowMaxs
#' @importFrom Rsamtools BamFileList
#' @importFrom rtracklayer BigWigFile
#' @importFrom rtracklayer summary
#' @importFrom Rsubread featureCounts
#' @importFrom methods is
#'
#' @return
#' A RangedSummarizedExperiment object containing read counts for genome-wide
#' bins or given peaks..
#'
#' @export
#'
#' @examples
#' library(GenomicRanges)
#' bams <- c(system.file("extdata", "control.bam", package="ComplexDiff"),
#'           system.file("extdata", "treated.bam", package="ComplexDiff"))
#' bins <- GRanges("chr1",IRanges(start = seq(1000000,2000000,300),
#'                       end = seq(1000000,2000000,300)+300-1))
#' regionReads(samples=bams,peaks=bins)

regionReads <- function(samples, colData=DataFrame(cond=basename(samples)),
                        peaks=NULL, binsize=300L, bincut=0L,
                        readlen=NULL, read2pos='5', ignoreDup=TRUE){
    stopifnot(is.character(samples))
    stopifnot(is(colData,'DataFrame') && nrow(colData)==length(samples))
    stopifnot(is.null(peaks) || is(peaks,"GRanges"))
    stopifnot(is.numeric(binsize) && length(binsize) == 1 &&
              is.numeric(bincut) && length(bincut) == 1)
    stopifnot(is.null(readlen) || (is.numeric(readlen) &&
                                   length(readlen) == 1))
    binsize <- round(binsize)
    ## bam or bw
    filext <- sapply(strsplit(basename(samples),"\\."),
                     function(x) rev(x)[1])
    isbam <- all(toupper(filext) %in% "BAM")
    isbw <- all(toupper(filext) %in% c("BW","BIGWIG"))
    if(isbam){
        seqs <- sortSeqlevels(seqinfo(BamFileList(samples)))
    }else if(isbw){
        seqs <- sortSeqlevels(seqinfo(BigWigFile(samples[1])))
    }else{
        stop("sample files must have file extension as bam, bw or BigWig\n")
    }
    seqs <- seqlengths(seqs)[!grepl("_", seqnames(seqs))]
    ## peaks or windows
    if(is.null(peaks)) {
        starts <- lapply(seqs, function(i) seq(1, i, binsize))
        ends <- lapply(seqs, function(i) {
            if(i%%binsize == 0) seq(binsize, i, binsize)
            else c(seq(binsize, i, binsize), i)
        })
        chrs <- rep(names(seqs), times=sapply(starts, length))
        regions <- GRanges(chrs, IRanges(start=unlist(starts),
                                         end=unlist(ends)))
        seqlengths(regions) <- seqs[seqlevels(regions)]
    }else{
        regions <- sortSeqlevels(peaks)
        seqlevels(regions, force=TRUE) <- names(seqs)
        seqlengths(regions) <- seqs[seqlevels(regions)]
    }
    ## read counts
    if(isbam){
        ## bam files
        anno <- data.frame(GeneID=seq_len(length(regions)),
                           Chr=seqnames(regions), Start=start(regions),
                           End=end(regions), Strand=strand(regions))
        count <- featureCounts(files=samples, annot.ext=anno,
                               read2pos=read2pos, ignoreDup=ignoreDup
				)$counts
        rownames(count) <- colnames(count) <- NULL
    }else{
        ## bigwig files
        count <- c()
        for(i in seq_len(length(samples))){
            meancov <- unlist(summary(BigWigFile(samples[i]),
                                      regions,size=1,type="mean"))
            if(is.null(readlen)){
                count <- cbind(count, round(mcols(meancov)$score *
                                            width(meancov)))
            }else{
                count <- cbind(count, round(mcols(meancov)$score *
                                            width(meancov) / readlen))
            }
        }
        rownames(count) <- colnames(count) <- NULL
    }
    ## filter on windows
    if(is.null(peaks) & bincut > 0){
        binidx <- rowMaxs(count, na.rm=TRUE) >= bincut
        count <- count[binidx, ]
        regions <- regions[binidx]
    }
    SummarizedExperiment(assays=list(counts=count),
                         rowRanges=regions, colData=colData)
}
tengmx/ComplexDiff documentation built on May 31, 2019, 8:34 a.m.