R/diffRegions.R

#' @title Differential Binding Estimation for Protein Complexes
#'
#' @description
#' This is the main function of this package. It accepts outputs from other
#' functions in this package, and integrates statistical methods of signal
#' smoothing, bump hunting and differential testing, and reports differential
#' binding regions with estimated significance. It is important that the inputs
#' are genome-wide bin level read counts, instead of peak level. Also, it is
#' noted that each read should be only assigned to one bin if multiple
#' overlapping exists, as done by function \code{regionReads}.
#'
#' @param count A matrix of read counts or a RangedSummarizedExperiment, where
#' columns are samples and rows are genome-wide bins. This object can be
#' generated by function \code{regionReads}.
#' @param bins If \code{count} is a read count matrix, \code{bins} should be
#' provided as a GRanges object recording bins of corresponding rows in
#' \code{count}. If \code{count} is a RangedSummarizedExperiment, this
#' parameter will be ignored.
#' @param meta If \code{count} is a read count matrix, this should be a
#' DataFrame object recording sample annotations. Rows of \code{meta}
#' correspond to the columns of \code{count}. The \code{design} parameterB
#' treats the column names of \code{meta} as variables. If \code{count} is a
#' RangedSummarizedExperiment, this parameter will be ignored.
#' @param design A formula object which expresses how read counts for each bin
#' depend on the variables in \code{meta}, e.g. '~ group + condition' etc. Or,
#' if \code{count} is a RangedSummarizedExperiment, the bins and meta objects
#' will be extracted by \code{rowRanges()} and \code{colData()} from
#' \code{count} object. By default, the last variable in \code{design} formula
#' will be used to build the differential binding contrast. At most two
#' variables are allowed if \code{diffmeth} is set to 'ttest'. (Details see
#' below)
#' @param sizefac A numeric vector indicating estimated size of samples for
#' normalization purpose. This vector can be generated by function
#' \code{sizeFac}.
#' @param rccut A numeric cutoff on normalized count matrix using
#' \code{sizefac}. If positive, only bins with normalized counts larger than
#' \code{rccut} in at least one sample are selected for fold change estimate.
#' Unlike other functions in this package, moderate cutoff would be better
#' as too large results more false negative and too small increases the time
#' cost. (Default: 15)
#' @param fccut A numeric cutoff on smoothed log2foldchanges of bins for
#' bump hunting of differtial binding regions. Neighbor bins with fold change
#' larger than this value will be merged together with allowed gaps.
#' (Default: 0.4)
#' @param gap A integer specifying the gaps allowed for bin merging, in the
#' unit of number of bins. (Default: 2)
#' @param diffmeth Method for statistical testing of differential binding.
#' (Default: 'DESeq2')
#'
#' @details
#' Three methods are provided for significance estimation of differential
#' binding. \code{DESeq2} allows pseudo-estimation for comparisons without
#' replicates; otherwise, all methods can be used for comparisons with at least
#' two replicates. The \code{design} formula can be specified as suggested by
#' \code{DESeq2} and \code{limma} if these two methods are selected. For
#' \code{ttest}, \code{design} can either contain one or two components,
#' referring to student's t-test or paired t-test based on logarithm scaled
#' data. For consistance with other packages, the last component in
#' \code{design} formula is the contrast on which the final differential
#' estimation are reported.
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomeInfoDb
#' @import BiocGenerics
#' @import GenomicRanges
#' @import SummarizedExperiment
#' @importFrom bumphunter clusterMaker
#' @importFrom bumphunter smoother
#' @importFrom bumphunter locfitByCluster
#' @importFrom bumphunter regionFinder
#' @importFrom matrixStats rowMaxs
#' @importFrom DESeq2 DESeqDataSetFromMatrix
#' @importFrom DESeq2 sizeFactors
#' @importFrom DESeq2 sizeFactors<-
#' @importFrom DESeq2 DESeq
#' @importFrom DESeq2 results
#' @importFrom limma voom
#' @importFrom limma eBayes
#' @importFrom limma topTable
#' @importFrom limma lmFit
#' @importFrom genefilter rowttests
#' @importFrom methods is
#' @importFrom stats model.frame
#' @importFrom stats model.matrix
#' @importFrom stats t.test
#'
#' @return
#' A GRanges object containing potential regions with differential binding, as
#' well as statistical significances as meta columns.
#'
#' @export
#'
#' @examples
#' ## load sample data
#' data(complex)
#' names(complex)
#'
#' ## test sample data
#' sizefac <- sizeFac(count=complex$counts,plot=TRUE)$sizefac
#' library(SummarizedExperiment)
#' se <- SummarizedExperiment(assays=list(counts=complex$counts),
#'                            rowRanges=complex$bins,
#'                            colData=DataFrame(cond=c("ctr","tre")))
#' dr <- diffRegions(count=se,design=~cond,sizefac=sizefac)
#'
#' ## return values
#' dr
#' hist(width(dr),nclass=30,xlab="region width",
#'      main="Width of potential differential regions")
#' hist(-log10(dr$pvalue),nclass=30,xlab="-log10 pvalue",
#'      main="Estimated significance")

diffRegions <- function(count, bins=NULL, meta=NULL, design, sizefac,
                        rccut=15, fccut=0.4, gap=2,
                        diffmeth=c("DESeq2","limma","ttest")){
    stopifnot((is.matrix(count) || is(count,"RangedSummarizedExperiment")) &&
              ncol(count) >= 2)
    stopifnot(is.null(bins) || (is(bins,"GRanges") && 
                                  length(bins)==nrow(count)))
    stopifnot(is.null(meta) || (is(meta,"DataFrame") && ncol(meta)>=1))
    if(is.matrix(count) && (is.null(bins) || is.null(meta)))
        stop("Bins or meta shouldn't be NULL when count is matrix\n")
    stopifnot(class(design) == "formula")
    stopifnot(is.numeric(sizefac) && length(sizefac) == ncol(count)
              && all(sizefac > 0))
    stopifnot(is.numeric(rccut) && length(rccut) == 1 && rccut >= 0)
    stopifnot(is.numeric(fccut) && length(fccut) == 1 && fccut > 0)
    stopifnot(is.numeric(gap) && length(gap) == 1 && gap >= 0)
    if(is(count,"RangedSummarizedExperiment")){
        bins <- rowRanges(count)
        meta <- colData(count)
        count <- assay(count,1)
    }
    effects <- model.frame(design, data = meta)
    stopifnot(ncol(effects) >= 1 && length(unique(effects[,ncol(effects)]))==2)
    diffmeth <- match.arg(diffmeth)
    if(diffmeth=="ttest"){
        paired <- FALSE
        if(ncol(effects) > 2 || (ncol(effects) == 2 &&
           any(sapply(split(effects[,2],effects[,1]),function(x)
                      length(x) !=2 || length(unique(x)) != 2)))){
            cat("try methods other than 'ttest' for a complicate design.\n
                 differential analysis abandoned.\n")
            return(0)
        }else if(ncol(effects) == 2){
            paired <- TRUE
        }
    }
    ## slected bins
    sizefac <- sizefac / median(sizefac)
    rowidx <- rowMaxs(t(t(count)/sizefac)) >= rccut
    countsub <- count[rowidx,]
    binssub <- bins[rowidx]
    pos <- start(binssub)
    chr <- as.character(seqnames(binssub))
    cluster <- clusterMaker(chr, pos, assumeSorted = TRUE,
                            maxGap = width(binssub[1]) * gap + 1)
    ## raw fold changes
    if(diffmeth=="DESeq2"){
        dds <- DESeqDataSetFromMatrix(countData=countsub,colData=meta,
                                      design= design)
        sizeFactors(dds) <- sizefac
        dds <- DESeq(dds,fitType="mean")
        log2fc <- results(dds)$log2FoldChange
    }else if(diffmeth=="limma"){
        libsize <- 1e+06 * sizefac
        design <- model.matrix(design, data = meta)
        dcb <- voom(countsub, design, lib.size=libsize)
        fit <- eBayes(lmFit(dcb,design))
        diffR <- topTable(fit,coef=ncol(design),sort.by="none",
                          number=nrow(fit))
        log2fc <- diffR$logFC
    }else{
        logcountsub <- log2(t(t(countsub)/sizefac)+0.5)
        log2fc <- rowttests(logcountsub,as.factor(effects[,ncol(effects)]))$dm
    }
    ## fold changes smoothing & bump hunting
    log2fc[is.na(log2fc)] <- 0
    log2fcs <- smoother(log2fc,pos,cluster,smoothFunction=locfitByCluster,
                        minNum=4, bpSpan=5000)$fitted
    log2fcs[is.na(log2fcs)] <- log2fc[is.na(log2fcs)]
    regiontab <- regionFinder(log2fcs, chr, pos, cluster, summary = mean,
                          order = TRUE, oneTable = TRUE, cutoff=fccut,
                          assumeSorted = TRUE, verbose = TRUE)
    regiongrg <- GRanges(regiontab$chr,
                         IRanges(start = regiontab$start,
                                 end = regiontab$end + width(binssub[1]) - 1))
    ## diferential binding re-estimation
    fo <- findOverlaps(regiongrg,bins)
    fosplit <- split(subjectHits(fo),queryHits(fo))
    regionCount <- t(sapply(fosplit,function(i)
                            colSums(count[i,,drop=FALSE])))
    if(diffmeth=="DESeq2"){
        dds <- DESeqDataSetFromMatrix(countData=regionCount,colData=meta,
                                      design= design)
        sizeFactors(dds) <- sizefac
        dds <- DESeq(dds,fitType="mean")
        deseq <- results(dds)
        regiongrg$stat <- deseq$stat
        regiongrg$log2fc <- deseq$log2FoldChange
        regiongrg$pvalue <- deseq$pvalue
    }else if(diffmeth=="limma"){
        dcb <- voom(regionCount, design, lib.size=libsize)
        fit <- eBayes(lmFit(dcb,design))
        limm <- topTable(fit,coef=ncol(design),sort.by="none",number=nrow(fit))
        regiongrg$stat <- limm$t
        regiongrg$log2fc <- limm$logFC
        regiongrg$pvalue <- limm$P.Value
    }else{
        logregionrc <- log2(t(t(regionCount)/sizefac)+0.5)
        if(!paired){
            tt <- rowttests(logregionrc,as.factor(effects[,1]))
            regiongrg$stat <- tt$statistic
            regiongrg$log2fc <- tt$dm
            regiongrg$pvalue <- tt$p.value
        }else{
            logregionrc <- logregionrc[,order(effects[,2],effects[,1])]
            levels <- unique(effects[,2])
            regionpara <- data.frame(apply(logregionrc,1,function(x){
                tmp <- t.test(x[effects[,2]==levels[1]],
                              x[effects[,2]==levels[2]],paired=TRUE)
                c(tmp$stat,tmp$est,tmp$p.value)
            }))
            regiongrg$stat <- regionpara[,1]
            regiongrg$log2fc <- regionpara[,2]
            regiongrg$pvalue <- regionpara[,3]
        }
    }
    regiongrg
}
tengmx/ComplexDiff documentation built on May 31, 2019, 8:34 a.m.