#' @title Differential Binding Estimation for Protein Complexes
#'
#' @description
#' This is the main function of this package. It accepts outputs from other
#' functions in this package, and integrates statistical methods of signal
#' smoothing, bump hunting and differential testing, and reports differential
#' binding regions with estimated significance. It is important that the inputs
#' are genome-wide bin level read counts, instead of peak level. Also, it is
#' noted that each read should be only assigned to one bin if multiple
#' overlapping exists, as done by function \code{regionReads}.
#'
#' @param count A matrix of read counts or a RangedSummarizedExperiment, where
#' columns are samples and rows are genome-wide bins. This object can be
#' generated by function \code{regionReads}.
#' @param bins If \code{count} is a read count matrix, \code{bins} should be
#' provided as a GRanges object recording bins of corresponding rows in
#' \code{count}. If \code{count} is a RangedSummarizedExperiment, this
#' parameter will be ignored.
#' @param meta If \code{count} is a read count matrix, this should be a
#' DataFrame object recording sample annotations. Rows of \code{meta}
#' correspond to the columns of \code{count}. The \code{design} parameterB
#' treats the column names of \code{meta} as variables. If \code{count} is a
#' RangedSummarizedExperiment, this parameter will be ignored.
#' @param design A formula object which expresses how read counts for each bin
#' depend on the variables in \code{meta}, e.g. '~ group + condition' etc. Or,
#' if \code{count} is a RangedSummarizedExperiment, the bins and meta objects
#' will be extracted by \code{rowRanges()} and \code{colData()} from
#' \code{count} object. By default, the last variable in \code{design} formula
#' will be used to build the differential binding contrast. At most two
#' variables are allowed if \code{diffmeth} is set to 'ttest'. (Details see
#' below)
#' @param sizefac A numeric vector indicating estimated size of samples for
#' normalization purpose. This vector can be generated by function
#' \code{sizeFac}.
#' @param rccut A numeric cutoff on normalized count matrix using
#' \code{sizefac}. If positive, only bins with normalized counts larger than
#' \code{rccut} in at least one sample are selected for fold change estimate.
#' Unlike other functions in this package, moderate cutoff would be better
#' as too large results more false negative and too small increases the time
#' cost. (Default: 15)
#' @param fccut A numeric cutoff on smoothed log2foldchanges of bins for
#' bump hunting of differtial binding regions. Neighbor bins with fold change
#' larger than this value will be merged together with allowed gaps.
#' (Default: 0.4)
#' @param gap A integer specifying the gaps allowed for bin merging, in the
#' unit of number of bins. (Default: 2)
#' @param diffmeth Method for statistical testing of differential binding.
#' (Default: 'DESeq2')
#'
#' @details
#' Three methods are provided for significance estimation of differential
#' binding. \code{DESeq2} allows pseudo-estimation for comparisons without
#' replicates; otherwise, all methods can be used for comparisons with at least
#' two replicates. The \code{design} formula can be specified as suggested by
#' \code{DESeq2} and \code{limma} if these two methods are selected. For
#' \code{ttest}, \code{design} can either contain one or two components,
#' referring to student's t-test or paired t-test based on logarithm scaled
#' data. For consistance with other packages, the last component in
#' \code{design} formula is the contrast on which the final differential
#' estimation are reported.
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomeInfoDb
#' @import BiocGenerics
#' @import GenomicRanges
#' @import SummarizedExperiment
#' @importFrom bumphunter clusterMaker
#' @importFrom bumphunter smoother
#' @importFrom bumphunter locfitByCluster
#' @importFrom bumphunter regionFinder
#' @importFrom matrixStats rowMaxs
#' @importFrom DESeq2 DESeqDataSetFromMatrix
#' @importFrom DESeq2 sizeFactors
#' @importFrom DESeq2 sizeFactors<-
#' @importFrom DESeq2 DESeq
#' @importFrom DESeq2 results
#' @importFrom limma voom
#' @importFrom limma eBayes
#' @importFrom limma topTable
#' @importFrom limma lmFit
#' @importFrom genefilter rowttests
#' @importFrom methods is
#' @importFrom stats model.frame
#' @importFrom stats model.matrix
#' @importFrom stats t.test
#'
#' @return
#' A GRanges object containing potential regions with differential binding, as
#' well as statistical significances as meta columns.
#'
#' @export
#'
#' @examples
#' ## load sample data
#' data(complex)
#' names(complex)
#'
#' ## test sample data
#' sizefac <- sizeFac(count=complex$counts,plot=TRUE)$sizefac
#' library(SummarizedExperiment)
#' se <- SummarizedExperiment(assays=list(counts=complex$counts),
#' rowRanges=complex$bins,
#' colData=DataFrame(cond=c("ctr","tre")))
#' dr <- diffRegions(count=se,design=~cond,sizefac=sizefac)
#'
#' ## return values
#' dr
#' hist(width(dr),nclass=30,xlab="region width",
#' main="Width of potential differential regions")
#' hist(-log10(dr$pvalue),nclass=30,xlab="-log10 pvalue",
#' main="Estimated significance")
diffRegions <- function(count, bins=NULL, meta=NULL, design, sizefac,
rccut=15, fccut=0.4, gap=2,
diffmeth=c("DESeq2","limma","ttest")){
stopifnot((is.matrix(count) || is(count,"RangedSummarizedExperiment")) &&
ncol(count) >= 2)
stopifnot(is.null(bins) || (is(bins,"GRanges") &&
length(bins)==nrow(count)))
stopifnot(is.null(meta) || (is(meta,"DataFrame") && ncol(meta)>=1))
if(is.matrix(count) && (is.null(bins) || is.null(meta)))
stop("Bins or meta shouldn't be NULL when count is matrix\n")
stopifnot(class(design) == "formula")
stopifnot(is.numeric(sizefac) && length(sizefac) == ncol(count)
&& all(sizefac > 0))
stopifnot(is.numeric(rccut) && length(rccut) == 1 && rccut >= 0)
stopifnot(is.numeric(fccut) && length(fccut) == 1 && fccut > 0)
stopifnot(is.numeric(gap) && length(gap) == 1 && gap >= 0)
if(is(count,"RangedSummarizedExperiment")){
bins <- rowRanges(count)
meta <- colData(count)
count <- assay(count,1)
}
effects <- model.frame(design, data = meta)
stopifnot(ncol(effects) >= 1 && length(unique(effects[,ncol(effects)]))==2)
diffmeth <- match.arg(diffmeth)
if(diffmeth=="ttest"){
paired <- FALSE
if(ncol(effects) > 2 || (ncol(effects) == 2 &&
any(sapply(split(effects[,2],effects[,1]),function(x)
length(x) !=2 || length(unique(x)) != 2)))){
cat("try methods other than 'ttest' for a complicate design.\n
differential analysis abandoned.\n")
return(0)
}else if(ncol(effects) == 2){
paired <- TRUE
}
}
## slected bins
sizefac <- sizefac / median(sizefac)
rowidx <- rowMaxs(t(t(count)/sizefac)) >= rccut
countsub <- count[rowidx,]
binssub <- bins[rowidx]
pos <- start(binssub)
chr <- as.character(seqnames(binssub))
cluster <- clusterMaker(chr, pos, assumeSorted = TRUE,
maxGap = width(binssub[1]) * gap + 1)
## raw fold changes
if(diffmeth=="DESeq2"){
dds <- DESeqDataSetFromMatrix(countData=countsub,colData=meta,
design= design)
sizeFactors(dds) <- sizefac
dds <- DESeq(dds,fitType="mean")
log2fc <- results(dds)$log2FoldChange
}else if(diffmeth=="limma"){
libsize <- 1e+06 * sizefac
design <- model.matrix(design, data = meta)
dcb <- voom(countsub, design, lib.size=libsize)
fit <- eBayes(lmFit(dcb,design))
diffR <- topTable(fit,coef=ncol(design),sort.by="none",
number=nrow(fit))
log2fc <- diffR$logFC
}else{
logcountsub <- log2(t(t(countsub)/sizefac)+0.5)
log2fc <- rowttests(logcountsub,as.factor(effects[,ncol(effects)]))$dm
}
## fold changes smoothing & bump hunting
log2fc[is.na(log2fc)] <- 0
log2fcs <- smoother(log2fc,pos,cluster,smoothFunction=locfitByCluster,
minNum=4, bpSpan=5000)$fitted
log2fcs[is.na(log2fcs)] <- log2fc[is.na(log2fcs)]
regiontab <- regionFinder(log2fcs, chr, pos, cluster, summary = mean,
order = TRUE, oneTable = TRUE, cutoff=fccut,
assumeSorted = TRUE, verbose = TRUE)
regiongrg <- GRanges(regiontab$chr,
IRanges(start = regiontab$start,
end = regiontab$end + width(binssub[1]) - 1))
## diferential binding re-estimation
fo <- findOverlaps(regiongrg,bins)
fosplit <- split(subjectHits(fo),queryHits(fo))
regionCount <- t(sapply(fosplit,function(i)
colSums(count[i,,drop=FALSE])))
if(diffmeth=="DESeq2"){
dds <- DESeqDataSetFromMatrix(countData=regionCount,colData=meta,
design= design)
sizeFactors(dds) <- sizefac
dds <- DESeq(dds,fitType="mean")
deseq <- results(dds)
regiongrg$stat <- deseq$stat
regiongrg$log2fc <- deseq$log2FoldChange
regiongrg$pvalue <- deseq$pvalue
}else if(diffmeth=="limma"){
dcb <- voom(regionCount, design, lib.size=libsize)
fit <- eBayes(lmFit(dcb,design))
limm <- topTable(fit,coef=ncol(design),sort.by="none",number=nrow(fit))
regiongrg$stat <- limm$t
regiongrg$log2fc <- limm$logFC
regiongrg$pvalue <- limm$P.Value
}else{
logregionrc <- log2(t(t(regionCount)/sizefac)+0.5)
if(!paired){
tt <- rowttests(logregionrc,as.factor(effects[,1]))
regiongrg$stat <- tt$statistic
regiongrg$log2fc <- tt$dm
regiongrg$pvalue <- tt$p.value
}else{
logregionrc <- logregionrc[,order(effects[,2],effects[,1])]
levels <- unique(effects[,2])
regionpara <- data.frame(apply(logregionrc,1,function(x){
tmp <- t.test(x[effects[,2]==levels[1]],
x[effects[,2]==levels[2]],paired=TRUE)
c(tmp$stat,tmp$est,tmp$p.value)
}))
regiongrg$stat <- regionpara[,1]
regiongrg$log2fc <- regionpara[,2]
regiongrg$pvalue <- regionpara[,3]
}
}
regiongrg
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.