R/diffRegionsWithPerm.R

#' @title Differential Binding Estimation with Permutation
#'
#' @description
#' This function does differential binding estimation and additional
#' permutation analysis. This function works only if multiple replicates are
#' provided for comparison; otherwise, it is the same as \code{diffRegions}
#' which is the main function for differential binding estimation. Permutation
#' analysis is obtained by shuffling samples between compared conditions.
#' Customized shuffling can also be specified by parameter \code{permute} for
#' complicated experiment designs.
#'
#' @param count A matrix of read counts or a RangedSummarizedExperiment, where
#' columns are samples and rows are genome-wide bins. This object can be
#' generated by function \code{regionReads}.
#' @param bins If \code{count} is a read count matrix, \code{bins} should be
#' provided as a GRanges object recording bins of corresponding rows in
#' \code{count}. If \code{count} is a RangedSummarizedExperiment, this
#' parameter will be ignored.
#' @param meta If \code{count} is a read count matrix, this should be a
#' DataFrame object recording sample annotations. Rows of \code{meta}
#' correspond to the columns of \code{count}. The \code{design} parameter
#' treats the column names of \code{meta} as variables. If \code{count} is a
#' RangedSummarizedExperiment, this parameter will be ignored.
#' @param design A formula object which expresses how read counts for each bin
#' depend on the variables in \code{meta}, e.g. '~ group + condition' etc. Or,
#' if \code{count} is a RangedSummarizedExperiment, the bins and meta objects
#' will be extracted by \code{rowRanges()} and \code{colData()} from
#' \code{count} object. By default, the last variable in \code{design} formula
#' will be used to build the differential binding contrast. At most two
#' variables are allowed if \code{diffmeth} is set to 'ttest'. (Details see
#' below)
#' @param sizefac A numeric vector indicating estimated size of samples for
#' normalization purpose. This vector can be generated by function
#' \code{sizeFac}.
#' @param rccut A numeric cutoff on normalized count matrix using
#' \code{sizefac}. If positive, only bins with normalized counts larger than
#' \code{rccut} in at least one sample are selected for fold change estimate.
#' Unlike other functions in this package, moderate cutoff would be better
#' as too large results more false negative and too small increases the time
#' cost. (Default: 15)
#' @param fccut A numeric cutoff on smoothed log2foldchanges of bins for
#' bump hunting of differtial binding regions. Neighbor bins with fold change
#' larger than this value will be merged together with allowed gaps.
#' (Default: 0.4)
#' @param gap A integer specifying the gaps allowed for bin merging, in the
#' unit of number of bins. (Default: 2)
#' @param permute A matrix where each row contains shuffled indices of all
#' samples. If \code{NULL}, permutation is automatically obtained by shuffling
#' samples between compared conditions. This object can be generated by using
#' R-package \code{permute} as well.
#' @param maxperm Maximum number of permutations to be finished. (Default:10)
#' @param diffmeth Method for statistical testing of differential binding.
#' (Default: 'DESeq2')
#'
#' @details
#' Three methods are provided for significance estimation of differential
#' binding. \code{DESeq2} allows pseudo-estimation for comparisons without
#' replicates; otherwise, all methods can be used for comparisons with at least
#' two replicates. The \code{design} formula can be specified as suggested by
#' \code{DESeq2} and \code{limma} if these two methods are selected. For
#' \code{ttest}, \code{design} can either contain one or two components,
#' referring to student's t-test or paired t-test based on logarithm scaled
#' data. For consistance with other packages, the last component in
#' \code{design} formula is the contrast on which the final differential
#' estimation are reported.
#'
#' The default permutation works on the main contrast only (the last component
#' in \code{design} formula). If customized permutation needed, e.g. invloving
#' several components in \code{design} formula, users should provided the
#' customized indecies of permutations through parameter \code{permute}.
#'
#' @import S4Vectors
#' @import GenomicRanges
#' @import SummarizedExperiment
#' @importFrom methods is
#' @importFrom utils combn
#' @importFrom stats ecdf
#' @importFrom stats model.frame
#'
#' @return
#' A GRanges object containing potential regions with differential binding, as
#' well as statistical significances as meta columns.
#'
#' @export
#'
#' @examples
#' ## load sample data
#' data(complex)
#' names(complex)
#'
#' ## test sample data
#' sizefac <- sizeFac(count=complex$counts,plot=TRUE)$sizefac
#' library(S4Vectors)
#' meta <- DataFrame(cond=c("ctr","tre"))
#' dr <- diffRegionsWithPerm(count=complex$counts,bins=complex$bins,
#'                           meta=meta,design=~cond,sizefac=sizefac)


diffRegionsWithPerm <- function(count, bins=NULL, meta=NULL, design, sizefac,
                            rccut=15, fccut=0.4, gap=2,
                            permute=NULL, maxperm=10,
                            diffmeth=c("DESeq2","limma","ttest")){
    stopifnot((is.matrix(count) || is(count,"RangedSummarizedExperiment")) &&
              ncol(count) >= 2)
    stopifnot(is.null(bins) || (is(bins,"GRanges") &&
                                length(bins)==nrow(count)))
    stopifnot(is.null(meta) || (is(meta,"DataFrame") && ncol(meta)>=1))
    if(is.matrix(count) && (is.null(bins) || is.null(meta)))
        stop("Bins or meta shouldn't be NULL when count is matrix\n")
    stopifnot(class(design) == "formula")
    stopifnot(is.numeric(sizefac) && length(sizefac) == ncol(count)
              && all(sizefac > 0))
    stopifnot(is.numeric(rccut) && length(rccut) == 1 && rccut >= 0)
    stopifnot(is.numeric(fccut) && length(fccut) == 1 && fccut > 0)
    stopifnot(is.numeric(gap) && length(gap) == 1 && gap >= 0)
    stopifnot(is.null(permute) ||
              (is.matrix(permute) &&
               all(apply(permute,1,function(x)
                         identical(seq_len(ncol(count)),sort(x))))))
    stopifnot(is.numeric(maxperm) && length(maxperm) == 1 && maxperm >= 1)
    if(is(count,"RangedSummarizedExperiment")){
        bins <- rowRanges(count)
        meta <- colData(count)
        count <- assay(count,1)
    }
    effects <- model.frame(design, data = meta)
    stopifnot(ncol(effects) >= 1 && length(unique(effects[,ncol(effects)]))==2)
    diffmeth <- match.arg(diffmeth)
    if(diffmeth=="ttest"){
        paired <- FALSE
        if(ncol(effects) > 2 || (ncol(effects) == 2 &&
           any(sapply(split(effects[,2],effects[,1]),function(x)
                      length(x) !=2 || length(unique(x)) != 2)))){
            cat("try methods other than 'ttest' for a complex design.\n
                 differential analysis abandoned.\n")
            return(0)
        }else if(ncol(effects) == 2){
            paired <- TRUE
        }
    }
    if(ncol(count)==2){
        cat("No permutation performed due to not enough samples!\n")
        ## real test
        observed <- diffRegions(count=count, bins=bins, meta=meta,
                                design=design, sizefac=sizefac, rccut=rccut,
                                fccut=fccut, gap=gap, diffmeth=diffmeth)
        cat("Differential analysis of real data......done!\n")
    }else{
        observed <- diffRegions(count=count, bins=bins, meta=meta,
                                design=design, sizefac=sizefac, rccut=rccut,
                                fccut=fccut, gap=gap, diffmeth=diffmeth)
        cat("Differential analysis of real data......done!\n")
        ## permute test
        if(is.null(permute)){
            level <- effects[,ncol(effects)]
            subn <- min(floor(table(level)/2))
            combs <- combn(min(table(level)),subn)
            comb1 <- matrix(which(level==(unique(level)[1]))[combs],
                            nrow(combs))
            comb2 <- matrix(which(level==(unique(level)[2]))[combs],
                            nrow(combs))
            comb1f <- comb1[,rep(seq_len(ncol(comb1)),ncol(comb2))]
            comb2f <- comb2[,rep(seq_len(ncol(comb2)),each=ncol(comb1))]
            permute <- t(sapply(seq_len(ncol(comb1f)),function(i){
                ranks <- seq_len(ncol(count))
                tmpidx <- match(c(comb1f[,i],comb2f[,i]),ranks)
                ranks[tmpidx] <- c(comb2f[,i],comb1f[,i])
                ranks
            }))
        }
        if(maxperm<nrow(permute))
            permute <- permute[sample(seq_len(nrow(permute)),maxperm),]
        n <- nrow(permute)
        cat("Differential analysis of permute data...... Total",n,"times!\n")
        perm <- list()
        for(i in seq_len(n)){
            counttmp <- count[,permute[i,]]
            sizefactmp <- sizefac[permute[i,]]
            perm[[i]] <- diffRegions(count=counttmp, bins=bins, meta=meta,
                                     design=design, sizefac=sizefactmp,
                                     rccut=rccut, fccut=fccut, gap=gap,
                                     diffmeth=diffmeth)
            cat(i,"permute done!\n")
        }
        ## significance evaluation
        permp <- ecdf(abs(unlist(sapply(perm,function(x) x$stat))))
        observed$permPV <- 1-permp(abs(observed$stat))
    }
    observed
}
tengmx/ComplexDiff documentation built on May 31, 2019, 8:34 a.m.