R/sizeFac.R

#' @title Estimate Size Factors of Samples
#'
#' @description This function estimates sample size factors for normalization
#' purpose in downsteam analysis. Size factors of sample pairs are estimated
#' firstly by comparing samples to one reference sample (i.e. the sample
#' corresponding to first column of count matrix). Then, size factors are
#' combined across all samples with the median size factor as 1. In detail,
#' binding type is first estimated using the same strategy as function
#' \code{chipType} for each sample pair. When biomodel, size factor is
#' calcualted based on a decision on which kernale density mode to be used for
#' scaling.
#'
#' @param count A matrix of read counts or a SummarizedExperiment, where
#' columns are samples and rows are peaks or high coverage bins. This object
#' can be generated by function \code{regionReads}.
#' @param cutoff A numeric cut off on count matrix. If positive, only
#' peaks/bins with counts larger than \code{cutoff} in at least one sample are
#' used to estimate the size factors. We recommend a larger cutoff since
#' background signal can dramatically mask the right estimation of kernal
#' density, especially for deep sequenced ChIP-seq samples. (Default: 50)
#' @param fold A numeric threshold to help determining the binding type. In
#' detail, if top 2 estimated modes on smoothed kernal density have a height
#' differece less than the folds given by \code{fold}, binding type will be
#' determined as bimodel; otherwise, it is unimodel. This number should be
#' larger than 1. (Default: 10)
#' @param h Initial smoothing factor when estimating kernal density for bump
#' hunting. (Default: 0.1)
#' @param plot A logical indicator that if M-A plot and smoothed kernal density
#' should be visualized. (Default: FALSE)
#' @param sanity A logical indicator if checking sanity across replicates in
#' the same conditions. A negative report of sanity check indicates either a
#' bad experiment (e.g. binding type is not consistent across replicates) or a
#' bad initiation of function parameters (e.g. \code{cutoff} and \code{fold}
#' are not pre-estimated well). However, a negative report of sanity check
#' doesn't neccessarily mean a bad estimation of size factors, as the strategy
#' of hunting kernal density mode is robust to find the right scaling
#' regardless of unimodel or bimodel. (Default: FALSE)
#' @param cond \code{NULL} or a two-level factor specifying biological
#' conditions for samples in \code{count} (e.g. control & treatment). This
#' parameter is different from the meta information in \code{count} when
#' \code{count} is a SummarizedExperiment. It only includes information of
#' conditions, and should be a factor object. This parameter is only appliable
#' when \code{sanity} is TRUE. (Default: NULL)
#'
#' @importFrom matrixStats rowMaxs
#' @importFrom matrixStats rowDiffs
#' @import SummarizedExperiment
#' @importFrom methods is
#' @importFrom graphics abline
#' @importFrom graphics layout
#' @importFrom stats dnorm
#'
#' @return
#' A list with the following conponents:
#' \item{sizefac}{A numeric vector indicating estimated size factors of
#' samples}
#' \item{type}{A character vector with value either "bimodel" or "unimodel",
#' indicating the binding types by comparing ro the sample "control"}
#'
#' @export
#'
#' @examples
#' ## load sample data
#' data(complex)
#' names(complex)
#'
#' ## test sample data
#' sizeFac(count=complex$counts)

sizeFac <- function(count, cutoff=50L, fold=10, h=0.1, plot=FALSE,
                            sanity=FALSE, cond=NULL){
    stopifnot((is.matrix(count) || is(count,"SummarizedExperiment")) &&
              ncol(count) >= 2)
    stopifnot(is.numeric(cutoff) && length(cutoff) == 1 && cutoff >= 0)
    stopifnot(is.numeric(fold) && length(fold) == 1 && fold > 1)
    stopifnot(is.numeric(h) && length(h) == 1 && h > 0)
    stopifnot(is.logical(plot) && length(plot) == 1)
    stopifnot(is.logical(sanity) && length(sanity) == 1)
    if(sanity){
        stopifnot(is.factor(cond) && nlevels(cond) == 2 &&
                  ncol(count) == length(cond))
    }
    ## loop on each sample pair
    if(is(count,"SummarizedExperiment")) count <- assay(count,1)
    sizefacs <- 1
    cmplxtypes <- 'unimodel'
    for(i in seq_len(ncol(count))[-1]){
        countpair <- count[,c(1,i)]
        ## raw M & A
        counttmp <- countpair[rowMaxs(countpair) >= cutoff,]
        logcount <- log2(counttmp + 0.5)
        M <- rowDiffs(logcount)
        A <- rowMeans(logcount)
        ## kernal density bumps
        bump <- c()
        hpair <- h
        ix <- seq(round(min(M), 1), max(M), 0.05)
        while(length(bump)<2){
            dm <- rowMeans(sapply(M, function(m) dnorm(ix, mean=m, sd=hpair)))
            bump <- which(diff(sign(diff(dm))) == -2) + 1
            bump2 <- bump[order(dm[bump],
                                decreasing=TRUE)[seq_len(
                                min(2,length(bump)))]]
            mu <- ix[bump2]
            mudm <- dm[bump2]
            hpair <- hpair * 0.8
        }
        ## chip type
        enrich <- abs(log(mudm[1] / mudm[2]))
        if(enrich <= log(fold)){
            cmplxtype <- "bimodel"
        }else{
            cmplxtype <- "unimodel"
        }
        ## size factor
        M0idx <- 1
        if(cmplxtype == "bimodel"){
            Mtmp <- M[A > max(A) * 0.8]
            ind <- vector('integer',length=length(Mtmp))
            ind[abs(Mtmp-mu[1]) <= abs(Mtmp-mu[2])] <- 1L
            if(sum(ind) < length(ind)/2){
                M0idx <- 2
            }
        }
        M0 <- mu[M0idx]
        sizefac <- c(1,2^M0)
        ## plots
        if(plot){
            layout(matrix(1:2,1,2))
            plot(A,M,pch=20,cex=0.5,main=cmplxtype)
            abline(h=0,lty=2,col='red',lwd=2)
            plot(density(M,na.rm = TRUE,adjust=1),xlab='M',
                 main=cmplxtype,lwd=2)
            if(cmplxtype == "bimodel")
                abline(v=mu,lty=2,col='blue',lwd=2)
            else abline(v=M0,lwd=2,col='blue',lty=2)
        }
        sizefacs <- c(sizefacs,sizefac[2])
        cmplxtypes <- c(cmplxtypes,cmplxtype)
    }
    ## sanity check
    if(sanity){
        sanind <- all(sapply(levels(cond),function(x)
                   length(unique(cmplxtypes[cond==x]))==1))
        if(sanind) cat("Sanity check passed...\n")
        else cat("Sanity check failed...\n")
    }
    sizefacs <- sizefacs / median(sizefacs)
    cmplxtypes[1] <- "control"
    list(sizefac=sizefacs,type=cmplxtypes)
}
tengmx/ComplexDiff documentation built on May 31, 2019, 8:34 a.m.