# R/PolyfitFunctions.R
#
#	This file contains R functions to implement the Polyfit add-on to DESeq 
#	described in the paper "Improved error estimates for the analysis 
#	of differential expression from RNA-seq data"
#
#	pfNbinomTest()
#		replaces the DESeq function nbinomTest(); produces a p-value distribution 
#		without the 'flagpole' at p=1
#
#	pfNbinomTestForMatrices()
#		replaces the DESeq function nbinomTestForMatrices(); is called by 
#   pfNbinomTest()
#
#	twoSidedPValueFromDiscrete()
#		function to produce a 2-sided p-value with a uniform distribution on [0, 1] 
#		from a discrete distribution; is called by pfNbinomTestForMatrices() and 
#		by pfExactTestDoubleTail()
#
#	levelPValues()
#   Function to level out a p-value spectrum generated by DESeq or edgeR by 
#   fitting a quadratic function to the right-hand portion of the spectrum, and 
#   to produce 'corrected' p-values and q-values using an adapted version of 
#   the Storey-Tibshirani procedure
#########################################################################################
#
#	The function replacing the original function nbinomTest()
#
#	Conrad Burden, April 2013
#	Adapted from the original R code by Simon Anders
#
pfNbinomTest <- function (cds, condA, condB, pvals_only = FALSE, eps = NULL) 
{
    stopifnot(is(cds, "CountDataSet"))
    if (cds@multivariateConditions) 
        stop("For CountDataSets with multivariate conditions, only the GLM-based test can be used.")
    if (all(is.na(dispTable(cds)))) 
        stop("Call 'estimateDispersions' first.")
    if (dispTable(cds)[condA] == "blind" || dispTable(cds)[condB] == 
        "blind") {
        if (fitInfo(cds, "blind")$sharingMode != "fit-only") 
            warning("You have used 'method=\"blind\"' in estimateDispersion without also setting 'sharingMode=\"fit-only\"'. This will not yield useful results.")
    }
    stopifnot(condA %in% levels(conditions(cds)))
    stopifnot(condB %in% levels(conditions(cds)))
    if (!is.null(eps)) 
        warning("The 'eps' argument is defunct and hence ignored.")
    colA <- conditions(cds) == condA
    colB <- conditions(cds) == condB
    bmv <- getBaseMeansAndVariances(counts(cds)[, colA | colB], 
        sizeFactors(cds)[colA | colB])
    rawScvA <- fData(cds)[, paste("disp", dispTable(cds)[condA], 
        sep = "_")]
    rawScvB <- fData(cds)[, paste("disp", dispTable(cds)[condB], 
        sep = "_")]
#
#	Original code commented out:
#
#    pval <- nbinomTestForMatrices(counts(cds)[, colA], counts(cds)[, 
#        colB], sizeFactors(cds)[colA], sizeFactors(cds)[colB], 
#        rawScvA, rawScvB)
#
#	Replacement code calls the new version, pfNbinomTestForMatrices():
#
    pval <- pfNbinomTestForMatrices(counts(cds)[, colA], counts(cds)[, 
        colB], sizeFactors(cds)[colA], sizeFactors(cds)[colB], 
        rawScvA, rawScvB)
    if (pvals_only) 
        pval
    else {
        bmvA <- getBaseMeansAndVariances(counts(cds)[, colA], 
            sizeFactors(cds)[colA])
        bmvB <- getBaseMeansAndVariances(counts(cds)[, colB], 
            sizeFactors(cds)[colB])
        data.frame(id = rownames(counts(cds)), baseMean = bmv$baseMean, 
            baseMeanA = bmvA$baseMean, baseMeanB = bmvB$baseMean, 
            foldChange = bmvB$baseMean/bmvA$baseMean, 
                         log2FoldChange = log2(bmvB$baseMean/bmvA$baseMean), 
            pval = pval, padj = p.adjust(pval, method = "BH"), 
            stringsAsFactors = FALSE)
    }
}
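#
################################################################################
#
#	Example usage (a minimal sketch, not part of the original package): the 
#	intended pipeline, assuming the (legacy) DESeq package is installed and 
#	'cds' is a CountDataSet whose samples fall into two conditions "A" and "B", 
#	with size factors and dispersions already estimated; the object names here 
#	are illustrative only
#
if (FALSE) {                                    # not run when this file is sourced
    library(DESeq)                              # provides CountDataSet and accessors
    res <- pfNbinomTest(cds, "A", "B")          # p-values without the flagpole at p=1
    lev <- levelPValues(res$pval, plot = TRUE)  # levelled p-values and q-values
    head(lev$qValueCorr)
}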
#
################################################################################
#
#	The function replacing the original function nbinomTestForMatrices()
#
#	Conrad Burden, April 2013
#	Adapted from the original R code by Simon Anders
#
pfNbinomTestForMatrices <- function (countsA, countsB, sizeFactorsA, 
    sizeFactorsB, dispsA, dispsB) 
{
    kAs <- rowSums(cbind(countsA))
    kBs <- rowSums(cbind(countsB))
    mus <- rowMeans(cbind(t(t(countsA)/sizeFactorsA), 
                          t(t(countsB)/sizeFactorsB)))
    fullVarsA <- pmax(mus * sum(sizeFactorsA) + dispsA * mus^2 * 
        sum(sizeFactorsA^2), mus * sum(sizeFactorsA) * (1 + 1e-08))
    fullVarsB <- pmax(mus * sum(sizeFactorsB) + dispsB * mus^2 * 
        sum(sizeFactorsB^2), mus * sum(sizeFactorsB) * (1 + 1e-08))
    sumDispsA <- (fullVarsA - mus * sum(sizeFactorsA))/(mus * 
        sum(sizeFactorsA))^2
    sumDispsB <- (fullVarsB - mus * sum(sizeFactorsB))/(mus * 
        sum(sizeFactorsB))^2
    sapply(seq_along(kAs), function(i) {
        if (kAs[i] == 0 & kBs[i] == 0) 
            return(NA)
        ks <- 0:(kAs[i] + kBs[i])
        ps <- dnbinom(ks, mu = mus[i] * sum(sizeFactorsA), 
                      size = 1/sumDispsA[i]) * 
            dnbinom(kAs[i] + kBs[i] - ks, mu = mus[i] * sum(sizeFactorsB), 
                size = 1/sumDispsB[i])
#        pobs <- dnbinom(kAs[i], mu = mus[i] * sum(sizeFactorsA), 
#            size = 1/sumDispsA[i]) * dnbinom(kBs[i], mu = mus[i] * 
#            sum(sizeFactorsB), size = 1/sumDispsB[i])
#        stopifnot(pobs == ps[kAs[i] + 1])
#        if (kAs[i] * sum(sizeFactorsB) < kBs[i] * sum(sizeFactorsA)) 
#            numer <- ps[1:(kAs[i] + 1)]
#        else numer <- ps[(kAs[i] + 1):length(ps)]
#        min(1, 2 * sum(numer)/sum(ps))
#
#		Replacement code for 2-sided p-value without 'flagpole'
#
        probs <- ps/sum(ps)
        pValue <- twoSidedPValueFromDiscrete(probs, kAs[i])
        return(pValue)
#
    })
}
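#
################################################################################
#
#	Example usage (a minimal sketch, not part of the original package): calling 
#	pfNbinomTestForMatrices() directly on simulated null count matrices; the 
#	counts, size factors and dispersions below are made up for illustration
#
if (FALSE) {                                    # not run when this file is sourced
    set.seed(1)
    nGenes <- 100
    countsA <- matrix(rnbinom(3 * nGenes, mu = 50, size = 10), ncol = 3)
    countsB <- matrix(rnbinom(3 * nGenes, mu = 50, size = 10), ncol = 3)
    pvals <- pfNbinomTestForMatrices(countsA, countsB,
                                     sizeFactorsA = rep(1, 3),
                                     sizeFactorsB = rep(1, 3),
                                     dispsA = rep(0.1, nGenes),
                                     dispsB = rep(0.1, nGenes))
    hist(pvals, breaks = seq(0, 1, by = 0.05))  # roughly uniform under the null
}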
#
################################################################################
#
#	Function to calculate a 2-sided p-value of an observation xobs for a finite 
#	discrete distribution
#		Prob(X = xobs) = probs[xobs + 1]
#	over the range xobs in {0, 1, ..., xmax} by "squaring off" the distribution 
#	to a continuous distribution
#
#	Arguments: 
#	probs	an array containing the probabilities that X takes the values 0, 1, ...
#	xobs	the observed value of X  
#
#	Note that the returned 2-sided p-value contains a random component: the same 
#	inputs will in general return a different result on each call, so set the 
#	random seed if reproducibility is required 
#
#	April 2013
#	Conrad Burden
#
twoSidedPValueFromDiscrete <- function(probs, xobs){
    if(!all(probs >= 0)){
        stop("probs contains negative values \n")
    }
    if(abs(sum(probs) - 1) > 1.e-10){
        warning("probs do not sum to 1 and will be normalised \n")
    }
    probs <- probs/sum(probs)
    xmax <- length(probs) - 1
    if(length(xobs) != 1 || !is.element(xobs, 0:xmax)){
        stop("xobs not a single value in the range of probs")
    }
#
#	choose a cut point xcut = xobs + randomFrac, uniformly distributed between 
#	xobs and (xobs + 1); only the fractional part is needed below
#
    randomFrac <- runif(1)
#
#	p-value calculated from either the lower or the upper tail of the 
#	"squared-off" distribution
#
    distribFunct <- c(0, cumsum(probs))
    leftSidePValue <- distribFunct[xobs + 1] + randomFrac*probs[xobs + 1]
    pVal <- 2*min(leftSidePValue, 1 - leftSidePValue)
    return(pVal)
}
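#
################################################################################
#
#	Example usage (a minimal sketch, not part of the original package): p-values 
#	for draws from a Binomial(20, 0.5) null distribution; with the squaring-off 
#	they come out approximately uniform on [0, 1]
#
if (FALSE) {                                    # not run when this file is sourced
    set.seed(1)
    probs <- dbinom(0:20, size = 20, prob = 0.5)
    xs <- rbinom(10000, size = 20, prob = 0.5)
    pvals <- sapply(xs, function(x) twoSidedPValueFromDiscrete(probs, x))
    hist(pvals, breaks = seq(0, 1, by = 0.05))  # close to flat
}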
#
################################################################################
#
#   Function to level out a p-value spectrum generated by DESeq or edgeR by 
#   fitting a quadratic function to the right-hand portion of the spectrum, and 
#   to produce 'corrected' p-values and q-values using an adapted version of 
#   the Storey-Tibshirani procedure
#
#   Arguments:
#   oldPvals	an array of p-values produced by the replacement DESeq function 
#               pfNbinomTest() or the replacement edgeR function pfExactTest()
#   plot    	TRUE to plot the original and corrected p-value spectra; 
#               FALSE (the default) not to plot
#
#	Returns a list containing:
#		$pi0estimate	an estimate of the proportion of genes not differentially 
#                  expressed
#		$lambdaOptimal	the point in the p-value spectrum past which a quadratic 
#                 is fitted
#		$pValueCorr		p-values calculated from the levelled spectrum 
#		$qValueCorr		q-values calculated from the levelled spectrum 
#		$qValueCorrBH	q-values calculated from $pValueCorr using Benjamini-Hochberg 
#
#	April 2013
#	Conrad Burden
#
levelPValues <- function(oldPvals, plot=FALSE){
#
#
        nGenes <- length(oldPvals) 
        originalHist <- hist(oldPvals, breaks=seq(0,1,by=0.01), plot=FALSE)
        x <- originalHist$mids		# x and y are the coordinates of the midpoints 
        y <- originalHist$counts	# of the tops of the histogram bars
#
#	Define the quadratic fitting function and its integral:
# 
        quadFunct <- function(x, a){
            quadFunct <- a[1] + a[2]*x + a[3]*x^2 
            quadFunct
            }
#
        intQuadFunct <- function(x, a){
            intQuadFunct <- a[1]*x + a[2]*x^2/2 + a[3]*x^3/3
            intQuadFunct
            }
#
#	Set up some arrays for loop over lambda
# 
        lambdaArray <- seq(0.1, 0.95, by=0.01)
        aEstimate <- array(dim=c(3,length(lambdaArray)))
        pi0hat <- array(dim=length(lambdaArray))
        astart <- c(lm(y~x)$coeff, 0)
        lambdaIndex <- 0
#
        for(lambda in lambdaArray){
            lambdaIndex <- lambdaIndex + 1
            xLambda <- x[x>=lambda]
            yLambda <- y[x>=lambda]
#	
#	Define a function equal to the sum of squares residuals, which gets minimised
#
            sumSq <- function(a){
                sumSq <- sum((yLambda - quadFunct(xLambda, a))^2)
                sumSq
                }
#
#	Minimise the sum of squares residuals, 
#	fitted parameters are stored in the variable aEstimate
#	nlm() needs an initial guess for the parameters:
#		use a straight line fit for the first 2 parameters first time through
#		then use aEstimate from previous lambda
#
            fit <-  nlm(sumSq, astart)
            aEstimate[, lambdaIndex] <- fit$estimate
            astart <- aEstimate[, lambdaIndex]
#
#
#	piZero(lambda) estimate: 
#	(area under the fitted quadratic from 0 to 1)/(area under the histogram, 
#	i.e. sum(y) times the bin width 0.01)
#
            pi0hat[lambdaIndex] <- intQuadFunct(1, astart)/sum(y)/0.01
            }
#
#
#	Calculate a kernel density estimate using only the 'reasonable' pi0hat values
#
        reasonablePi0hats <- pi0hat>=0 & pi0hat<2
        pi0hatR <- pi0hat[reasonablePi0hats]
#
#	Optimal lambda and pi_0 are obtained from the peak of the density of pi0hat
#
        ddd <- density(pi0hatR, na.rm=TRUE)
        pi0estimate <- ddd$x[which.max(ddd$y)]
#
#	Use the fit whose pi0hat is closest to the density peak; the same index must 
#	select both the optimal lambda and the fitted quadratic coefficients
#
        bestIndex <- which.min(abs(pi0hat - pi0estimate))
        lambdaOptimal <- lambdaArray[bestIndex]
        aEst <- aEstimate[, bestIndex]
#
#	Calculate 'corrected' p-values
#
        pValueCorr <- apply(as.array(oldPvals), 1, FUN=intQuadFunct, a=aEst)	
        pValueCorr <- pValueCorr/intQuadFunct(1, aEst)
#
#	Calculate 'corrected' q-values
#
        nGenes1 <- length(oldPvals[!is.na(oldPvals)])
        TPplusFP <- array(dim=nGenes)
        TPplusFP[order(pValueCorr)] <- (1:nGenes) # any NAs present are ordered last
        FP <- pValueCorr*pi0estimate*nGenes1
        qValueCorr <- FP/TPplusFP
        qValueCorrBH <- p.adjust(pValueCorr, method="BH")
#
#	Plots
#
        if (plot){
            oldpar <- par(mfrow=c(2,2))
#
#	Plot pi0 estimates at each lambda
#	
            plot(lambdaArray,pi0hat, ylim=c(0.6,1.2), pch=16, cex=0.9, xlim=c(0, 1),
                xlab=expression(lambda), ylab = expression(hat(pi)[0](lambda)), 
                main = "")
                mtext(substitute(hat(pi)[0]*" = "*this*"  at  "*lambda*" = "*that, 
                        list(this=round(pi0estimate, digits = 3), 
                        that=round(lambdaOptimal, digits = 2))), 
                        side = 3, font=10)
            abline(h=pi0estimate, lty=2, cex=0.5)
            points(lambdaOptimal, pi0estimate,col="red", pch=16, cex=1.25)	
#
#	Plot histogram of original p-values and superimpose quadratic
#
            plot(originalHist, 
                xlab = "p-value", main="Original P-value spectrum")
            xPointsLeft <- seq(0,lambdaOptimal,by=0.01)
            xPointsRight <- seq(lambdaOptimal,1,by=0.01)
            points(xPointsLeft, quadFunct(xPointsLeft,aEst), 
                type = "l", col= "green",lwd=1.25)
            points(xPointsRight, quadFunct(xPointsRight,aEst), 
                type = "l", col= "red",lwd=1.25)
#
#	Plot density of pi0 estimates at each lambda
#	
            plot(ddd, xlab = expression(hat(pi)[0](lambda)), main = "")
#
#	Plot histogram of corrected p-values and straight line at pi0hat
#
            hist(pValueCorr, breaks=seq(0,1,by=0.01), xlab = "corrected p-value", 
                main="Corrected P-value spectrum")
#
            height <- nGenes1*pi0estimate/100
            lambdaCorr <- intQuadFunct(lambdaOptimal, aEst)/intQuadFunct(1, aEst)
            points(c(0,lambdaCorr),c(height,height), col="green", type="l",lwd=1.25)
            points(c(lambdaCorr,1),c(height,height), col="red", type="l",lwd=1.25)			
#
            par(oldpar)
            }
#
        list(pi0estimate=pi0estimate, lambdaOptimal=lambdaOptimal, 
            pValueCorr=pValueCorr, qValueCorr=qValueCorr, qValueCorrBH=qValueCorrBH)
}
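#
################################################################################
#
#	Example usage (a minimal sketch, not part of the original package): levelling 
#	a simulated p-value spectrum, a 90:10 mixture of uniform 'null' p-values and 
#	small 'differentially expressed' p-values; the true pi_0 here is roughly 0.9
#
if (FALSE) {                                    # not run when this file is sourced
    set.seed(1)
    simPvals <- c(runif(9000), rbeta(1000, shape1 = 0.1, shape2 = 1))
    lev <- levelPValues(simPvals, plot = TRUE)
    lev$pi0estimate                             # should be roughly 0.9
    sum(lev$qValueCorr < 0.05, na.rm = TRUE)    # genes called at q < 0.05
}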
#
