#
# This file contains R functions to implement the Polyfit add-on to DESeq
# described in the paper "Improved error estimates for the analysis
# of differential expression from RNA-seq data"
#
# pfNbinomTest()
# replaces the DESeq function nbinomTest(); produces a p-value distribution
# without the 'flagpole' at p=1
#
# pfNbinomTestForMatrices()
# replaces the DESeq function nbinomTestForMatrices(); is called by
# pfNbinomTest()
#
# twoSidedPValueFromDiscrete()
# function to produce a 2-sided p-value with a uniform distribution on [0, 1]
# from a discrete distribution; is called by pfNbinomTestForMatrices() and
# by pfExactTestDoubleTail()
#
# levelPValues()
# Function to level out a p-value spectrum generated by DESeq or edgeR by
# fitting a quadratic function to the right-hand portion of the spectrum, and
# to produce 'corrected' p-values and q-values using an adapted version of
# the Storey-Tibshirani procedure
#########################################################################################
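#
# Example workflow (an illustrative sketch, not part of the Polyfit code):
# assumes the DESeq package (not DESeq2); 'countTable' below is a made-up
# placeholder for a matrix of raw integer counts with genes in rows. Note
# that pfNbinomTest() calls the unexported DESeq helper
# getBaseMeansAndVariances(), so the DESeq namespace must be available.
# The if (FALSE) guard keeps the example from running when this file is sourced.
#
if (FALSE) {
  library(DESeq)
  conds <- factor(c("A", "A", "B", "B"))
  cds <- newCountDataSet(countTable, conds)
  cds <- estimateSizeFactors(cds)
  cds <- estimateDispersions(cds, method = "per-condition")
  res <- pfNbinomTest(cds, "A", "B")         # p-values without the flagpole at p = 1
  lev <- levelPValues(res$pval, plot = TRUE) # levelled spectrum, corrected p/q-values
}
#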
#
# The function replacing the original function nbinomTest()
#
# Conrad Burden, April 2013
# Adapted from the original R code by Simon Anders
#
pfNbinomTest <- function (cds, condA, condB, pvals_only = FALSE, eps = NULL)
{
stopifnot(is(cds, "CountDataSet"))
if (cds@multivariateConditions)
stop("For CountDataSets with multivariate conditions, only the GLM-based test can be used.")
if (all(is.na(dispTable(cds))))
stop("Call 'estimateDispersions' first.")
if (dispTable(cds)[condA] == "blind" || dispTable(cds)[condB] ==
"blind") {
if (fitInfo(cds, "blind")$sharingMode != "fit-only")
warning("You have used 'method=\"blind\"' in estimateDispersion without also setting 'sharingMode=\"fit-only\"'. This will not yield useful results.")
}
stopifnot(condA %in% levels(conditions(cds)))
stopifnot(condB %in% levels(conditions(cds)))
if (!is.null(eps))
warning("The 'eps' argument is defunct and hence ignored.")
colA <- conditions(cds) == condA
colB <- conditions(cds) == condB
bmv <- getBaseMeansAndVariances(counts(cds)[, colA | colB],
sizeFactors(cds)[colA | colB])
rawScvA <- fData(cds)[, paste("disp", dispTable(cds)[condA],
sep = "_")]
rawScvB <- fData(cds)[, paste("disp", dispTable(cds)[condB],
sep = "_")]
#
# Original code commented out:
#
#  pval <- nbinomTestForMatrices(counts(cds)[, colA], counts(cds)[, colB],
#      sizeFactors(cds)[colA], sizeFactors(cds)[colB], rawScvA, rawScvB)
#
# Replacement code calls the new version, pfNbinomTestForMatrices()
#
pval <- pfNbinomTestForMatrices(counts(cds)[, colA], counts(cds)[, colB],
    sizeFactors(cds)[colA], sizeFactors(cds)[colB],
    rawScvA, rawScvB)
if (pvals_only)
pval
else {
bmvA <- getBaseMeansAndVariances(counts(cds)[, colA],
sizeFactors(cds)[colA])
bmvB <- getBaseMeansAndVariances(counts(cds)[, colB],
sizeFactors(cds)[colB])
data.frame(id = rownames(counts(cds)), baseMean = bmv$baseMean,
baseMeanA = bmvA$baseMean, baseMeanB = bmvB$baseMean,
foldChange = bmvB$baseMean/bmvA$baseMean,
log2FoldChange = log2(bmvB$baseMean/bmvA$baseMean),
pval = pval, padj = p.adjust(pval, method = "BH"),
stringsAsFactors = FALSE)
}
}
#
################################################################################
#
# The function replacing the original function nbinomTestForMatrices()
#
# Conrad Burden, April 2013
# Adapted from the original R code by Simon Anders
#
pfNbinomTestForMatrices <-
function (countsA, countsB, sizeFactorsA, sizeFactorsB, dispsA,
dispsB)
{
kAs <- rowSums(cbind(countsA))
kBs <- rowSums(cbind(countsB))
mus <- rowMeans(cbind(t(t(countsA)/sizeFactorsA),
t(t(countsB)/sizeFactorsB)))
fullVarsA <- pmax(mus * sum(sizeFactorsA) + dispsA * mus^2 *
sum(sizeFactorsA^2), mus * sum(sizeFactorsA) * (1 + 1e-08))
fullVarsB <- pmax(mus * sum(sizeFactorsB) + dispsB * mus^2 *
sum(sizeFactorsB^2), mus * sum(sizeFactorsB) * (1 + 1e-08))
sumDispsA <- (fullVarsA - mus * sum(sizeFactorsA))/(mus *
sum(sizeFactorsA))^2
sumDispsB <- (fullVarsB - mus * sum(sizeFactorsB))/(mus *
sum(sizeFactorsB))^2
sapply(1:length(kAs), function(i) {
if (kAs[i] == 0 & kBs[i] == 0)
return(NA)
ks <- 0:(kAs[i] + kBs[i])
ps <- dnbinom(ks, mu = mus[i] * sum(sizeFactorsA),
size = 1/sumDispsA[i]) *
dnbinom(kAs[i] + kBs[i] - ks, mu = mus[i] * sum(sizeFactorsB),
size = 1/sumDispsB[i])
# pobs <- dnbinom(kAs[i], mu = mus[i] * sum(sizeFactorsA),
# size = 1/sumDispsA[i]) * dnbinom(kBs[i], mu = mus[i] *
# sum(sizeFactorsB), size = 1/sumDispsB[i])
# stopifnot(pobs == ps[kAs[i] + 1])
# if (kAs[i] * sum(sizeFactorsB) < kBs[i] * sum(sizeFactorsA))
# numer <- ps[1:(kAs[i] + 1)]
# else numer <- ps[(kAs[i] + 1):length(ps)]
# min(1, 2 * sum(numer)/sum(ps))
#
# Replacement code for 2-sided p-value without 'flagpole'
#
probs <- ps/sum(ps)
pValue <- twoSidedPValueFromDiscrete(probs, kAs[i])
return(pValue)
#
})
}
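#
# Illustrative direct call (normally pfNbinomTest() supplies these arguments);
# the counts, size factors and dispersions below are invented. The if (FALSE)
# guard keeps the example from running when this file is sourced.
#
if (FALSE) {
  set.seed(1)
  countsA <- matrix(rnbinom(20, mu = 50, size = 10), ncol = 2)  # 10 genes x 2 reps
  countsB <- matrix(rnbinom(20, mu = 50, size = 10), ncol = 2)
  pvals <- pfNbinomTestForMatrices(countsA, countsB,
                                   sizeFactorsA = c(1, 1), sizeFactorsB = c(1, 1),
                                   dispsA = rep(0.1, 10), dispsB = rep(0.1, 10))
  hist(pvals, breaks = 20)  # should be roughly flat under the null
}
#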
#
################################################################################
#
# Function to calculate a 2-sided p-value of an observation xobs from a finite
# discrete distribution
#     Prob(X = xobs) = probs[xobs + 1]
# over the range xobs in {0, 1, ..., xmax}, by "squaring off" the distribution
# to a continuous distribution
#
# Arguments:
# probs an array containing the probabilities that X takes the values 0, 1, ...
# xobs the observed value of X
#
# Note that the returned 2-sided p-value contains a random component, i.e. the
# same set of input parameters will generally return a different result each time
#
# April 2013
# Conrad Burden
#
twoSidedPValueFromDiscrete <- function(probs,xobs){
if(!all(probs >= 0)){
stop("probs contains negative values \n")
}
if(abs(sum(probs) - 1) >1.e-10){
warning("probs do not sum to 1 and will be normalised \n")
}
probs <- probs/sum(probs)
xmax <- length(probs) - 1
if(length(xobs)!= 1){
stop("xobs not a single value in the range of probs")
}
if(!is.element(xobs, 0:xmax)){
stop("xobs not a single value in the range of probs")
}
#
# Choose a cut point uniformly at random between xobs and (xobs + 1);
# only its fractional part, randomFrac, is needed below
#
randomFrac <- runif(1)
#
# p-value calculated either from lower or upper tail of "squared-off"
# distribution
#
distribFunct <- c(0, cumsum(probs))
leftSidePValue <- distribFunct[xobs+1] + randomFrac*probs[xobs+1]
pVal <- 2*min(leftSidePValue, 1 - leftSidePValue)
return(pVal)
}
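#
# Quick illustration (invented parameters): under the null hypothesis, repeated
# draws should give approximately uniform p-values. The if (FALSE) guard keeps
# the example from running when this file is sourced.
#
if (FALSE) {
  set.seed(2)
  probs <- dbinom(0:20, size = 20, prob = 0.5)  # a Binomial(20, 0.5) pmf
  xs <- rbinom(5000, size = 20, prob = 0.5)     # null observations
  pvals <- sapply(xs, function(x) twoSidedPValueFromDiscrete(probs, x))
  hist(pvals, breaks = 20)  # approximately flat on [0, 1]
}
#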
#
################################################################################
#
# Function to level out a p-value spectrum generated by DESeq or edgeR by
# fitting a quadratic function to the right-hand portion of the spectrum, and
# to produce 'corrected' p-values and q-values using an adapted version of the
# Storey-Tibshirani procedure
#
# Arguments:
# oldPvals an array of p-values produced by the replacement DESeq function
# pfNbinomTest() or the replacement edgeR function pfExactTest()
# plot TRUE to plot original and corrected pvalue spectra; FALSE not to plot
#
# Returns a list containing:
# $pi0estimate an estimate of the proportion of genes not differentially
# expressed
# $lambdaOptimal the point in the p-value spectrum past which a quadratic
# is fitted
# $pValueCorr p-values calculated from the levelled spectrum
# $qValueCorr q-values calculated from the levelled spectrum
# $qValueCorrBH q-values calculated from $pValueCorr using Benjamini-Hochberg
#
# April 2013
# Conrad Burden
#
levelPValues <- function(oldPvals, plot=FALSE){
#
#
nGenes <- length(oldPvals)
originalHist <- hist(oldPvals, breaks=seq(0,1,by=0.01), plot=FALSE)
x <- originalHist$mids    # x and y are the coordinates of the midpoints of
y <- originalHist$counts  # the tops of the bars of the histogram
#
# Define the quadratic fitting function and its integral:
#
quadFunct <- function(x, a){
quadFunct <- a[1] + a[2]*x + a[3]*x^2
quadFunct
}
#
intQuadFunct <- function(x, a){
intQuadFunct <- a[1]*x + a[2]*x^2/2 + a[3]*x^3/3
intQuadFunct
}
#
# Set up some arrays for loop over lambda
#
lambdaArray <- seq(0.1, 0.95, by=0.01)
aEstimate <- array(dim=c(3,length(lambdaArray)))
pi0hat <- array(dim=length(lambdaArray))
astart <- c(lm(y~x)$coeff, 0)
lambdaIndex <- 0
#
for(lambda in lambdaArray){
lambdaIndex <- lambdaIndex + 1
xLambda <- x[x>=lambda]
yLambda <- y[x>=lambda]
#
# Define a function equal to the sum of squares residuals, which gets minimised
#
sumSq <- function(a){
sumSq <- sum((yLambda - quadFunct(xLambda, a))^2)
sumSq
}
#
# Minimise the sum of squares residuals,
# fitted parameters are stored in the variable aEstimate
# nlm() needs an initial guess for the parameters:
# use a straight line fit for the first 2 parameters first time through
# then use aEstimate from previous lambda
#
fit <- nlm(sumSq, astart)
aEstimate[, lambdaIndex] <- fit$estimate
astart <- aEstimate[, lambdaIndex]
#
#
# piZero(lambda) estimate:
# (area under the fitted quadratic from 0 to 1)/(area under histogram),
# where the area under the histogram is sum(y) times the bin width 0.01
#
pi0hat[lambdaIndex] <- intQuadFunct(1, astart)/(sum(y)*0.01)
}
#
#
# Calculate a density plot using only the 'reasonable' pi0hat's
#
reasonablePi0hats <- pi0hat>=0 & pi0hat<2
pi0hatR <- pi0hat[reasonablePi0hats]
#
# Optimal lambda and pi_0 got by seeing where density of pi0 peaks
#
ddd <- density(pi0hatR, na.rm=TRUE)
pi0estimate <- ddd$x[which.max(ddd$y)]
#
# Index (in the full lambda grid) of the reasonable pi0hat closest to the peak
optIndex <- which(reasonablePi0hats)[which.min(abs(pi0hatR-pi0estimate))]
lambdaOptimal <- lambdaArray[optIndex]
aEst <- aEstimate[, optIndex]
#
# Calculate 'corrected' p-values
#
pValueCorr <- apply(as.array(oldPvals), 1, FUN=intQuadFunct, a=aEst)
pValueCorr <- pValueCorr/intQuadFunct(1, aEst)
#
# Calculate 'corrected' q-values
#
nGenes1 <- length(oldPvals[!is.na(oldPvals)])
TPplusFP <- array(dim=nGenes)
TPplusFP[order(pValueCorr)] <- (1:nGenes) # any NAs present are ordered last
FP <- pValueCorr*pi0estimate*nGenes1
qValueCorr <- FP/TPplusFP
qValueCorrBH <- p.adjust(pValueCorr, method="BH")
#
# Plots
#
if (plot){
oldpar <- par(mfrow=c(2,2))
#
# Plot pi0 estimates at each lambda
#
plot(lambdaArray,pi0hat, ylim=c(0.6,1.2), pch=16, cex=0.9, xlim=c(0, 1),
xlab=expression(lambda), ylab = expression(hat(pi)[0](lambda)),
main = "")
mtext(substitute(hat(pi)[0]*" = "*this*" at "*lambda*" = "*that,
list(this=round(pi0estimate, digits = 3),
that=round(lambdaOptimal, digits = 2))),
side = 3, font=10)
abline(h=pi0estimate, lty=2, cex=0.5)
points(lambdaOptimal, pi0estimate,col="red", pch=16, cex=1.25)
#
# Plot histogram of original p-values and superimpose quadratic
#
plot(originalHist,
xlab = "p-value", main="Original P-value spectrum")
xPointsLeft <- seq(0,lambdaOptimal,by=0.01)
xPointsRight <- seq(lambdaOptimal,1,by=0.01)
points(xPointsLeft, quadFunct(xPointsLeft,aEst),
type = "l", col= "green",lwd=1.25)
points(xPointsRight, quadFunct(xPointsRight,aEst),
type = "l", col= "red",lwd=1.25)
#
# Plot density of pi0 estimates at each lambda
#
plot(ddd, xlab = expression(hat(pi)[0](lambda)), main = "")
#
# Plot histogram of corrected p-values and straight line at pi0hat
#
hist(pValueCorr, breaks=seq(0,1,by=0.01), xlab = "corrected p-value",
main="Corrected P-value spectrum")
#
height <- nGenes1*pi0estimate/100
lambdaCorr <- intQuadFunct(lambdaOptimal, aEst)/intQuadFunct(1, aEst)
points(c(0,lambdaCorr),c(height,height), col="green", type="l",lwd=1.25)
points(c(lambdaCorr,1),c(height,height), col="red", type="l",lwd=1.25)
#
par(oldpar)
}
#
list(pi0estimate=pi0estimate, lambdaOptimal=lambdaOptimal,
pValueCorr=pValueCorr, qValueCorr=qValueCorr, qValueCorrBH=qValueCorrBH)
}
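#
# Illustrative use on simulated p-values: 90% uniform nulls plus 10% of
# p-values concentrated near zero (all numbers invented). The if (FALSE)
# guard keeps the example from running when this file is sourced.
#
if (FALSE) {
  set.seed(3)
  simPvals <- c(runif(9000), rbeta(1000, 0.2, 5))
  lev <- levelPValues(simPvals, plot = TRUE)
  lev$pi0estimate   # should be roughly 0.9
  head(lev$qValueCorr[order(lev$pValueCorr)])
}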
#