suppressPackageStartupMessages(library(edgeR)) # copied helpers below call glmFit, aveLogCPM, WLEB, ... unqualified
suppressPackageStartupMessages(library(genefilter)) # for filtered_p(), used in the independent-filtering step
root <- "~/projects/conquerDE"
devtools::load_all(root)
data(L)
## --- functions
zeroWeightsLibSizeFast <- function(counts, design, initialWeightAt0 = TRUE,
maxit = 100, plot = FALSE, plotW = FALSE,
designZI = NULL, wTol = 1e-4){
### EM-algorithm for estimating weights
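## Overview: zeros are treated as a mixture of excess (dropout) zeros and
## negative binomial zeros. Each iteration below (i) refits the weighted NB
## model (M-step for the count component), (ii) refits the zero-inflation
## model on the expected excess-zero counts (M-step for the mixture), and
## (iii) updates the posterior weights of the observed zeros (E-step).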
#require(edgeR)
# if(plot | plotW) par(mfrow=c(1,plot+plotW))
## accept either a raw count matrix or an existing DGEList
if(!is(counts, "DGEList")) counts <- edgeR::DGEList(counts)
counts <- edgeR::calcNormFactors(counts)
effLibSize <- counts$samples$lib.size*counts$samples$norm.factors
logEffLibSize <- log(effLibSize)
zeroId <- counts$counts==0
w <- matrix(1,nrow=nrow(counts),ncol=ncol(counts), dimnames=list(c(1:nrow(counts)), NULL))
#if(initialWeightAt0) w[zeroId] <- 0.1 else w[zeroId] <- 0.9
## starting values based on P(zero) in the library
for(k in 1:ncol(w)) w[counts$counts[,k]==0,k] <- 1-mean(counts$counts[,k]==0)
for(i in 1:maxit){
counts$weights <- w
### M-step counts
counts <- edgeR::estimateGLMCommonDisp(counts, design, interval=c(0,10))
counts <- edgeR::estimateGLMTagwiseDisp(counts, design, prior.df=0, min.row.sum=1)
# if(plot) plotBCV(counts)
fit <- edgeR::glmFit(counts, design)
likC <- stats::dnbinom(counts$counts, mu=fit$fitted.values, size=1/counts$tagwise.dispersion)
### M-step mixture parameter: model zero probability
successes <- colSums(1-w) # expected number of excess zeros per sample
failures <- colSums(w) # expected number of NB-generated counts per sample
if(is.null(designZI)){
zeroFit <- stats::glm(cbind(successes,failures) ~ logEffLibSize, family="binomial")} else{
zeroFit <- stats::glm(cbind(successes,failures) ~-1+designZI, family="binomial")}
pi0Hat <- predict(zeroFit,type="response")
## E-step: Given estimated parameters, calculate expected value of weights
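## For an observed zero, the posterior probability of it being an excess zero
## is pi0/(pi0 + (1-pi0)*f_NB(0)); its weight is one minus that probability.
## Nonzero counts keep a weight of 1 (zeroId is 0, so w evaluates to 1).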
pi0HatMat <- edgeR::expandAsMatrix(pi0Hat,dim=dim(counts),byrow=TRUE)
wOld <- w
w <- 1-pi0HatMat*zeroId/(pi0HatMat*zeroId+(1-pi0HatMat)*likC*zeroId+1e-15)
rownames(w) <- rownames(wOld)
if(plotW) hist(w[zeroId])
## stop once the weights of the observed zeros have converged; this puts
## the otherwise unused wTol argument to work
if(max(abs(w - wOld)) < wTol) break
}
return(w)
}
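## A minimal sketch of calling the EM weight estimator on its own (toy data,
## purely illustrative; run_zingeR below calls it on the real DGEList):
# toy_counts <- matrix(rnbinom(2000, mu = 5, size = 0.5), nrow = 200)
# toy_design <- model.matrix(~ rep(0:1, each = 5))
# w <- zeroWeightsLibSizeFast(toy_counts, toy_design, maxit = 20)
# hist(w[toy_counts == 0]) # posterior weights of the observed zeros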
counts <- L$counts
model_matrix <- L$design
contrast <- L$contrast
meta <- L$meta
run_zingeR <- function(L, alpha = 0.05) {
  counts <- L$counts
  meta <- L$meta
  model_matrix <- L$design
  dge <- edgeR::DGEList(counts, samples = meta)
  dge <- edgeR::calcNormFactors(dge)
  niter <- 30
  # You may want to increase this for large (~ >200 cells) datasets; you can
  # check this by looking at whether the weight distribution of the zero
  # counts is still changing over the last few iterations.
  # internal function defined above
  zeroWeights <- zeroWeightsLibSizeFast(dge, model_matrix, plot = FALSE,
                                        maxit = niter, initialWeightAt0 = TRUE,
                                        plotW = FALSE)
  dge$weights <- zeroWeights
  # internal function defined below (weighted version of edgeR's estimateDisp)
  dge <- estimateDispWeighted(dge, model_matrix, weights = dge$weights)
  fit <- edgeR::glmFit(dge, design = model_matrix)
  fit$df.residual <- rowSums(fit$weights) - ncol(model_matrix)
  # internal function defined below
  lrt <- glmLRTOld(fit, test = "F")
  ## independent filtering from DESeq2
  pval <- lrt$table$PValue
  baseMean <- unname(rowMeans(sweep(dge$counts, 2, dge$samples$norm.factors, FUN = "*")))
  # internal function defined below
  hlp <- pvalueAdjustment_kvdb(baseMean = baseMean, pValue = pval, alpha = alpha)
  padj <- hlp$padj
  tt <- edgeR::topTags(lrt, n = Inf)
  ## return the full results table together with the independently filtered
  ## adjusted p-values
  list(tt = tt, padj = padj)
}
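## Usage sketch (assuming the whole script has been sourced so that the
## helper functions below are defined):
# res <- run_zingeR(L, alpha = 0.05)
# head(res$tt$table) # edgeR-style results table with the F-test p-values
# sum(res$padj < 0.05, na.rm = TRUE) # rejections after independent filtering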
### estimateDispWeighted function and related functions
.comboGroups <- function(truths)
# Function that returns a list of vectors of indices,
# where each vector refers to the rows with the same
# combination of TRUE/FALSE values in 'truths'.
#
# written by Aaron Lun
# Created 24 October 2014
{
# Integer packing will only work for 31 libraries at a time.
assembly <- list()
collected <- 0L
step <- 31L
bits <- as.integer(2^(1:step-1L))
while (collected < ncol(truths)) {
upper <- pmin(ncol(truths) - collected, step)
keys <- t(truths[,collected+1:upper,drop=FALSE]) * bits[1:upper]
assembly[[length(assembly)+1L]] <- as.integer(colSums(keys))
collected <- collected + step
}
# Figuring out the unique components.
o <- do.call(order, assembly)
nr <- nrow(truths)
is.different <- logical(nr)
for (i in 1:length(assembly)) {
is.different <- is.different | c(TRUE, diff(assembly[[i]][o])!=0L)
}
first.of.each <- which(is.different)
last.of.each <- c(first.of.each[-1]-1L, nr)
# Returning the groups.
output <- list()
for (u in 1:length(first.of.each)) {
output[[u]] <- o[first.of.each[u]:last.of.each[u]]
}
return(output)
}
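## Quick illustration: rows 1 and 3 share the same TRUE/FALSE pattern, so
## they end up in the same group.
# .comboGroups(rbind(c(TRUE, FALSE), c(FALSE, FALSE), c(TRUE, FALSE)))
# ## list(2, c(1, 3)): row 2 alone, rows 1 and 3 together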
.residDF <- function(zero, design)
# Effective residual degrees of freedom after adjusting for exact zeros
# Gordon Smyth and Aaron Lun
# Created 6 Jan 2014. Last modified 2 Sep 2014
{
nlib <- ncol(zero)
ncoef <- ncol(design)
nzero <- as.integer(rowSums(zero))
# Default is no zero
DF <- rep(nlib-ncoef,length(nzero))
# All zero case
DF[nzero==nlib] <- 0L
# Anything in between?
somezero <- nzero>0L & nzero<nlib
if(any(somezero)) {
zero2 <- zero[somezero,,drop=FALSE]
groupings <- .comboGroups(zero2)
# Identifying the true residual d.f. for each of these rows.
DF2 <- nlib-nzero[somezero]
for (u in 1:length(groupings)) {
i <- groupings[[u]]
zeroi <- zero2[i[1],]
DF2[i] <- DF2[i]-qr(design[!zeroi,,drop=FALSE])$rank
}
DF2 <- pmax(DF2, 0L)
DF[somezero] <- DF2
}
DF
}
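## Small worked example with four libraries and a two-column design
## (intercept + group): no zeros gives df = 4 - 2 = 2; all zeros gives 0;
## a gene observed only in group 1 gives df = 2 nonzero libraries minus
## the rank-1 reduced design = 1.
# design_ex <- cbind(1, c(0, 0, 1, 1))
# zero_ex <- rbind(rep(FALSE, 4), rep(TRUE, 4), c(FALSE, FALSE, TRUE, TRUE))
# .residDF(zero_ex, design_ex) ## c(2, 0, 1)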
estimateDispWeighted <- function(y, design = NULL, prior.df = NULL,
                                 trend.method = "locfit", tagwise = TRUE,
                                 span = NULL, min.row.sum = 5, grid.length = 21,
                                 grid.range = c(-10, 10), robust = FALSE,
                                 winsor.tail.p = c(0.05, 0.1), tol = 1e-06,
                                 weights = NULL)
{
#adjusted by Koen VdB on 04 March 2016
if (!is(y, "DGEList"))
stop("y must be a DGEList")
trend.method <- match.arg(trend.method, c("none", "loess",
"locfit", "movingave"))
ntags <- nrow(y$counts)
nlibs <- ncol(y$counts)
offset <- getOffset(y)
AveLogCPM <- aveLogCPM(y)
offset <- edgeR::expandAsMatrix(offset, dim(y))
sel <- rowSums(y$counts) >= min.row.sum
spline.pts <- seq(from = grid.range[1], to = grid.range[2],
length = grid.length)
spline.disp <- 0.1 * 2^spline.pts
grid.vals <- spline.disp/(1 + spline.disp)
l0 <- matrix(0, sum(sel), grid.length)
if (is.null(design)) {
cat("Design matrix not provided. Switch to the classic mode.\n")
group <- y$samples$group <- as.factor(y$samples$group)
if (length(levels(group)) == 1)
design <- matrix(1, nlibs, 1)
else design <- model.matrix(~group)
if (all(tabulate(group) <= 1)) {
warning("There is no replication, setting dispersion to NA.")
y$common.dispersion <- NA
return(y)
}
pseudo.obj <- y[sel, ]
q2q.out <- equalizeLibSizes(y[sel, ], dispersion = 0.01)
pseudo.obj$counts <- q2q.out$pseudo
ysplit <- splitIntoGroups(pseudo.obj)
delta <- optimize(commonCondLogLikDerDelta, interval = c(1e-04,
100/(100 + 1)), tol = tol, maximum = TRUE, y = ysplit,
der = 0)
delta <- delta$maximum
disp <- delta/(1 - delta)
q2q.out <- equalizeLibSizes(y[sel, ], dispersion = disp)
pseudo.obj$counts <- q2q.out$pseudo
ysplit <- splitIntoGroups(pseudo.obj)
for (j in 1:grid.length) for (i in 1:length(ysplit))
  l0[, j] <- condLogLikDerDelta(ysplit[[i]], grid.vals[j], der = 0) + l0[, j]
}
else {
design <- as.matrix(design)
if (ncol(design) >= ncol(y$counts)) {
warning("No residual df: setting dispersion to NA")
y$common.dispersion <- NA
return(y)
}
glmfit <- glmFit(y$counts[sel, ], design, offset = offset[sel, ],
                 dispersion = 0.05, prior.count = 0, weights = weights[sel, ]) ###www
zerofit <- (glmfit$fitted.values < 1e-04) & (glmfit$counts < 1e-04)
by.group <- .comboGroups(zerofit)
for (subg in by.group) {
cur.nzero <- !zerofit[subg[1], ]
if (!any(cur.nzero)) {
next
}
if (all(cur.nzero)) {
redesign <- design
}
else {
redesign <- design[cur.nzero, , drop = FALSE]
QR <- qr(redesign)
redesign <- redesign[, QR$pivot[1:QR$rank], drop = FALSE]
if (nrow(redesign) == ncol(redesign)) {
next
}
}
last.beta <- NULL
for (i in 1:grid.length) {
out <- adjustedProfileLik(spline.disp[i],
                          y = y$counts[sel, ][subg, cur.nzero, drop = FALSE],
                          design = redesign,
                          offset = offset[sel, ][subg, cur.nzero, drop = FALSE],
                          start = last.beta, get.coef = TRUE,
                          weights = weights[sel, ][subg, cur.nzero, drop = FALSE]) ###www
l0[subg, i] <- out$apl
last.beta <- out$beta
}
}
}
out.1 <- WLEB(theta = spline.pts, loglik = l0, covariate = AveLogCPM[sel],
trend.method = trend.method, span = span, individual = FALSE,
m0.out = TRUE)
y$common.dispersion <- 0.1 * 2^out.1$overall
disp.trend <- 0.1 * 2^out.1$trend
y$trended.dispersion <- rep(disp.trend[which.min(AveLogCPM[sel])],
ntags)
y$trended.dispersion[sel] <- disp.trend
y$trend.method <- trend.method
y$AveLogCPM <- AveLogCPM
y$span <- out.1$span
if (!tagwise)
return(y)
if (is.null(prior.df)) {
glmfit <- glmFit(y$counts[sel, ], design, offset = offset[sel, ],
                 dispersion = disp.trend, prior.count = 0, weights = weights[sel, ]) ###www
df.residual <- glmfit$df.residual
zerofit <- (glmfit$fitted.values < 1e-04) & (glmfit$counts < 1e-04)
df.residual <- .residDF(zerofit, design)
s2 <- glmfit$deviance/df.residual
s2[df.residual == 0] <- 0
s2 <- pmax(s2, 0)
s2.fit <- squeezeVar(s2, df = df.residual, covariate = AveLogCPM[sel],
robust = robust, winsor.tail.p = winsor.tail.p)
prior.df <- s2.fit$df.prior
}
ncoefs <- ncol(design)
prior.n <- prior.df/(nlibs - ncoefs)
if (trend.method != "none") {
y$tagwise.dispersion <- y$trended.dispersion
}
else {
y$tagwise.dispersion <- rep(y$common.dispersion, ntags)
}
too.large <- prior.n > 1e+06
if (!all(too.large)) {
temp.n <- prior.n
if (any(too.large)) {
temp.n[too.large] <- 1e+06
}
out.2 <- WLEB(theta = spline.pts, loglik = l0, prior.n = temp.n,
covariate = AveLogCPM[sel], trend.method = trend.method,
span = span, overall = FALSE, trend = FALSE, m0 = out.1$shared.loglik)
if (!robust) {
y$tagwise.dispersion[sel] <- 0.1 * 2^out.2$individual
}
else {
y$tagwise.dispersion[sel][!too.large] <- 0.1 * 2^out.2$individual[!too.large]
}
}
if (!robust) {
y$prior.df <- prior.df
y$prior.n <- prior.n
}
else {
y$prior.df <- y$prior.n <- rep(Inf, ntags)
y$prior.df[sel] <- prior.df
y$prior.n[sel] <- prior.n
}
y
}
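## Sketch of running the weighted dispersion estimation by itself (assumes a
## DGEList `dge` with zingeR weights already stored in dge$weights, as set up
## in run_zingeR above):
# dge <- estimateDispWeighted(dge, model_matrix, weights = dge$weights)
# summary(dge$tagwise.dispersion)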
### old glmLRT function for F-test
glmLRTOld <- function(glmfit,coef=ncol(glmfit$design),contrast=NULL,test="chisq")
# Tagwise likelihood ratio tests for DGEGLM
# Gordon Smyth, Davis McCarthy and Yunshun Chen.
# Created 1 July 2010. Last modified 22 Nov 2013.
{
# Check glmfit
if(!is(glmfit,"DGEGLM")) {
if(is(glmfit,"DGEList") && is(coef,"DGEGLM")) {
stop("First argument is no longer required. Rerun with just the glmfit and coef/contrast arguments.")
}
stop("glmfit must be an DGEGLM object (usually produced by glmFit).")
}
if(is.null(glmfit$AveLogCPM)) glmfit$AveLogCPM <- aveLogCPM(glmfit)
nlibs <- ncol(glmfit)
# Check test
test <- match.arg(test,c("F","f","chisq"))
if(test=="f") test <- "F"
# Check design matrix
design <- as.matrix(glmfit$design)
nbeta <- ncol(design)
if(nbeta < 2) stop("Need at least two columns for design, usually the first is the intercept column")
coef.names <- colnames(design)
# Evaluate logFC for coef to be tested
# Note that contrast takes precedence over coef: if contrast is given
# then reform design matrix so that contrast of interest is last column.
if(is.null(contrast)) {
if(length(coef) > 1) coef <- unique(coef)
if(is.character(coef)) {
check.coef <- coef %in% colnames(design)
if(any(!check.coef)) stop("One or more named coef arguments do not match a column of the design matrix.")
coef.name <- coef
coef <- match(coef, colnames(design))
}
else
coef.name <- coef.names[coef]
logFC <- glmfit$coefficients[,coef,drop=FALSE]/log(2)
} else {
contrast <- as.matrix(contrast)
qrc <- qr(contrast)
ncontrasts <- qrc$rank
if(ncontrasts==0) stop("contrasts are all zero")
coef <- 1:ncontrasts
if(ncontrasts < ncol(contrast)) contrast <- contrast[,qrc$pivot[coef]]
logFC <- drop((glmfit$coefficients %*% contrast)/log(2))
if(ncontrasts>1) {
coef.name <- paste("LR test of",ncontrasts,"contrasts")
} else {
contrast <- drop(contrast)
i <- contrast!=0
coef.name <- paste(paste(contrast[i],coef.names[i],sep="*"),collapse=" ")
}
Dvec <- rep.int(1,nlibs)
Dvec[coef] <- diag(qrc$qr)[coef]
Q <- qr.Q(qrc,complete=TRUE,Dvec=Dvec)
design <- design %*% Q
}
if(length(coef)==1) logFC <- as.vector(logFC)
# Null design matrix
design0 <- design[,-coef,drop=FALSE]
# Null fit
fit.null <- glmFit(glmfit$counts,design=design0,offset=glmfit$offset,weights=glmfit$weights,dispersion=glmfit$dispersion,prior.count=0)
# Likelihood ratio statistic
LR <- fit.null$deviance - glmfit$deviance
### ADDED
fit.null$df.residual <- rowSums(fit.null$weights)-ncol(design0)
## END ADDED
df.test <- fit.null$df.residual - glmfit$df.residual ## okay
# Chisquare or F-test
LRT.pvalue <- switch(test,
"F" = {
phi <- quantile(glmfit$dispersion,p=0.5)
mu <- quantile(glmfit$fitted.values,p=0.5)
gamma.prop <- (phi*mu/(1 + phi*mu))^2
prior.df <- glmfit$prior.df
if(is.null(prior.df)) prior.df <- 20
glmfit$df.total <- glmfit$df.residual + prior.df/gamma.prop
pf(LR/df.test, df1=df.test, df2=glmfit$df.total, lower.tail = FALSE, log.p = FALSE)
},
"chisq" = pchisq(LR, df=df.test, lower.tail = FALSE, log.p = FALSE)
)
rn <- rownames(glmfit)
if(is.null(rn))
rn <- 1:nrow(glmfit)
else
rn <- make.unique(rn)
tab <- data.frame(
logFC=logFC,
logCPM=glmfit$AveLogCPM,
LR=LR,
PValue=LRT.pvalue,
row.names=rn
)
glmfit$counts <- NULL
glmfit$table <- tab
glmfit$comparison <- coef.name
glmfit$df.test <- df.test
new("DGELRT",unclass(glmfit))
}
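## Sketch of the downstream test (mirrors run_zingeR above): fit the weighted
## GLM, correct the residual df for the weights, then apply the F-test.
# fit <- edgeR::glmFit(dge, design = model_matrix)
# fit$df.residual <- rowSums(fit$weights) - ncol(model_matrix)
# lrt <- glmLRTOld(fit, coef = ncol(model_matrix), test = "F")
# head(lrt$table)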
### independent filtering from DESeq2, slightly adapted
pvalueAdjustment_kvdb <- function(baseMean, filter, pValue,
theta, alpha=0.05, pAdjustMethod="BH") {
# perform independent filtering
if (missing(filter)) {
filter <- baseMean
}
if (missing(theta)) {
lowerQuantile <- mean(filter == 0)
if (lowerQuantile < .95) upperQuantile <- .95 else upperQuantile <- 1
theta <- seq(lowerQuantile, upperQuantile, length=50)
}
# do filtering using genefilter
stopifnot(length(theta) > 1)
filtPadj <- filtered_p(filter=filter, test=pValue,
theta=theta, method=pAdjustMethod)
numRej <- colSums(filtPadj < alpha, na.rm = TRUE)
# prevent over-aggressive filtering when all genes are null,
# by requiring that the max number of rejections is above a fitted curve.
# If the max number of rejections is not greater than 10, don't
# perform independent filtering at all.
lo.fit <- lowess(numRej ~ theta, f=1/5)
if (max(numRej) <= 10) {
j <- 1
} else {
residual <- if (all(numRej==0)) {
0
} else {
numRej[numRej > 0] - lo.fit$y[numRej > 0]
}
thresh <- max(lo.fit$y) - sqrt(mean(residual^2))
j <- if (any(numRej > thresh)) {
which(numRej > thresh)[1]
} else {
1
}
}
padj <- filtPadj[, j, drop=TRUE]
cutoffs <- quantile(filter, theta)
filterThreshold <- cutoffs[j]
filterNumRej <- data.frame(theta=theta, numRej=numRej)
filterTheta <- theta[j]
return(list(padj=padj, filterThreshold=filterThreshold, filterTheta=filterTheta, filterNumRej = filterNumRej, lo.fit=lo.fit, alpha=alpha))
}
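## Sketch of the filtering step in isolation (simulated inputs, purely
## illustrative; run_zingeR above feeds it the weighted F-test p-values):
# set.seed(1)
# adj <- pvalueAdjustment_kvdb(baseMean = rexp(1000, rate = 0.1),
#                              pValue = runif(1000))
# adj$filterThreshold # baseMean cutoff chosen along the filtering curve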