# EVALUATE DISTRIBUTIONAL FITS --------------------------------------------
#' @name evaluateDist
#' @aliases evaluateDist
#' @title Model Diagnostics for RNAseq Data
#' @description With this function, the user can determine goodness of fit for each gene.
#' @usage evaluateDist(countData, batchData = NULL,
#' spikeData = NULL, spikeInfo = NULL,
#' Lengths = NULL, MeanFragLengths = NULL,
#' RNAseq = "singlecell", Protocol,
#' Normalisation,
#' GeneFilter = 0.25, SampleFilter = 3,
#' FracGenes = 1,
#' verbose = TRUE)
#' @param countData is a count matrix (row=gene, column=sample).
#' Please provide the measurements of one group only, e.g. the control group.
#' @param batchData is a \code{data.frame} for batch annotation.
#' Rows correspond to samples. The first column should contain the batches, e.g. 'a', 'b', 'c', etc..
#' This is only used for the normalisation.
#' @param spikeData is a count \code{matrix}.
#' Rows correspond to spike-ins, columns to samples.
#' The order of columns should be the same as in the \code{countData}.
#' This is only needed for spike-in aware normalisation methods ('MR', 'Linnorm', 'scran', 'SCnorm', 'bayNorm', 'Census'),
#' see details section of \code{\link{estimateParam}}.
#' @param spikeInfo is a molecule count \code{matrix} of spike-ins.
#' Rows correspond to spike-ins. The order of rows should be the same as in the \code{spikeData}.
#' The column names should be 'SpikeID' and 'SpikeInput' for molecule counts of spike-ins.
#' This is only needed for spike-in aware normalisation methods ('MR', 'Linnorm', 'scran', 'SCnorm', 'bayNorm', 'Census'),
#' see details section of \code{\link{estimateParam}}.
#' @param Lengths is a numeric vector of transcript lengths with the same length and order as the rows in countData.
#' This variable is only needed for internal gene length corrections (TPM), see details section of \code{\link{estimateParam}}.
#' @param MeanFragLengths is a numeric vector of mean fragment lengths with the same length as columns in countData.
#' This variable is only needed for internal gene length corrections (TPM), see details section of \code{\link{estimateParam}}.
#' @param RNAseq is a character value: "bulk" or "singlecell". We recommend to use this evaluation for single cells only.
#' @param Protocol is a character value defining the type of counts given in \code{countData}.
#' Options are "UMI" (e.g. 10X Genomics, CEL-seq2) or "Read" (e.g. Smart-seq2).
#' @param Normalisation is a character value: 'TMM', 'MR', 'PosCounts', 'UQ', 'scran', 'Linnorm',
#' 'SCnorm', 'Census', 'depth', 'none'.
#' For more information, please consult the details section of \code{\link{estimateParam}}.
#' @param GeneFilter is a numeric vector indicating the minimal proportion of nonzero expression values
#' for a gene across all samples to be considered expressed and used for estimating normalisation size factors.
#' The default is \code{0.25}, i.e. at least 25\% of the expression values per gene need to be nonzero.
#' @param SampleFilter is a numeric vector indicating the minimal number of MADs (median absolute deviation)
#' away from the median number of features detected as well as sequencing depth across all samples
#' so that outlying samples are removed prior to normalisation.
#' The default is \code{3}, i.e. at least 3 MADs away from the median.
#' Choose higher values if you want to filter out fewer samples.
#' This parameter is particularly important for single cells to ensure reliable parameter estimation.
#' For more information, please consult \code{\link[scater]{isOutlier}}.
#' @param FracGenes The fraction of genes to calculate goodness of fit statistics, default is 1, i.e. for all genes.
#' @param verbose Logical value to indicate whether to print function information.
#' Default is \code{TRUE}.
#' @return List object with the results of goodness of fit and estimated parameters:
#' \item{edgeR}{Goodness-of-fit statistic, degrees of freedom and associated p-value using the deviance and residual degrees of freedom from \code{\link[edgeR]{glmFit}}. Furthermore, the AIC of the edgeR model fit using the residuals of \code{\link[edgeR]{zscoreNBinom}}.}
#' \item{GOF}{The fitting results per distribution, including loglikelihood, goodness-of-fit statistics, AIC and predicted number of zeroes. The following distributions were considered: Poisson, negative binomial, zero-inflated poisson and negative binomial following the 'standard' (i.e. \code{\link[stats]{glm}}, \code{\link[MASS]{glm.nb}} and \code{\link[pscl]{zeroinfl}} implementation) and fitdist approach (see \code{\link[fitdistrplus]{fitdist}}) and Beta-Poisson following Marioni or Hemberg parameterisation. Furthermore, model fit comparison by LRT for nested and Vuong Test for non-nested models.}
#' \item{Estimates}{The estimated parameters of distribution fitting.}
#' \item{ObservedZeros}{The number of zeroes and dropout rate per gene.}
#' @examples
#' \dontrun{
#' ## using example data set, but run it for fraction of genes
#' data("CELseq2_Gene_UMI_Counts")
#' evalDistRes <- evaluateDist(countData = CELseq2_Gene_UMI_Counts, batchData = NULL,
#' spikeData = NULL, spikeInfo = NULL,
#' Lengths = NULL, MeanFragLengths = NULL,
#' RNAseq = "singlecell", Protocol = "UMI",
#' Normalisation = "scran",
#' GeneFilter = 0.1, SampleFilter = 3,
#' FracGenes = 0.1,
#' verbose = TRUE)
#' plotEvalDist(evalDistRes)
#' }
#' @author Beate Vieth
#' @rdname evaluateDist
#' @importFrom edgeR DGEList cpm.DGEList estimateDisp glmFit zscoreNBinom
#' @importFrom stats model.matrix glm logLik AIC dnbinom dpois fitted na.omit family
#' @importFrom MASS glm.nb
#' @importFrom pscl zeroinfl vuong
#' @importFrom fitdistrplus fitdist
#' @importFrom gamlss.dist ZIP ZINBI
#' @importFrom nonnest2 vuongtest
#' @export
evaluateDist <- function(countData, batchData = NULL,
                         spikeData = NULL, spikeInfo = NULL,
                         Lengths = NULL, MeanFragLengths = NULL,
                         RNAseq = "singlecell", Protocol,
                         Normalisation,
                         GeneFilter = 0.25, SampleFilter = 3,
                         FracGenes = 1,
                         verbose = TRUE) {
  # Touch the gamlss.dist constructors once so that the "ZIP" and "ZINBI"
  # distribution families (d/p/q/r functions) are loaded and findable by
  # fitdistrplus::fitdist() further below.
  invisible(gamlss.dist::ZINBI())
  invisible(gamlss.dist::ZIP())
  # Disable automatic string-to-factor conversion for data.frame construction.
  # Restore the user's options on exit instead of leaving global state changed
  # (the original called options() without ever restoring it).
  old.opts <- options(stringsAsFactors = FALSE)
  on.exit(options(old.opts), add = TRUE)
  # check the matching of names and objects
  checkup = .run.checkup(countData = countData,
                         readData = NULL,
                         batchData = batchData,
                         spikeData = spikeData,
                         spikeInfo = spikeInfo,
                         Lengths = Lengths,
                         MeanFragLengths = MeanFragLengths,
                         RNAseq = RNAseq,
                         # BUGFIX: was hard-coded to TRUE; honour the user's
                         # verbose argument.
                         verbose = verbose)
  # extract checked objects
  countData <- checkup$countData
  readData <- checkup$readData
  batchData <- checkup$batchData
  spikeData <- checkup$spikeData
  spikeInfo <- checkup$spikeInfo
  Lengths <- checkup$Lengths
  MeanFragLengths <- checkup$MeanFragLengths
  Label <- ifelse(is.null(batchData), "none", "known")
  # run estimation (NB parameters, size factors, dropout annotation);
  # kept quiet here, the summary message below reports the outcome.
  estParam = .run.estParam(countData = countData,
                           readData = NULL,
                           batchData = batchData,
                           spikeData = spikeData,
                           spikeInfo = spikeInfo,
                           Lengths = Lengths,
                           MeanFragLengths = MeanFragLengths,
                           Distribution = "NB",
                           RNAseq = RNAseq,
                           Protocol = Protocol,
                           Normalisation = Normalisation,
                           Label = Label,
                           GeneFilter = GeneFilter,
                           SampleFilter = SampleFilter,
                           sigma = 1.96,
                           NCores = NULL,
                           verbose = FALSE)
  # kick out dropouts:
  # samples flagged as outliers in none of the first three criteria,
  # genes with at least one count, and genes passing the expression filter
  detect.samples <- rownames(estParam$DropOuts$Sample)[rowSums(estParam$DropOuts$Sample[,c(1:3)], na.rm = TRUE) == 0L]
  detect.genes <- rownames(countData[rowSums(countData)>1,])
  filt.genes <- names(estParam$DropOuts$Gene)[!estParam$DropOuts$Gene]
  # inform user
  if(verbose){
    sampletype <- ifelse(RNAseq == "singlecell", "single cells", "bulk samples")
    message(paste0("The count matrix contains the expression of ", length(detect.genes), " genes with at least one count profiled across ", ncol(countData), " ", sampletype, ". ", length(filt.genes), " genes and ", length(detect.samples), " ", sampletype, " passed the filtering and were used for estimating normalisation factors with ", Normalisation, "."))
  }
  # fill in size factors: fall back to library-size factors for samples that
  # did not get an estimated factor, then restrict to detected genes
  sf <- structure(colSums(countData) / median(colSums(countData)), names = colnames(countData))
  sf[names(estParam$sf)] <- estParam$sf
  countData <- countData[rownames(countData) %in% detect.genes, ]
  # set up dge edgeR: convert size factors into edgeR norm.factors
  # (geometric-mean centred ratio of size factor to sequencing depth)
  seq.depth <- colSums(countData)
  nsf <- log(sf/seq.depth)
  nsf <- exp(nsf - mean(nsf, na.rm=TRUE))
  # construct input object
  dge <- edgeR::DGEList(counts = countData,
                        lib.size = colSums(countData),
                        norm.factors = nsf,
                        remove.zeros = FALSE)
  # calculate normalized counts (if norm lib is true does not sum up to 1 million)
  out.cpm <- edgeR::cpm.DGEList(dge, normalized.lib.sizes = TRUE, log = FALSE)
  # estimate dispersions
  invisible(utils::capture.output(
    dge <- suppressMessages(edgeR::estimateDisp(y=dge))
  ))
  # design.mat: intercept-only model (one group)
  design.mat <- matrix(1,ncol(dge$counts),1)
  rownames(design.mat) <- colnames(dge$counts)
  colnames(design.mat) <- "Intercept"
  # apply edgeR glm fit
  fit.edgeR <- edgeR::glmFit(dge, design = design.mat)
  # calculate randomized quantile residuals of the edgeR NB fit
  y <- fit.edgeR$counts
  mu <- fit.edgeR$fitted.values
  phi <- fit.edgeR$dispersion
  coefs <- fit.edgeR$coefficients
  # v <- mu*(1+phi*mu)
  # d <- edgeR::nbinomUnitDeviance(y,mu,phi)
  # resid.pearson <- (y-mu) / sqrt(v)
  # resid.deviance <- sign(y-mu) * sqrt(d)
  resid.quantile <- edgeR::zscoreNBinom(y,mu=mu,size=1/phi)
  # calculate AIC per gene from the quantile residuals
  edgeR.aic <- .myAIC(resids = resid.quantile, dat.n = ncol(y), npar = ncol(coefs), k=2)
  # calculate gof from deviance and residual degrees of freedom
  edgeR.gof.stats <- .myGOF(deviances = fit.edgeR$deviance, df.residuals = fit.edgeR$df.residual)
  # make output table: one row per gene, one column per fit statistic
  genenames <- rownames(dge$counts)
  headers.1 <- c("pois_standard", "nbinom_standard", "zifpois_standard", "zifnbinom_standard")
  headers.2 <- c("loglikelihood", "loglikelihooddf", "gofstat", "gofdf", "gofpval", "predzero", "aic")
  headers_tmp1 <- paste(rep(headers.1, each=7), rep(headers.2, times=4), sep="_")
  headers.3 <- c("pois_fitdistr", "nbinom_fitdistr", "zifpois_fitdistr", "zifnbinom_fitdistr")
  headers.4 <- c("gofstat", "gofdf", "gofpval")
  headers_tmp2 <- paste(rep(headers.3, each=3), rep(headers.4, times=4), sep="_")
  headers.5 <- c("PoiBeta_Hemberg", "PoiBeta_Marioni")
  headers.6 <- c("loglikelihood", "loglikelihooddf", "predzero", 'aic')
  headers_tmp3 <- paste(rep(headers.5, each=4), rep(headers.6, times=2), sep="_")
  headers_tmp4 <- c('LRT_standard_NBPoisson', 'Vuong_standard_ZPoisson', 'Vuong_standard_ZNB', 'Vuong_standard_ZNBZPoisson')
  headersgof <- c(headers_tmp1, headers_tmp2, headers_tmp3, headers_tmp4)
  gof.res <- data.frame(matrix(NA, length(genenames), length(headersgof),
                               dimnames=list(c(genenames), c(headersgof))),
                        stringsAsFactors = FALSE)
  headersests <- c("pois_lambda", "nbinom_size", "nbinom_mu", "zifpois_mu", "zifpois_sigma", "zifnbinom_mu", "zifnbinom_sigma", "zifnbinom_nu", 'alpha_bp_hemberg', 'alpha_bp_marioni', 'beta_bp_hemberg', 'beta_bp_marioni', 'gamma_bp_hemberg', 'gamma_bp_marioni')
  estimate.res <- data.frame(matrix(NA, length(genenames), length(headersests),
                                    dimnames=list(c(genenames), c(headersests))),
                             stringsAsFactors = FALSE)
  # run model fits for the genes
  if(FracGenes==1) {
    geneid <- 1:nrow(dge$counts)
  }
  if(!FracGenes==1) { # take fraction of genes per mean bins
    meancnts = rowMeans(out.cpm)
    tmp.ecdf.mean = stats::ecdf(meancnts)
    tmp.quantile.mean = stats::quantile(tmp.ecdf.mean, probs=c(0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 0.9))
    qbins.mean = unique(c(min(meancnts, na.rm = TRUE),unname(tmp.quantile.mean),Inf))
    bin.mean = cut(meancnts, qbins.mean)
    geneid = unname(unlist(sapply(levels(bin.mean), function(x) {
      tmp.group <- genenames[(bin.mean %in% x)]
      tmp.id <- sample(tmp.group, size= round(FracGenes*length(tmp.group)), replace = FALSE)
    })))
  }
  for (i in geneid) {
    if(verbose) {
      message(paste0("Estimation for gene ", which(geneid == i), " out of ", length(geneid),
                     ". A total of ", nrow(dge$counts),
                     " genes are available for estimation."))
    }
    # design.mat
    design.mat <- matrix(1,ncol(dge$counts),1)
    # zero measurement indicator
    counts0 <- dge$counts[i,] == 0
    # number of samples
    nsamples=length(dge$counts[i,])
    # number of nonzero samples
    nn0 = sum(!counts0)
    # dropout
    p0 = mean(dge$counts[i,]==0)
    # mean and dispersion of nonzero portion of normalised counts;
    # used as starting values for the zero-inflated fitdist fits below
    estmu = sum((!counts0) * dge$counts[i,])/nn0
    s2 = sum((!counts0) * (dge$counts[i,] - estmu)^2)/(nn0 - 1)
    disp = 1 / (estmu^2/(s2 - estmu + 1e-04))
    # fit glm poisson
    tmp.fit.pois <- try(stats::glm(dge$counts[i,] ~ 1, family=stats::poisson(link = "log"),
                                   offset = edgeR::getOffset(y=dge)), silent = TRUE)
    # fit poisson with fitdistrplus
    tmp.fit.pois.wo <- try(fitdistrplus::fitdist(data = dge$counts[i,], 'pois'), silent = TRUE)
    # fit glm negative binomial
    tmp.fit.nbinom <- try(MASS::glm.nb(dge$counts[i,] ~ 1 + offset(edgeR::getOffset(y=dge))), silent = TRUE)
    # fit negative binomial with fitdistrplus
    tmp.fit.nbinom.wo <- try(fitdistrplus::fitdist(data = dge$counts[i,], 'nbinom'), silent = TRUE)
    # fit glm zero inflated poisson
    tmp.fit.zifpois <- try(pscl::zeroinfl(dge$counts[i,] ~ 1, dist = 'poisson', offset = edgeR::getOffset(y=dge), EM = FALSE), silent = TRUE)
    # fit zero inflated poisson with fitdistrplus
    tmp.fit.zifpois.wo <- try(fitdistrplus::fitdist(dge$counts[i,], "ZIP",
                                                    start = list(mu = estmu, sigma=p0),
                                                    discrete = TRUE, lower = c(0, 0), upper = c(Inf, 1)),
                              silent = TRUE)
    # fit glm zero inflated negative binomial
    tmp.fit.zifnbinom <- try(pscl::zeroinfl(dge$counts[i,] ~ 1, dist = 'negbin', offset = edgeR::getOffset(y=dge), EM = FALSE), silent = TRUE)
    # fit zero inflated negative binomial with fitdistrplus
    tmp.fit.zifnbinom.wo <- try(fitdistrplus::fitdist(dge$counts[i,], "ZINBI", start = list(mu = estmu, sigma=disp, nu=p0), discrete = TRUE, lower = c(0, 0, 0), upper = c(Inf,Inf, 1)), silent = TRUE)
    # fit poisson-beta distribution
    tmp.fit.pb <- try(.PoissonBetaFit(x.raw = dge$counts[i,], x.norm = out.cpm[i,]), silent = TRUE)
    # fill up results table
    # poisson
    if(inherits(tmp.fit.pois, 'try-error')) {
      gof.res[i, grep("^pois_standard", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      pois.gof.stats <- .myGOF(deviances = tmp.fit.pois$deviance, df.residuals = tmp.fit.pois$df.residual)
      pois.predzero <- round(sum(stats::dpois(0, stats::fitted(tmp.fit.pois))))
      pois.aic <- stats::AIC(tmp.fit.pois)
      pois.loglik <- as.numeric(logLik(tmp.fit.pois))
      pois.loglikdf <- as.numeric(attr(logLik(tmp.fit.pois), "df"))
      gof.res[i, grep("^pois_standard", colnames(gof.res), perl=TRUE)] <- cbind(pois.loglik, pois.loglikdf, pois.gof.stats, pois.predzero, pois.aic)
    }
    # poisson with fitdistr
    if(inherits(tmp.fit.pois.wo, 'try-error')) {
      gof.res[i, grep("^pois_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      pois.gof.fitdistr <- try(.fitdistrplusGOF(fitdistobj=tmp.fit.pois.wo), silent=TRUE)
      if(inherits(pois.gof.fitdistr, 'try-error')) {
        gof.res[i, grep("^pois_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
      } else {
        gof.res[i, grep("^pois_fitdistr", colnames(gof.res), perl=TRUE)] <- cbind(pois.gof.fitdistr)
        estimate.res[i, grep("^pois_", colnames(estimate.res), perl=TRUE)] <- tmp.fit.pois.wo$estimate
      }
    }
    # neg binom
    if(inherits(tmp.fit.nbinom, 'try-error')) {
      gof.res[i, grep("^nbinom_standard", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      nbinom.gof.stats <- .myGOF(deviances = tmp.fit.nbinom$deviance, df.residuals = tmp.fit.nbinom$df.residual)
      nbinom.predzero <- round(sum(stats::dnbinom(0, mu = stats::fitted(tmp.fit.nbinom), size = tmp.fit.nbinom$theta)))
      nbinom.aic <- stats::AIC(tmp.fit.nbinom)
      nbinom.loglik <- as.numeric(logLik(tmp.fit.nbinom))
      nbinom.loglikdf <- as.numeric(attr(logLik(tmp.fit.nbinom), "df"))
      gof.res[i, grep("^nbinom_standard", colnames(gof.res), perl=TRUE)] <- cbind(nbinom.loglik, nbinom.loglikdf, nbinom.gof.stats, nbinom.predzero, nbinom.aic)
    }
    # neg binom with fitdistr
    if(inherits(tmp.fit.nbinom.wo, 'try-error')) {
      gof.res[i, grep("^nbinom_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      nbinom.gof.fitdistr <- try(.fitdistrplusGOF(fitdistobj=tmp.fit.nbinom.wo), silent=TRUE)
      if(inherits(nbinom.gof.fitdistr, 'try-error')) {
        gof.res[i, grep("^nbinom_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
      } else {
        gof.res[i, grep("^nbinom_fitdistr", colnames(gof.res), perl=TRUE)] <- cbind(nbinom.gof.fitdistr)
        estimate.res[i, grep("^nbinom_", colnames(estimate.res), perl=TRUE)] <- tmp.fit.nbinom.wo$estimate
      }
    }
    # zero inflated poisson
    if(inherits(tmp.fit.zifpois, 'try-error')) {
      gof.res[i, grep("^zifpois_standard", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      zifpois.predzero <- round(sum(stats::predict(tmp.fit.zifpois, type = "prob")[, 1]))
      zifpois.aic <- stats::AIC(tmp.fit.zifpois)
      zifpois.loglik <- as.numeric(logLik(tmp.fit.zifpois))
      zifpois.loglikdf <- as.numeric(attr(logLik(tmp.fit.zifpois), "df"))
      # BUGFIX: the original had length(dge$counts[i,]-2), i.e. the -2 was
      # inside length() and had no effect, so df equalled the sample size.
      # The ZIP model has 2 parameters, hence residual df = n - 2.
      zifpois.gof.stats <- .myGOF(deviances = -2*logLik(tmp.fit.zifpois), df.residuals = length(dge$counts[i,]) - 2)
      gof.res[i, grep("^zifpois_standard", colnames(gof.res), perl=TRUE)] <- cbind(zifpois.loglik, zifpois.loglikdf, zifpois.gof.stats, zifpois.predzero, zifpois.aic)
    }
    # zero inflated poisson with fitdistr
    if(inherits(tmp.fit.zifpois.wo, 'try-error')) {
      gof.res[i, grep("^zifpois_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      zifpois.gof.fitdistr <- try(.fitdistrplusGOF(fitdistobj=tmp.fit.zifpois.wo), silent=TRUE)
      if(inherits(zifpois.gof.fitdistr, 'try-error')) {
        gof.res[i, grep("^zifpois_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
      } else {
        gof.res[i, grep("^zifpois_fitdistr", colnames(gof.res), perl=TRUE)] <- cbind(zifpois.gof.fitdistr)
        estimate.res[i, grep("^zifpois_", colnames(estimate.res), perl=TRUE)] <- tmp.fit.zifpois.wo$estimate
      }
    }
    # zero inflated neg binom
    if(inherits(tmp.fit.zifnbinom, 'try-error')) {
      gof.res[i, grep("^zifnbinom_standard", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      zifnbinom.predzero <- round(sum(stats::predict(tmp.fit.zifnbinom, type = "prob")[, 1]))
      zifnbinom.aic <- stats::AIC(tmp.fit.zifnbinom)
      zifnbinom.loglik <- as.numeric(logLik(tmp.fit.zifnbinom))
      zifnbinom.loglikdf <- as.numeric(attr(logLik(tmp.fit.zifnbinom), "df"))
      # BUGFIX: the original had length(dge$counts[i,]-1) (subtraction inside
      # length()), so df equalled the sample size. Moved the -1 outside.
      # NOTE(review): the ZINB model has 3 parameters, so n - 3 may be the
      # intended residual df — confirm with the authors.
      zifnbinom.gof.stats <- .myGOF(deviances = -2*logLik(tmp.fit.zifnbinom), df.residuals = length(dge$counts[i,]) - 1)
      gof.res[i, grep("^zifnbinom_standard", colnames(gof.res), perl=TRUE)] <- cbind(zifnbinom.loglik, zifnbinom.loglikdf, zifnbinom.gof.stats, zifnbinom.predzero, zifnbinom.aic)
    }
    # zero inflated neg binom with fitdistr
    if(inherits(tmp.fit.zifnbinom.wo, 'try-error')) {
      gof.res[i, grep("^zifnbinom_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
    } else {
      zifnbinom.gof.fitdistr <- try(.fitdistrplusGOF(fitdistobj=tmp.fit.zifnbinom.wo), silent=TRUE)
      if(inherits(zifnbinom.gof.fitdistr, 'try-error')) {
        gof.res[i, grep("^zifnbinom_fitdistr", colnames(gof.res), perl=TRUE)] <- NA
      } else {
        gof.res[i, grep("^zifnbinom_fitdistr", colnames(gof.res), perl=TRUE)] <- cbind(zifnbinom.gof.fitdistr)
        estimate.res[i, grep("^zifnbinom_", colnames(estimate.res), perl=TRUE)] <- tmp.fit.zifnbinom.wo$estimate
      }
    }
    # LR Test for NB > P ? (nested models, 1 df for the dispersion parameter)
    if(!inherits(tmp.fit.pois, 'try-error') && !inherits(tmp.fit.nbinom, 'try-error')) {
      tmp.lrt.nbp = try(stats::pchisq(as.numeric(2 * (logLik(tmp.fit.nbinom) - logLik(tmp.fit.pois))),
                                      df = 1, lower.tail = FALSE), silent=TRUE)
      if(!inherits(tmp.lrt.nbp, 'try-error')) {
        gof.res[i, grep("LRT_standard_NBPoisson", colnames(gof.res), perl=TRUE)] <- tmp.lrt.nbp
      }
    }
    # Vuong Test for ZINB > NB, ZIP > P, ZNB > ZP ? (non-nested comparisons)
    if(!inherits(tmp.fit.zifpois, 'try-error') && !inherits(tmp.fit.pois, 'try-error')) {
      tmp.vuong.p = try(nonnest2::vuongtest(tmp.fit.zifpois,tmp.fit.pois), silent=TRUE)
      if(!inherits(tmp.vuong.p, 'try-error')) {
        gof.res[i, grep("Vuong_standard_ZPoisson", colnames(gof.res), perl=TRUE)] <- tmp.vuong.p$p_LRT$A
      }
    }
    if(!inherits(tmp.fit.zifnbinom, 'try-error') && !inherits(tmp.fit.nbinom, 'try-error')) {
      tmp.vuong.nb = try(nonnest2::vuongtest(tmp.fit.zifnbinom,tmp.fit.nbinom), silent=TRUE)
      if(!inherits(tmp.vuong.nb, 'try-error')) {
        gof.res[i, grep("Vuong_standard_ZNB", colnames(gof.res), perl=TRUE)] <- tmp.vuong.nb$p_LRT$A
      }
    }
    if(!inherits(tmp.fit.zifnbinom, 'try-error') && !inherits(tmp.fit.zifpois, 'try-error')) {
      tmp.vuong.nbp = try(nonnest2::vuongtest(tmp.fit.zifnbinom,tmp.fit.zifpois), silent=TRUE)
      if(!inherits(tmp.vuong.nbp, 'try-error')) {
        gof.res[i, grep("Vuong_standard_ZNBZPoisson", colnames(gof.res), perl=TRUE)] <- tmp.vuong.nbp$p_LRT$A
      }
    }
    # poisson beta fit (both Hemberg and Marioni parameterisations, 3 params each)
    if(inherits(tmp.fit.pb, 'try-error')) {
      gof.res[i, grep("^PoiBeta", colnames(gof.res))] <- NA
    } else {
      gof.res[i, grep("^PoiBeta", colnames(gof.res))] <- c(tmp.fit.pb$LogLikelihood.hemberg, 3, tmp.fit.pb$PredZero.hemberg, tmp.fit.pb$AIC.hemberg, tmp.fit.pb$LogLikelihood.marioni, 3, tmp.fit.pb$PredZero.marioni, tmp.fit.pb$AIC.marioni)
      estimate.res[i,grep('bp', colnames(estimate.res), perl=TRUE)] <- cbind(tmp.fit.pb$bp.alpha.hemberg, tmp.fit.pb$bp.alpha.marioni, tmp.fit.pb$bp.beta.hemberg, tmp.fit.pb$bp.beta.marioni,tmp.fit.pb$bp.gamma.hemberg, tmp.fit.pb$bp.gamma.marioni)
    }
  }
  # fill in edgeR results
  edgeR.res <- cbind(edgeR.gof.stats, edgeR.aic)
  colnames(edgeR.res) <- c('edgeRglm_gofstat', 'edgeRglm_gofdf', 'edgeRglm_gofpval', 'edgeRglm_aic')
  rownames(edgeR.res) <- rownames(dge$counts)
  # observed zeros
  ObservedZeros <- data.frame(ObsZero=rowSums(dge$counts==0),
                              Dropout=rowMeans(dge$counts==0))
  rownames(ObservedZeros) <- rownames(dge$counts)
  # list result object:
  # goodness of fit statistics per model
  # estimated parameters
  # observed zeros
  return(list(edgeR = edgeR.res,
              GOF = gof.res,
              Estimates = estimate.res,
              ObservedZeros = ObservedZeros))
}
# EVALUATE DIFFERENTIAL EXPRESSION ----------------------------------------
#' @name evaluateDE
#' @aliases evaluateDE
#' @title Compute the confusion matrix-related quantities from simulation results
#' @description This function takes the simulation output from \code{\link{simulateDE}}
#' and computes quantities of the confusion matrix for statistical power evaluation.
#' @usage evaluateDE(simRes,
#' alpha.type=c("adjusted","raw"),
#' MTC=c('BY', 'BH', 'holm', 'hochberg', 'hommel', 'bonferroni', 'Storey', 'IHW'),
#' alpha.nominal=0.1,
#' stratify.by=c("mean", "dispersion", "dropout", "lfc"),
#' strata.probs = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
#' filter.by=c("none", "mean", "dispersion", "dropout"),
#' strata.filtered=1, target.by=c("lfc", "effectsize"), delta=0,
#' Table = TRUE)
#' @param simRes The result from \code{\link{simulateDE}}.
#' @param MTC Multiple testing correction method to use. Available options are
#' 1) see \link[stats]{p.adjust.methods},
#' 2) Storey's qvalue see \link[qvalue]{qvalue} and
#' 3) Independent Hypothesis Weighting considering mean expression as covariate (see \link[IHW]{ihw}).
#' Default is \code{BY}, i.e. Benjamini-Yekutieli FDR correction method.
#' @param alpha.type A string to represent the way to call DE genes.
#' Available options are \code{"adjusted"} i.e. applying multiple testing correction and
#' \code{"raw"} i.e. using p-values. Default is \code{"adjusted"}.
#' @param alpha.nominal The nominal level of significance. Default is 0.1.
#' @param stratify.by A string to represent the way to stratify genes.
#' Available options are \code{"mean"}, \code{"dispersion"}, \code{"dropout"} and \code{"lfc"},
#' for stratifying genes by average expression levels, dispersion, dropout rates or estimated log2 fold changes.
#' @param strata.probs A vector specifying the probability values for sample quantiles of the strata. See \link[qvalue]{qvalue}.
#' @param filter.by A string to represent the way to filter genes.
#' This is used in conjunction with strata.filtered for gene filtering.
#' Available options are \code{"none"}, \code{"mean"}, \code{"dispersion"} and \code{"dropout"}.
#' \code{"none"} stands for no filtering, thus all genes will be considered.
#' \code{"mean"} stands for filtering based on average gene expression levels.
#' \code{"dispersion"} stands for filtering based on gene expression dispersion.
#' \code{"dropout"} stands for filtering based on dropout rates.
#' @param strata.filtered The strata to be filtered out in computing error matrix-related quantities.
#' Genes falling into these strata will be excluded. See "Details" for more description of gene filtering.
#' @param target.by A string to specify the method to define "biologically important" DE genes.
#' Available options are (1) \code{"lfc"}: interesting genes are defined by absolute log2 fold changes.
#' (2) \code{"effectsize"}: interesting genes are defined by
#' absolute log2 fold changes divided by the square root of 1/(mean+dispersion).
#' @param delta A threshold used for defining "biologically important" genes.
#' Genes with absolute log2 fold changes (when target.by is "lfc")
#' or effect sizes (when target.by is "effectsize") greater than this value
#' are deemed DE in error rates calculations. If \code{delta=0} then no threshold is applied. See "Details" for more description.
#' @param Table A logical vector. If the default \code{TRUE}, then a table of marginal error rates is printed additionally (see \code{\link{printEvalDE}}).
#' @return A list with the following entries:
#' \item{TN, TP, FP, FN, TNR, TPR, FPR, FNR, FDR}{3D array representing the number of true negatives, true positives, false positives,
#' false negatives and their proportions/rates as well as false discovery rate
#' for all simulation settings. The dimension of the arrays are nstrata * N * nsims.
#' Here nstrata is number of specified strata.
#' N is number of different sample sizes settings, and nsims is number of simulations.}
#' \item{TN.marginal, TP.marginal, FP.marginal, FN.marginal}{Matrix representing the number of true negatives, true positives, false positives,
#' false negatives for all simulation settings.
#' The dimension of the matrices are N * nsims.
#' Here N is number of different sample sizes settings, and nsims is number of simulations.}
#' \item{TNR.marginal, TPR.marginal, FPR.marginal, FNR.marginal, FDR.marginal}{Matrix representing the marginal rates for all simulation settings.
#' The dimension of the matrices are N * nsims.}
#' \item{stratagenes, stratadiffgenes}{Number of genes per stratum and number of DE genes per stratum.}
#' \item{stratify.by}{The input "stratify.by".}
#' \item{strata}{The input strata.}
#' \item{n1,n2}{Sample sizes per group.
#' This is taken from the simulation options.}
#' \item{target.by}{The input method to define "biologically important" DE genes,
#' either by log fold change or effect size.}
#' \item{delta}{The input delta for biologically important genes.
#' If delta=0, all target.by will be considered.}
#' @details This is the main function to compute various power-related quantities,
#' using stratification and filtering.
#' \describe{
#' \item{Gene stratification}{We recommend to compute and visualize error rates (especially TPR)
#' conditional on expression characteristics like mean, dispersion and/or dropout rate.
#' It is likely that the power to detect DE genes is strongly dependent on
#' mean expression levels even though the magnitude of effect sizes is the same.
#' The stratified results will provide a more comprehensive power assessment and
#' better guide the investigators in experimental designs and analysis strategies.}
#' \item{Gene filtering}{Sometimes it is advisable to filter out some genes
#' (such as the ones with very low mean expression) before DE detection.
#' The filtering option here provides an opportunity to compare the rates before and after filtering.}
#' \item{Define biologically interesting genes}{We provide two options to define biologically interesting genes:
#' by absolute values of log fold changes or effect sizes
#' (absolute values of log fold changes divided by the square root of 1/(mean+dispersions)).
#' Genes with these quantities over a threshold are deemed interesting,
#' and the rate calculations are based on these genes.}
#' }
#' @author Beate Vieth
#' @seealso \code{\link{estimateParam}} for negative binomial parameters,
#' \code{\link{Setup}} for setting up simulation parameters and
#' \code{\link{simulateDE}} for simulating differential expression and
#' \code{\link{plotEvalDE}} for visualisation.
#' @examples
#' \dontrun{
#' # estimate gene parameters
#' data("Bulk_Read_Counts")
#' data("GeneLengths_hg19")
#' estparam_gene <- estimateParam(countData = Bulk_Read_Counts,
#' readData = NULL,
#' batchData = NULL,
#' spikeData = NULL, spikeInfo = NULL,
#' Lengths = GeneLengths_hg19, MeanFragLengths = NULL,
#' RNAseq = 'bulk', Protocol = 'Read',
#' Distribution = 'NB', Normalisation = "MR",
#' GeneFilter = 0.25, SampleFilter = 3,
#' sigma = 1.96, NCores = NULL, verbose = TRUE)
#' # define log fold change
#' p.lfc <- function(x) sample(c(-1,1), size=x,replace=T)*rgamma(x, shape = 2, rate = 2)
#' # set up simulations
#' setupres <- Setup(ngenes = 10000, nsims = 10,
#' p.DE = 0.1, pLFC = p.lfc,
#' n1 = c(3,6,12), n2 = c(3,6,12),
#' Thinning = c(1,0.9,0.8), LibSize = 'given',
#' estParamRes = estparam_gene,
#' estSpikeRes = NULL,
#' DropGenes = FALSE,
#' sim.seed = 4379, verbose = TRUE)
#' # run simulation
#' simres <- simulateDE(SetupRes = setupres,
#' Prefilter = NULL, Imputation = NULL,
#' Normalisation = 'MR', Label = 'none',
#' DEmethod = "limma-trend", DEFilter = FALSE,
#' NCores = NULL, verbose = TRUE)
#' # DE evaluation
#' evalderes <- evaluateDE(simRes = simres, alpha.type="adjusted",
#' MTC='BH', alpha.nominal=0.05,
#' stratify.by = "lfc", filter.by = "mean",
#' strata.filtered = 1,
#' target.by = "lfc", delta = 0.25)
#' plotEvalDE(evalRes = evalderes, rate = "marginal")
#' plotEvalDE(evalRes = evalderes, rate = "conditional")
#' }
#' @rdname evaluateDE
#' @importFrom stats ecdf quantile p.adjust.methods p.adjust
#' @importFrom qvalue qvalue
#' @importFrom IHW ihw adj_pvalues
#' @export
evaluateDE <- function(simRes, alpha.type=c("adjusted","raw"),
MTC=c('BY', 'BH', 'holm', 'hochberg', 'hommel', 'bonferroni', 'Storey', 'IHW'),
alpha.nominal=0.1,
stratify.by=c("mean", "dispersion", "dropout", "lfc"),
strata.probs = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
filter.by=c("none", "mean", "dispersion", "dropout"),
strata.filtered=1,
target.by=c("lfc", "effectsize"), delta=0,
Table = TRUE) {
alpha.type = match.arg(alpha.type)
MTC = match.arg(MTC)
stratify.by = match.arg(stratify.by)
filter.by = match.arg(filter.by)
target.by = match.arg(target.by)
## some general parameters
Nreps1 = simRes$SimSetup$n1
Nreps2 = simRes$SimSetup$n2
ngenes = simRes$DESetup$ngenes
DEids = simRes$DESetup$DEid
lfcs = simRes$DESetup$pLFC
tlfcs = lapply(1:length(lfcs), function(i) {lfcs[[i]]})
nsims = simRes$DESetup$nsims
estmeans = simRes$estParamRes$Parameters[[simRes$DESetup$Draw$MoM]]$means
estdisps = simRes$estParamRes$Parameters[[simRes$DESetup$Draw$MoM]]$dispersion
estdropout = simRes$estParamRes$Parameters[[simRes$DESetup$Draw$MoM]]$gene.dropout
mu = simRes$SimulateRes$mu
disp = simRes$SimulateRes$disp
dropout = simRes$SimulateRes$dropout
elfc = simRes$SimulateRes$elfc
DEmethod = simRes$Pipeline$DEmethod
pvalue = simRes$SimulateRes$pvalue
fdr = simRes$SimulateRes$fdr
## calculate strata
tmp.ecdf.mean = stats::ecdf(log2(estmeans+1))
tmp.quantile.mean = stats::quantile(tmp.ecdf.mean, probs=strata.probs)
strata.mean = unique(c(0,unname(tmp.quantile.mean),Inf))
strata.mean = unique(round(strata.mean, digits=2))
tmp.ecdf.disps = stats::ecdf(log2(estdisps+1))
tmp.quantile.disps = stats::quantile(tmp.ecdf.disps, probs=strata.probs)
strata.disps = unique(c(0,unname(tmp.quantile.disps),Inf))
strata.disps = unique(round(strata.disps, digits=2))
tmp.ecdf.drop = stats::ecdf(estdropout)
tmp.quantile.drop = stats::quantile(tmp.ecdf.drop, probs=strata.probs)
strata.drop = unique(c(0,unname(tmp.quantile.drop),1))
strata.drop = unique(round(strata.drop, digits=2))
tmp.ecdf.lfc = stats::ecdf(unique(unlist(tlfcs)))
tmp.quantile.lfc = stats::quantile(tmp.ecdf.lfc, probs=strata.probs)
strata.lfc = unique(c(-Inf,unname(tmp.quantile.lfc),Inf))
strata.lfc = unique(round(strata.lfc, digits=2))
## initialize results
## determine dimension of results, for filtering
if(stratify.by=='mean') {
nr = length(strata.mean) - 1
}
if(stratify.by=='dispersion') {
nr = length(strata.disps) - 1
}
if(stratify.by=='dropout') {
nr = length(strata.drop) - 1
}
if(stratify.by=='lfc') {
nr = length(strata.lfc) - 1
}
if(filter.by %in% c("mean", "dispersion", "dropout")) {
if(filter.by == stratify.by){
nstrata = nr - strata.filtered
}
if(!filter.by == stratify.by){
nstrata = nr
}
}
if(filter.by == "none") {
nstrata = nr
}
TP = TN = FP = FN = TPR = TNR = FPR = FNR = FDR = xgrl = xgrld = array(NA,dim=c(nstrata,length(Nreps1), nsims))
TP.marginal = TN.marginal = FP.marginal = FN.marginal = TPR.marginal = TNR.marginal = FPR.marginal = FNR.marginal = FDR.marginal = matrix(NA,length(Nreps1), nsims)
## loop over simulation and replicates
for(i in 1:nsims) {
for(j in seq(along=Nreps1)) {
Nrep1 = Nreps1[j]
Nrep2 = Nreps2[j]
## get DE flags.
DEid = DEids[[i]]
lfc = lfcs[[i]]
Zg = Zg2 = rep(0, ngenes)
Zg[DEid] = 1
## find target (interesting) genes
if(delta == 0) {
Zg2 = Zg
}
if(!delta == 0) {
if(target.by == "lfc") {
ix = abs(lfc) > delta
} else if (target.by == "effectsize") {
effectsize = lfc / sqrt(1/((log2(mu[,j,i] + 1)+log2(disp[,j,i] + 1))))
ix = abs(effectsize) > delta
}
Zg2[ix] = 1
}
### STRATIFICATION
## calculate stratificaton
# mean
X.bar1 = mu[,j,i]
ix.keep.mean = which(!is.na(X.bar1))
xgr.mean = cut(log2(X.bar1[ix.keep.mean]+1), strata.mean)
xgrd.mean = cut(log2(X.bar1[DEid]+1), strata.mean)
# dispersion
X.disp1 = disp[,j,i]
ix.keep.disps = which(!is.na(X.disp1))
xgr.disps = cut(log2(X.disp1[ix.keep.disps]+1), strata.disps)
xgrd.disps = cut(log2(X.disp1[DEid]+1), strata.disps)
# dropout
X.drop1 = dropout[,j,i]
ix.keep.drop = which(!is.na(X.drop1))
xgr.drop = cut(X.drop1[ix.keep.drop], strata.drop)
xgrd.drop = cut(X.drop1[DEid], strata.drop)
# lfc
X.lfc1 = elfc[,j,i]
ix.keep.lfc = which(!is.na(X.lfc1))
xgr.lfc = cut(X.lfc1[ix.keep.lfc], strata.lfc)
xgrd.lfc = cut(X.lfc1[DEid], strata.lfc)
### FILTERING
## calculate filtering
## stratify by mean
if(stratify.by == "mean") {
if(filter.by == "mean") {
lev.mean = levels(xgr.mean)
strata.filt.mean = c(1:strata.filtered)
fix.keep.mean = ix.keep.mean[!(xgr.mean %in% lev.mean[strata.filt.mean])]
fix.dekeep.mean = intersect(fix.keep.mean, DEid)
# recut
fxgr.mean = cut(log2(X.bar1[fix.keep.mean]+1), strata.mean[-strata.filt.mean])
fxgrd.mean = cut(log2(X.bar1[fix.dekeep.mean]+1), strata.mean[-strata.filt.mean])
}
if(filter.by == "dispersion") {
lev.disps = levels(xgr.disps)
strata.filt.disps = c((length(lev.disps)-strata.filtered+1):length(lev.disps))
fix.keep.mean = ix.keep.mean[!(xgr.disps %in% lev.disps[strata.filt.disps])]
fix.dekeep.mean = intersect(fix.keep.mean, DEid)
# recut
fxgr.mean = cut(log2(X.bar1[fix.keep.mean]+1), strata.mean)
fxgrd.mean = cut(log2(X.bar1[fix.dekeep.mean]+1), strata.mean)
}
if(filter.by == "dropout") {
lev.drop = levels(xgr.drop)
strata.filt.drop = c((length(lev.drop)-strata.filtered+1):length(lev.drop))
fix.keep.mean = ix.keep.mean[!(xgr.drop %in% lev.drop[strata.filt.drop])]
fix.dekeep.mean = intersect(fix.keep.mean, DEid)
# recut
fxgr.mean = cut(log2(X.bar1[fix.keep.mean]+1), strata.mean)
fxgrd.mean = cut(log2(X.bar1[fix.dekeep.mean]+1), strata.mean)
}
if(filter.by == "none") {
fix.keep.mean = ix.keep.mean
fxgr.mean = xgr.mean
fxgrd.mean = xgrd.mean
}
}
## stratify by dispersion
if(stratify.by == "dispersion") {
if(filter.by == "mean") {
lev.mean = levels(xgr.mean)
strata.filt.mean = c(1:strata.filtered)
fix.keep.disps = ix.keep.disps[!(xgr.mean %in% lev.mean[strata.filt.mean])]
fix.dekeep.disps = intersect(fix.keep.disps, DEid)
# recut
fxgr.disps = cut(log2(X.disp1[fix.keep.disps]+1), strata.disps)
fxgrd.disps = cut(log2(X.disp1[fix.dekeep.disps]+1), strata.disps)
}
if(filter.by == "dispersion") {
lev.disps = levels(xgr.disps)
lev.filt.disps = c((length(lev.disps)-strata.filtered+1):length(lev.disps))
fix.keep.disps = ix.keep.disps[!(xgr.disps %in% lev.disps[lev.filt.disps])]
fix.dekeep.disps = intersect(fix.keep.disps, DEid)
strata.filt.disps = length(strata.disps) - strata.filtered
# recut
fxgr.disps = cut(log2(X.disp1[fix.keep.disps]+1), strata.disps[1:strata.filt.disps])
fxgrd.disps = cut(log2(X.disp1[fix.dekeep.disps]+1), strata.disps[1:strata.filt.disps])
}
if(filter.by == "dropout") {
lev.drop = levels(xgr.drop)
strata.filt.drop = c((length(lev.drop)-strata.filtered+1):length(lev.drop))
fix.keep.disps = ix.keep.disps[!(xgr.drop %in% lev.drop[strata.filt.drop])]
fix.dekeep.disps = intersect(fix.keep.disps, DEid)
# recut
fxgr.disps = cut(X.disp1[fix.keep.disps], strata.disps)
fxgrd.disps = cut(X.disp1[fix.dekeep.disps], strata.disps)
}
if(filter.by == "none") {
fix.keep.disps = ix.keep.disps
fxgr.disps = xgr.disps
fxgrd.disps = xgrd.disps
}
}
## stratify by dropout
if(stratify.by == "dropout") {
if(filter.by == "mean") {
lev.mean = levels(xgr.mean)
strata.filt.mean = c(1:strata.filtered)
fix.keep.drop = ix.keep.drop[!(xgr.mean %in% lev.mean[strata.filt.mean])]
fix.dekeep.drop = intersect(fix.keep.drop, DEid)
# recut
fxgr.drop = cut(X.drop1[fix.keep.drop], strata.drop)
fxgrd.drop = cut(X.drop1[fix.dekeep.drop], strata.drop)
}
if(filter.by == "dispersion") {
lev.disps = levels(xgr.disps)
strata.filt.disps = c((length(lev.disps)-strata.filtered+1):length(lev.disps))
fix.keep.drop = ix.keep.drop[!(xgr.disps %in% lev.disps[strata.filt.disps])]
fix.dekeep.drop = intersect(fix.keep.drop, DEid)
# recut
fxgr.drop = cut(X.drop1[fix.keep.drop], strata.drop)
fxgrd.drop = cut(X.drop1[fix.dekeep.drop], strata.drop)
}
if(filter.by == "dropout") {
lev.drop = levels(xgr.drop)
strata.filt.drop = c((length(lev.drop)-strata.filtered+1):length(lev.drop))
fix.keep.drop = ix.keep.drop[!(xgr.drop %in% lev.drop[strata.filt.drop])]
fix.dekeep.drop = intersect(fix.keep.drop, DEid)
strata.filt.drop = length(strata.drop) - strata.filtered
# recut
fxgr.drop = cut(X.drop1[fix.keep.drop], strata.drop[1:strata.filt.drop])
fxgrd.drop = cut(X.drop1[fix.dekeep.drop], strata.drop[1:strata.filt.drop])
}
if(filter.by == "none") {
fix.keep.drop = ix.keep.drop
fxgr.drop = xgr.drop
fxgrd.drop = xgrd.drop
}
}
## stratify by lfc
if(stratify.by == "lfc") {
if(filter.by == "mean") {
lev.mean = levels(xgr.mean)
strata.filt.mean = c(1:strata.filtered)
fix.keep.lfc = ix.keep.lfc[!(xgr.mean %in% lev.mean[strata.filt.mean])]
fix.dekeep.lfc = intersect(fix.keep.lfc, DEid)
# recut
fxgr.lfc = cut(X.lfc1[fix.keep.lfc], strata.lfc)
fxgrd.lfc = cut(X.lfc1[fix.dekeep.lfc], strata.lfc)
}
if(filter.by == "dispersion") {
lev.disps = levels(xgr.disps)
strata.filt.disps = c((length(lev.disps)-strata.filtered+1):length(lev.disps))
fix.keep.lfc = ix.keep.lfc[!(xgr.disps %in% lev.mean[strata.filt.disps])]
fix.dekeep.lfc = intersect(fix.keep.lfc, DEid)
# recut
fxgr.lfc = cut(X.lfc1[fix.keep.lfc], strata.lfc)
fxgrd.lfc = cut(X.lfc1[fix.dekeep.lfc], strata.lfc)
}
if(filter.by == "dropout") {
lev.drop = levels(xgr.drop)
strata.filt.drop = c((length(lev.drop)-strata.filtered+1):length(lev.drop))
fix.keep.lfc = ix.keep.lfc[!(xgr.drop %in% lev.drop[strata.filt.drop])]
fix.dekeep.lfc = intersect(fix.keep.lfc, DEid)
# recut
fxgr.lfc = cut(X.lfc1[fix.keep.lfc], strata.lfc)
fxgrd.lfc = cut(X.lfc1[fix.dekeep.lfc], strata.lfc)
}
if(filter.by == "none") {
fix.keep.lfc = ix.keep.lfc
fxgr.lfc = xgr.lfc
fxgr.lfc = xgr.lfc
fxgrd.lfc = xgrd.lfc
}
}
### SET STRATIFICATION
if(stratify.by == "mean") {
strata = strata.mean
xgr = fxgr.mean
xgrd = fxgrd.mean
ix.keep = fix.keep.mean
}
if(stratify.by == "dispersion") {
strata = strata.disps
xgr = fxgr.disps
xgrd = fxgrd.disps
ix.keep = fix.keep.disps
}
if(stratify.by == "dropout") {
strata = strata.drop
xgr = fxgr.drop
xgrd = fxgrd.drop
ix.keep = fix.keep.drop
}
if(stratify.by == "lfc") {
strata = strata.lfc
xgr = fxgr.lfc
xgrd = fxgrd.lfc
ix.keep = fix.keep.lfc
}
## get type I error alpha (pvalue or fdr output from testing)
if(alpha.type == "raw") {
if(DEmethod %in% c("edgeR-QL", "edgeR-LRT", "limma-voom", "limma-trend", "NBPSeq", "T-Test",
"DESeq2", "ROTS", "MAST", "scde", "BPSC", "scDD", "monocle", "DECENT",
"edgeR-zingeR", "edgeR-ZINB-WaVE", "DESeq2-zingeR", "DESeq2-ZINB-WaVE")) {
x = pvalue[ix.keep,j,i]
x[is.na(x)] = 1
}
if(DEmethod %in% c("baySeq", "NOISeq", "EBSeq")) {
message(paste0("The DE method ", DEmethod," only provides adjusted p-values."))
x = fdr[ix.keep,j,i]
x[is.na(x)] = 1
}
}
if(alpha.type == "adjusted") {
if(DEmethod %in% c("edgeR-QL", "edgeR-LRT", "limma-voom", "limma-trend", "NBPSeq", "T-Test",
"DESeq2", "ROTS", "MAST", "scde", "BPSC", "scDD", "monocle", "DECENT",
"edgeR-zingeR", "edgeR-ZINB-WaVE", "DESeq2-zingeR", "DESeq2-ZINB-WaVE")) {
pval = pvalue[ix.keep,j,i]
meanexpr = log2(mu[ix.keep,j,i]+1)
if(MTC %in% stats::p.adjust.methods) {
x = stats::p.adjust(pval, method = MTC)
x[is.na(x)] = 1
}
if(MTC %in% "Storey") {
tmp.p = pval[!is.na(pval)]
tmp.q = qvalue::qvalue(p = tmp.p)$qvalues
x = rep(NA, length(pval))
x[!is.na(pval)] = tmp.q
x[is.na(x)] = 1
}
if(MTC %in% "IHW") {
in.dat = data.frame(pvalue = pval, meanexpr = meanexpr)
tmp = IHW::ihw(pvalue ~ meanexpr, data = in.dat, alpha = alpha.nominal)
x = IHW::adj_pvalues(tmp)
x[is.na(x)] = 1
}
}
if(DEmethod %in% c("baySeq", "NOISeq", "EBSeq")) {
message(paste0("The DE method ", DEmethod," only provides adjusted p-values."))
x = fdr[ix.keep,j,i]
x[is.na(x)] = 1
}
}
## update Zg flags after filtering
Zg = Zg[ix.keep]
Zg2 = Zg2[ix.keep]
# number of strata genes and diff strata genes in output table
xgrl[,j,i] = table(xgr)
xgrld[,j,i] = table(xgrd)
## calculate stratified power-related quantities
error.mat = .error.matrix(p=x, p.crit=alpha.nominal, Zg=Zg, Zg2=Zg2, xgr=xgr)
TP[,j,i] = error.mat$TP
TN[,j,i] = error.mat$TN
FP[,j,i] = error.mat$FP
FN[,j,i] = error.mat$FN
TP.marginal[j,i] = error.mat$TP.marginal
TN.marginal[j,i] = error.mat$TN.marginal
FP.marginal[j,i] = error.mat$FP.marginal
FN.marginal[j,i] = error.mat$FN.marginal
TPR[,j,i] = error.mat$TPR
TNR[,j,i] = error.mat$TNR
FPR[,j,i] = error.mat$FPR
FNR[,j,i] = error.mat$FNR
FDR[,j,i] = error.mat$FDR
TPR.marginal[j,i] = error.mat$TPR.marginal
TNR.marginal[j,i] = error.mat$TNR.marginal
FPR.marginal[j,i] = error.mat$FPR.marginal
FNR.marginal[j,i] = error.mat$FNR.marginal
FDR.marginal[j,i] = error.mat$FDR.marginal
}
}
output <- list(stratagenes=xgrl,
stratadiffgenes=xgrld,
TN=TN, TP=TP, FP=FP, FN=FN,
TN.marginal=TN.marginal,
TP.marginal=TP.marginal,
FP.marginal=FP.marginal,
FN.marginal=FN.marginal,
TNR=TNR, TPR=TPR, FPR=FPR, FNR=FNR, FDR=FDR,
TNR.marginal=TNR.marginal, TPR.marginal=TPR.marginal,
FPR.marginal=FPR.marginal, FNR.marginal=FNR.marginal,
FDR.marginal=FDR.marginal,
## below are input parameters:
alpha.type=alpha.type, MTC=ifelse(alpha.type=="adjusted", MTC, "not applicable"),
alpha.nominal=alpha.nominal,
stratify.by=stratify.by, strata=strata, strata.levels=levels(xgr),
target.by=target.by, n1=Nreps1, n2=Nreps2, delta=delta)
if(Table){
printEvalDE(evalRes = output)
}
return(output)
}
# EVALUATE SETUP ----------------------------------------------------------
#' @name evaluateSim
#' @aliases evaluateSim
#' @title Compute the performance related metrics from simulation results.
#' @description This function takes the simulation output from \code{\link{simulateDE}}
#' and computes several metrics that give an indication of the simulation setup performance.
#' @usage evaluateSim(simRes)
#' @param simRes The result from \code{\link{simulateDE}}.
#' @return A list with the following entries:
#' \item{LogFoldChange}{The absolute mean error (\code{MAE}), root mean square error (\code{RMSE})
#' and root mean square residual error of a robust linear model (\code{rRMSE}, \code{\link[MASS]{rlm}})
#' of log fold change differences between estimated and simulated.
#' Furthermore, the fraction of missing entries (\code{NAFraction}) for MAE and RMSE.}
#' \item{SizeFactors}{The median absolute deviation (MAD) between the simulated and estimated size factors,
#' the root mean square residual error of a robust linear model (rRMSE, \code{\link[MASS]{rlm}}) and
#' the group-specific ratio between simulated and estimated size factors (\code{GroupX}).}
#' @author Beate Vieth
#' @seealso \code{\link{estimateParam}} for negative binomial parameters,
#' \code{\link{Setup}} for setting up simulation parameters and
#' \code{\link{simulateDE}} for simulating differential expression
#' @examples
#' \dontrun{
#' # estimate gene parameters
#' data("SmartSeq2_Gene_Read_Counts")
#' Batches = data.frame(Batch = sapply(strsplit(colnames(SmartSeq2_Gene_Read_Counts), "_"), "[[", 1),
#' stringsAsFactors = FALSE,
#' row.names = colnames(SmartSeq2_Gene_Read_Counts))
#' data("GeneLengths_mm10")
#' estparam_gene <- estimateParam(countData = SmartSeq2_Gene_Read_Counts,
#' readData = NULL,
#' batchData = Batches,
#' spikeData = NULL, spikeInfo = NULL,
#' Lengths = GeneLengths_mm10, MeanFragLengths = NULL,
#' RNAseq = 'singlecell', Protocol = 'Read',
#' Distribution = 'ZINB', Normalisation = "scran",
#' GeneFilter = 0.1, SampleFilter = 3,
#' sigma = 1.96, NCores = NULL, verbose = TRUE)
#' # define log fold change
#' p.lfc <- function(x) sample(c(-1,1), size=x,replace=TRUE)*rgamma(x, shape = 1, rate = 2)
#' # set up simulations
#' setupres <- Setup(ngenes = 10000, nsims = 10,
#' p.DE = 0.1, pLFC = p.lfc,
#' n1 = c(20,50,100), n2 = c(30,60,120),
#' Thinning = c(1,0.9,0.8), LibSize = 'given',
#' estParamRes = estparam_gene,
#' estSpikeRes = NULL,
#' DropGenes = FALSE,
#' sim.seed = 66437, verbose = TRUE)
#' # run simulation
#' simres <- simulateDE(SetupRes = setupres,
#' Prefilter = "FreqFilter",
#' Imputation = NULL,
#' Normalisation = 'scran', Label = 'none',
#' DEmethod = "limma-trend", DEFilter = FALSE,
#' NCores = NULL, verbose = TRUE)
#'
#' # evaluation
#' evalsimres <- evaluateSim(simRes = simres)
#' plotEvalSim(evalRes = evalsimres, Annot = TRUE)
#' plotTime(evalRes = evalsimres, Annot = TRUE)
#' }
#' @rdname evaluateSim
#' @importFrom stats mad
#' @importFrom matrixStats colSds
#' @export
evaluateSim <- function(simRes) {
  ## -------------------------------------------------------------------------
  ## Extract simulation setup parameters from the simulateDE() result object.
  ## -------------------------------------------------------------------------
  Nreps1 = simRes$SimSetup$n1
  Nreps2 = simRes$SimSetup$n2
  ngenes = simRes$DESetup$ngenes
  DEids = simRes$DESetup$DEid
  tlfcs = simRes$DESetup$pLFC
  nsims = simRes$DESetup$nsims
  t.designs = simRes$SimulateRes$true.designs
  ## estimated quantities produced by the DE pipeline
  elfcs = simRes$SimulateRes$elfc
  tsfs = simRes$SimulateRes$true.sf
  esfs = simRes$SimulateRes$est.sf
  time.taken <- simRes$SimulateRes$time.taken
  ## one result entry per sample size setting, e.g. "20 vs 30"
  my.names = paste0(Nreps1, " vs ", Nreps2)
  ## -------------------------------------------------------------------------
  ## Preallocate result containers (one list entry per sample size setting).
  ## -------------------------------------------------------------------------
  ## log2 fold change errors: rows = simulations, 15 columns =
  ## (ALL, DE, EE) x (RMSE, MAE, RMSE NA-fraction, MAE NA-fraction, rRMSE)
  lfc.error.mat <- lapply(seq_along(my.names), function(x) {
    matrix(NA, nrow = nsims, ncol = 15,
           dimnames = list(c(paste0("Sim", seq_len(nsims))),
                           c(paste0(rep(x=c("ALL", "DE", "EE"),each=5),"_",
                                    c('RMSE_Value', "MAE_Value", "RMSE_NAFraction", "MAE_NAFraction", "rRMSE_Value")))))
  })
  names(lfc.error.mat) <- my.names
  ## size factor errors: MAD, rRMSE and per-group ratio of estimated vs true
  sf.error.mat <- lapply(seq_along(my.names), function(x) {
    matrix(NA, nrow = nsims, ncol = 4,
           dimnames = list(c(paste0("Sim_", seq_len(nsims))),
                           c("MAD", "rRMSE", "Group 1","Group 2")))
  })
  names(sf.error.mat) <- my.names
  ## computing time per pipeline step: mean, SD and SEM over simulations
  time.taken.mat <- lapply(seq_along(my.names), function(x) {
    data.frame(matrix(NA, nrow = length(colnames(time.taken[[1]])),
                      ncol = 3, dimnames = list(c(colnames(time.taken[[1]])),
                                                c("Mean", "SD", "SEM")))
    )
  })
  names(time.taken.mat) <- my.names
  ## -------------------------------------------------------------------------
  ## Loop over simulations (i) and sample size settings (j).
  ## -------------------------------------------------------------------------
  for(i in seq_len(nsims)) {
    ## flag the genes simulated as DE in this simulation
    DEid = DEids[[i]]
    Zg = rep(0, ngenes)
    Zg[DEid] = 1
    ## true log fold change of all genes
    all.tlfc = tlfcs[[i]]
    ## true log fold change of DE genes
    de.tlfc = all.tlfc[which(Zg==1)]
    ## true log fold change of EE genes
    ee.tlfc = all.tlfc[which(Zg==0)]
    for(j in seq_along(Nreps1)) {
      ## LOG2 FOLD CHANGES ---------------------------------------------------
      ## estimated log fold change of all genes
      all.elfc = elfcs[, j, i]
      ## estimated log fold changes of EE genes
      ix.ee.lfc = which(Zg==0)
      ee.lfc = all.elfc[ix.ee.lfc]
      ## estimated log fold change of DE genes
      ix.de.lfc = which(Zg==1)
      de.lfc = all.elfc[ix.de.lfc]
      ## RMSE / MAE / rRMSE between true and estimated log fold changes
      all.error <- .lfc.evaluate(truth=all.tlfc, estimated=all.elfc)
      ee.error <- .lfc.evaluate(truth=ee.tlfc, estimated=ee.lfc)
      de.error <- .lfc.evaluate(truth=de.tlfc, estimated=de.lfc)
      ## column order must match the dimnames above: ALL, DE, EE
      error.est <- c(all.error, de.error, ee.error)
      lfc.error.mat[[j]][i,] = error.est
      ## SIZE FACTORS --------------------------------------------------------
      ## true size factors over all samples, centered to mean = 1
      tsf = tsfs[[j]][i, ]
      n.tsf = tsf*length(tsf)/sum(tsf)
      ## estimated size factors over all samples, centered to mean = 1
      esf = esfs[[j]][i,]
      n.esf = esf*length(esf)/sum(esf)
      ## MAD of log fold change difference between estimated and true size
      ## factors; MAD is invariant to the additive shift that centering
      ## introduces on the log scale, so the uncentered values suffice here
      lfc.nsf = log2(esf) - log2(tsf)
      mad.nsf = stats::mad(lfc.nsf)
      ## rRMSE of the size factor estimation
      error.sf <- .fiterror.sf(estimated.sf = n.esf, true.sf = n.tsf)
      ## ratio of estimated and true size factors per true group assignment
      t.design = t.designs[[j]][i,]
      ratio.sf <- .ratio.sf(estimated.nsf = n.esf,
                            true.nsf = n.tsf,
                            group=t.design)
      sf.res <- unlist(c(mad.nsf, error.sf, ratio.sf))
      names(sf.res) <- NULL
      sf.error.mat[[j]][i,] = sf.res
    }
  }
  ## TIMING --------------------------------------------------------------------
  ## summarise computing time per pipeline step over all simulations
  for(j in seq_along(Nreps1)) {
    tmp.time <- time.taken[[j]]
    time.taken.mat[[j]][,"Mean"] <- colMeans(tmp.time)
    time.taken.mat[[j]][,"SD"] <- matrixStats::colSds(tmp.time)
    time.taken.mat[[j]][,"SEM"] <- matrixStats::colSds(tmp.time)/sqrt(nsims)
  }
  output <- list(LogFoldChange = lfc.error.mat,
                 SizeFactors = sf.error.mat,
                 Pipeline = simRes$Pipeline,
                 DESetup = simRes$DESetup,
                 Timing = time.taken.mat)
  ## return object
  return(output)
}
# EVALUATE ROCR -----------------------------------------------------------
#' @name evaluateROC
#' @aliases evaluateROC
#' @title Receiver operator characteristics of simulation results
#' @description This function takes the simulation output from \code{\link{simulateDE}}
#' and computes receiver operator characteristics (ROC) and area under the curve values (AUC) with the help of functions implemented in \code{\link[iCOBRA]{calculate_performance}}.
#' @usage evaluateROC(simRes,
#' alpha.type=c("adjusted","raw"),
#' MTC=c('BY', 'BH', 'Storey', 'IHW',
#' 'holm', 'hochberg', 'hommel', 'bonferroni'),
#' alpha.nominal = 0.1,
#' target.by=c("lfc", "effectsize"), delta=0,
#' raw = FALSE, verbose = TRUE)
#' @param simRes The result from \code{\link{simulateDE}}.
#' @param alpha.type A string to represent the way to call DE genes.
#' Available options are \code{"adjusted"} i.e. applying multiple testing correction and
#' \code{"raw"} i.e. using p-values. Default is \code{"adjusted"}.
#' @param MTC Multiple testing correction method to use. Available options are
#' 1) see \link[stats]{p.adjust.methods},
#' 2) Storey's qvalue see \link[qvalue]{qvalue} and
#' 3) Independent Hypothesis Weighting considering mean expression as covariate (see \link[IHW]{ihw}).
#' Default is \code{BY}, i.e. Benjamini-Yekutieli FDR correction method.
#' @param alpha.nominal The nominal value of significance. Default is 0.1.
#' @param target.by A string to specify the method to define "biologically important" DE genes.
#' Available options are (1) \code{"lfc"}: interesting genes are defined by absolute log2 fold changes.
#' (2) \code{"effectsize"}: interesting genes are defined by
#' absolute log2 fold changes divided by the square root of 1/(mean+dispersion).
#' @param delta A threshold used for defining "biologically important" genes.
#' Genes with absolute log2 fold changes (when target.by is "lfc")
#' or effect sizes (when target.by is "effectsize") greater than this value
#' are deemed DE in error rates calculations.
#' If the default \code{delta=0}, then no threshold is applied.
#' @param raw Logical vector. The default \code{FALSE} removes
#' intermediate calculations from the output since they can be quite large.
#' @param verbose Logical vector to indicate whether to show progress report of evaluation.
#' Default is \code{TRUE}.
#' @return A list with the following entries:
#' \item{Performances}{The output of \code{\link[iCOBRA]{calculate_performance}} of aspect "fdrtprcurve" calculating the proportions of TN, FN, TP and FP and related rates which uses \code{\link[ROCR]{prediction}} and \code{\link[ROCR]{performance}} internally.}
#' \item{TPRvsFDR}{The output of \code{\link[iCOBRA]{calculate_performance}} of aspect "fdrtpr" calculating the proportions of TN, FN, TP and FP and associated TPR and observed FDR for each nominal level from 0.01 to 1 in steps of 0.01.}
#' \item{Scores}{The mean +/- standard error of performance measures and area under the curve. These are calculated once for conservative (extension "conv") and once for liberal nominal (extension "lib") FDR levels. Measures include: accuracy (ACC), F-measure of precision and recall (F1score), Matthew's correlation coefficient (MCC), partial Area under the Curve of TPR vs FDR (TPRvsFDR_pAUC), area under the ROC-curve (TPRvsFPR_AUC) and area under the PR-curve (TPRvsPPV_AUC).}
#' \item{Raw}{If \code{raw} is \code{TRUE}, then the intermediate calculations of the above three list entries is also included in the output.}
#' \item{Settings}{Reiterating chosen evaluation parameters.}
#' @author Beate Vieth
#' @seealso \code{\link{estimateParam}} for parameter estimation needed for simulations,
#' \code{\link{Setup}} for setting up simulation parameters and
#' \code{\link{simulateDE}} for simulating differential expression.
#' @examples
#' \dontrun{
#' # estimate gene parameters
#' data("SmartSeq2_Gene_Read_Counts")
#' estparam_gene <- estimateParam(countData = SmartSeq2_Gene_Read_Counts,
#' readData = NULL,
#' batchData = NULL,
#' spikeData = NULL, spikeInfo = NULL,
#' Lengths = NULL, MeanFragLengths = NULL,
#' RNAseq = 'singlecell', Protocol = 'Read',
#' Distribution = 'ZINB', Normalisation = "scran",
#' GeneFilter = 0.1, SampleFilter = 3,
#' sigma = 1.96, NCores = NULL, verbose = TRUE)
#' # define log2 fold change
#' p.lfc <- function(x) sample(c(-1,1), size=x,replace=TRUE)*rgamma(x, shape = 1, rate = 2)
#' # set up simulations
#' setupres <- Setup(ngenes = 10000, nsims = 10,
#' p.DE = 0.1, pLFC = p.lfc,
#' n1 = c(20,50,100), n2 = c(30,60,120),
#' Thinning = c(1,0.9,0.8), LibSize = 'given',
#' estParamRes = estparam_gene,
#' estSpikeRes = NULL,
#' DropGenes = TRUE,
#' sim.seed = 83596, verbose = TRUE)
#' # run simulation
#' simres <- simulateDE(SetupRes = setupres,
#' Prefilter = "FreqFilter", Imputation = NULL,
#' Normalisation = 'scran', Label = 'none',
#' DEmethod = "limma-trend", DEFilter = FALSE,
#' NCores = NULL, verbose = TRUE)
#' # evaluation
#' evalrocres <- evaluateROC(simRes = simres,
#' alpha.type = "adjusted",
#' MTC = 'BH', alpha.nominal = 0.05,
#' raw = FALSE)
#' # plot evaluation
#' plotEvalROC(evalRes = evalrocres)
#' }
#' @rdname evaluateROC
#' @importFrom stats ecdf quantile p.adjust.methods p.adjust
#' @importFrom qvalue qvalue
#' @importFrom IHW ihw adj_pvalues
#' @importFrom iCOBRA calculate_performance COBRAData
#' @importFrom tidyr "%>%"
#' @importFrom dplyr select
#' @importFrom utils tail capture.output
#' @importFrom zoo rollmean
#' @export
evaluateROC <- function(simRes, alpha.type=c("adjusted","raw"),
                        MTC=c('BY', 'BH', 'Storey', 'IHW',
                              'holm', 'hochberg', 'hommel', 'bonferroni'),
                        alpha.nominal = 0.1,
                        target.by=c("lfc", "effectsize"),
                        delta=0,
                        raw = FALSE,
                        verbose = TRUE) {
  ## resolve enumerated arguments
  alpha.type = match.arg(alpha.type)
  MTC = match.arg(MTC)
  target.by = match.arg(target.by)
  ## simulation setup and DE pipeline results
  Nreps1 = simRes$SimSetup$n1
  Nreps2 = simRes$SimSetup$n2
  ngenes = simRes$DESetup$ngenes
  nsims = simRes$DESetup$nsims
  DEids = simRes$DESetup$DEid
  lfcs = simRes$DESetup$pLFC
  elfcs = simRes$SimulateRes$elfc
  DEmethod = simRes$Pipeline$DEmethod
  pvalue = simRes$SimulateRes$pvalue
  fdr = simRes$SimulateRes$fdr
  mu = simRes$SimulateRes$mu
  disp = simRes$SimulateRes$disp
  if (verbose) { message(paste0("Preparing output arrays.")) }
  ## one entry per sample size setting, e.g. "20 vs 30"
  my.names = paste0(Nreps1, " vs ", Nreps2)
  ## truth labels and predicted significance per setting x gene x simulation
  Truths = Predictions = array(NA, dim = c(length(Nreps1),
                                           ngenes, nsims))
  ## per-setting, per-simulation performance tables
  perf <- lapply(seq_along(my.names), function(x) {
    vector("list", nsims)
  })
  tprvsfdr <- lapply(seq_along(my.names), function(x) {
    vector("list", nsims)
  })
  ## per-setting score summary (AUCs, pAUC, ACC, F1, MCC)
  scores <- lapply(seq_along(my.names), function(x) {
    tmp = data.frame(matrix(NA, nrow = nsims, ncol = 12,
                            dimnames = list(NULL, c("Samples", "Sim",
                                                    "TPRvsFPR_AUC", "TPRvsPPV_AUC",
                                                    "TPRvsFDR_pAUC_lib", "TPRvsFDR_pAUC_conv",
                                                    "ACC_lib", "ACC_conv",
                                                    "F1score_lib", "F1score_conv",
                                                    "MCC_lib", "MCC_conv"
                            ))))
    tmp[, "Samples"] = rep(x = my.names[x], nsims)
    tmp[, "Sim"] = seq_len(nsims)
    tmp
  })
  if (verbose) { message(paste0("Calculating performance measures.")) }
  ## loop over simulations (i) and sample size settings (j)
  for (i in seq_len(nsims)) {
    for (j in seq_along(Nreps1)) {
      elfc = as.numeric(elfcs[, j, i])
      ## DE flags: Zg = simulated DE, Zg2 = "biologically important" DE
      DEid = DEids[[i]]
      lfc = lfcs[[i]]
      Zg = Zg2 = rep(0, ngenes)
      Zg[DEid] = 1
      if (delta == 0) {
        Zg2 = Zg
      }
      if (!delta == 0) {
        if (target.by == "lfc") {
          ix = abs(lfc) > delta
        }
        else if (target.by == "effectsize") {
          effectsize = lfc / sqrt(1/((log2(mu[,j,i] + 1)+log2(disp[,j,i] + 1))))
          ix = abs(effectsize) > delta
        }
        Zg2[ix] = 1
      }
      ## raw p-values and mean expression are needed below to build the
      ## iCOBRA input irrespective of alpha.type.
      ## FIX: these were previously assigned only inside the
      ## alpha.type == "adjusted" branch, so alpha.type = "raw" (and the
      ## adjusted-only DE methods) hit undefined variables at COBRAData().
      pval = pvalue[, j, i]
      meanexpr = mu[, j, i]
      ## get type I error alpha (p-value or fdr output from testing)
      if (alpha.type == "raw") {
        if (DEmethod %in% c("edgeR-QL", "edgeR-LRT", "T-Test",
                            "limma-voom", "limma-trend", "NBPSeq", "DESeq2",
                            "ROTS", "MAST", "scde", "BPSC", "scDD", "monocle",
                            "DECENT", "edgeR-zingeR", "edgeR-ZINB-WaVE",
                            "DESeq2-zingeR", "DESeq2-ZINB-WaVE")) {
          x = pval
          x[is.na(x)] = 1
        }
        if (DEmethod %in% c("baySeq", "NOISeq", "EBSeq")) {
          message(paste0("The DE method ", DEmethod,
                         " only provides adjusted p-values."))
          x = fdr[, j, i]
          x[is.na(x)] = 1
        }
      }
      if (alpha.type == "adjusted") {
        ## NOTE(review): "scde" and "monocle" appear in the raw branch above
        ## but not here -- confirm whether that omission is intentional.
        if (DEmethod %in% c("edgeR-QL", "edgeR-LRT", "T-Test",
                            "limma-voom", "limma-trend", "NBPSeq", "DESeq2",
                            "ROTS", "MAST", "BPSC", "scDD",
                            "DECENT", "edgeR-zingeR", "edgeR-ZINB-WaVE",
                            "DESeq2-zingeR", "DESeq2-ZINB-WaVE")) {
          if (MTC %in% stats::p.adjust.methods) {
            x = stats::p.adjust(pval, method = MTC)
            x[is.na(x)] = 1
          }
          if (MTC %in% "Storey") {
            ## qvalue() cannot handle NA input; mask, adjust, restore
            tmp.p = pval[!is.na(pval)]
            tmp.q = qvalue::qvalue(p = tmp.p)$qvalues
            x = rep(NA, length(pval))
            x[!is.na(pval)] = tmp.q
            x[is.na(x)] = 1
          }
          if (MTC %in% "IHW") {
            ## independent hypothesis weighting with mean expression covariate
            in.dat = data.frame(pvalue = pval, meanexpr = meanexpr)
            tmp = IHW::ihw(pvalue ~ meanexpr, data = in.dat,
                           alpha = alpha.nominal)
            x = IHW::adj_pvalues(tmp)
            x[is.na(x)] = 1
          }
        }
        if (DEmethod %in% c("baySeq", "NOISeq", "EBSeq")) {
          message(paste0("The DE method ", DEmethod,
                         " only provides adjusted p-values."))
          x = fdr[, j, i]
          x[is.na(x)] = 1
        }
      }
      Truths[j, , i] = Zg2
      Predictions[j, , i] = x
      ## assemble iCOBRA input: raw p-values, adjusted p-values, lfc scores
      ## and the truth table (DE status, simulated LFC, mean expression)
      cobradata <- iCOBRA::COBRAData(pval = data.frame(Sim = pval, row.names = paste0("G", 1:ngenes)),
                                     padj = data.frame(Sim = x, row.names = paste0("G", 1:ngenes)),
                                     score = data.frame(Sim = elfc, row.names = paste0("G", 1:ngenes)),
                                     truth = data.frame(status = Zg2, logFC = lfc, expr = meanexpr, row.names = paste0("G", 1:ngenes)))
      ## calculate_performance is chatty; silence both messages and printed
      ## output (capture.output is namespaced so the package NAMESPACE works)
      invisible(utils::capture.output(res <- suppressMessages(
        iCOBRA::calculate_performance(cobradata,
                                      aspects = c("fdrtpr", "fdrtprcurve"),
                                      binary_truth = "status", cont_truth = "logFC",
                                      thrs = seq(from = 0.01, to = 1, by = 0.01),
                                      splv = "none",
                                      maxsplit = 4, onlyshared = FALSE, thr_venn = 0.05,
                                      type_venn = "adjp", topn_venn = 100, rank_by_abs = TRUE,
                                      prefer_pval = TRUE))
      ))
      ## annotate the FDR/TPR curve with setting and simulation identifiers
      fdrtprcurve <- res@fdrtprcurve %>%
        dplyr::select(-c(!!"method":!!"splitval")) %>%
        dplyr::mutate(Samples = my.names[j], Sim = i)
      perf.dat <- .roc.calc(df = fdrtprcurve)
      perf[[j]][[i]] <- perf.dat[, c("Samples", "Sim", "CUTOFF",
                                     "TP", "FP", "TN", "FN",
                                     "FPR", "FNR", "FDR",
                                     "TPR", "TNR", "NBR",
                                     "PPV", "ACC",
                                     "F1score", "MCC")]
      ## AUC calculations (trapezoidal rule via rolling means of the curve)
      fpr <- perf[[j]][[i]]$FPR
      tpr <- perf[[j]][[i]]$TPR
      ppv <- perf[[j]][[i]]$PPV
      ## TPR versus FPR (classic ROC AUC)
      fpr1 <- fpr[!is.na(fpr) & !is.na(tpr)]
      tpr1 <- tpr[!is.na(fpr) & !is.na(tpr)]
      id <- order(fpr1)
      scores[[j]][i, "TPRvsFPR_AUC"] <- sum(diff(fpr1[id]) * zoo::rollmean(tpr1[id], 2))
      ## TPR versus PPV (precision-recall AUC)
      ppv1 <- ppv[!is.na(ppv) & !is.na(tpr)]
      tpr1 <- tpr[!is.na(ppv) & !is.na(tpr)]
      id <- order(ppv1)
      scores[[j]][i, "TPRvsPPV_AUC"] <- sum(diff(ppv1[id]) * zoo::rollmean(tpr1[id], 2))
      ## observed vs nominal FDR at each tested threshold
      obs.fdr = as.vector(res@fdrtpr[,c("FDR")])
      tmp.nom = as.vector(res@fdrtpr[,c("thr")])
      nom.fdr = as.numeric(gsub("thr", "", tmp.nom))
      dat.fdr = data.frame(obs.fdr, nom.fdr)
      ## TPR vs FDR (liberal); MCC (liberal); F1 score (liberal); ACC (liberal)
      ## 'liberal' relaxes the nominal-threshold constraint when the observed
      ## FDR stays within the nominal level
      a.fdr = utils::tail(dat.fdr[dat.fdr$obs.fdr <=alpha.nominal & dat.fdr$nom.fdr <= alpha.nominal, "obs.fdr"], 1)
      if (all(c(!length(a.fdr) == 0, !a.fdr == 0))) {
        if(a.fdr <= alpha.nominal) {
          a.fdr = utils::tail(dat.fdr[dat.fdr$obs.fdr <= alpha.nominal, "obs.fdr"], 1)
        }
        ## partial AUC of TPR vs FDR below the cut point, scaled by the
        ## nominal alpha; 'x' is safely reused here since the padj vector is
        ## no longer needed
        x <- as.vector(res@fdrtprcurve[,c("FDR")])[-1]
        y <- as.vector(res@fdrtprcurve[,c("TPR")])[-1]
        y <- y[x<a.fdr]
        x <- x[x<a.fdr]
        x1 <- x[!is.na(x) & !is.na(y)]
        y1 <- y[!is.na(x) & !is.na(y)]
        id <- order(x1)
        pauc_lib <- sum(diff(x1[id]) * zoo::rollmean(y1[id], 2)) / alpha.nominal
        mcc <- as.vector(perf[[j]][[i]][,c("MCC")])[-1]
        mcc_lib <- utils::tail(mcc[as.vector(res@fdrtprcurve[,c("FDR")])[-1] < a.fdr], 1)
        f1 <- as.vector(perf[[j]][[i]][,c("F1score")])[-1]
        f1_lib <- utils::tail(f1[as.vector(res@fdrtprcurve[,c("FDR")])[-1] < a.fdr], 1)
        acc <- as.vector(perf[[j]][[i]][,c("ACC")])[-1]
        acc_lib <- utils::tail(acc[as.vector(res@fdrtprcurve[,c("FDR")])[-1] < a.fdr], 1)
      }
      if (any(c(length(a.fdr) == 0, a.fdr == 0))) {
        pauc_lib <- f1_lib <- acc_lib <- 0
        mcc_lib <- NA
      }
      scores[[j]][i, "TPRvsFDR_pAUC_lib"] <- pauc_lib
      scores[[j]][i, "MCC_lib"] <- mcc_lib
      scores[[j]][i, "F1score_lib"] <- f1_lib
      scores[[j]][i, "ACC_lib"] <- acc_lib
      ## TPR vs FDR (conservative); MCC (conservative); F1 score (conservative); ACC (conservative)
      ## 'conservative' keeps both the observed and nominal FDR constraints
      a.fdr = utils::tail(dat.fdr[dat.fdr$obs.fdr <= alpha.nominal &
                                    dat.fdr$nom.fdr <= alpha.nominal,
                                  "obs.fdr"], 1)
      if (all(c(!length(a.fdr) == 0, !a.fdr == 0))) {
        x <- as.vector(res@fdrtprcurve[,c("FDR")])[-1]
        y <- as.vector(res@fdrtprcurve[,c("TPR")])[-1]
        y <- y[x<a.fdr]
        x <- x[x<a.fdr]
        x1 <- x[!is.na(x) & !is.na(y)]
        y1 <- y[!is.na(x) & !is.na(y)]
        id <- order(x1)
        pauc_conv <- sum(diff(x1[id]) * zoo::rollmean(y1[id], 2)) / alpha.nominal
        mcc <- as.vector(perf[[j]][[i]][,c("MCC")])[-1]
        mcc_conv <- utils::tail(mcc[as.vector(res@fdrtprcurve[,c("FDR")])[-1] < a.fdr], 1)
        f1 <- as.vector(perf[[j]][[i]][,c("F1score")])[-1]
        f1_conv <- utils::tail(f1[as.vector(res@fdrtprcurve[,c("FDR")])[-1] < a.fdr], 1)
        acc <- as.vector(perf[[j]][[i]][,c("ACC")])[-1]
        acc_conv <- utils::tail(acc[as.vector(res@fdrtprcurve[,c("FDR")])[-1] < a.fdr], 1)
      }
      if (any(c(length(a.fdr) == 0, a.fdr == 0))) {
        pauc_conv <- f1_conv <- acc_conv <- 0
        mcc_conv <- NA
      }
      scores[[j]][i, "TPRvsFDR_pAUC_conv"] <- pauc_conv
      scores[[j]][i, "MCC_conv"] <- mcc_conv
      scores[[j]][i, "F1score_conv"] <- f1_conv
      scores[[j]][i, "ACC_conv"] <- acc_conv
      ## TPR and observed FDR per nominal threshold
      fdrtpr <- res@fdrtpr %>%
        dplyr::mutate(Samples = my.names[j], Sim = i)
      fdrtpr[, "Threshold"] <- as.numeric(gsub(x = fdrtpr$thr, pattern = "thr", replacement = ""))
      tprvsfdr[[j]][[i]] <- fdrtpr[, c("Samples", "Sim", "Threshold",
                                       "TP", "FP", "TN", "FN",
                                       "TPR", "FDR", "NBR")]
    }
  }
  names(tprvsfdr) <- names(perf) <- names(scores) <- my.names
  if (verbose) { message(paste0("Summarising performance measures.")) }
  ## summarise over simulations; intermediate objects are only returned
  ## when the user asked for them via 'raw'
  PERF <- .perf.summary.calc(calc.obj = perf)
  TPR_FDR <- .tprvsfdr.summary.calc(calc.obj = tprvsfdr)
  SCORES <- .scores.summary.calc(calc.obj = scores)
  if (raw) {
    RAW <- list("PERF" = perf,
                "TPR_FDR" = tprvsfdr,
                "SCORES" = scores)
  } else {
    RAW <- NULL
  }
  output <- list(Performances = PERF,
                 TPRvsFDR = TPR_FDR,
                 Scores = SCORES,
                 Raw = RAW,
                 Settings = list(alpha.type = alpha.type,
                                 MTC = ifelse(alpha.type == "adjusted", MTC, "not applicable"),
                                 alpha.nominal=alpha.nominal,
                                 target.by = target.by,
                                 delta = delta,
                                 raw = raw))
  return(output)
}
# NOTE(review): the two lines below are website boilerplate (apparently pasted
# from a documentation-hosting site) and are not valid R code; commented out so
# the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.