pipeComp_dea.R
In pipeComp: pipeComp pipeline benchmarking framework

## ---- include=FALSE-----------------------------------------------------------
library(BiocStyle)

## -----------------------------------------------------------------------------
suppressPackageStartupMessages({
  library(pipeComp)
  library(S4Vectors)
})

evaluateDEA <- function(dea, truth=NULL, th=c(0.01,0.05,0.1)){
  ## we make sure that the column names of `dea` are standard:
  dea <- pipeComp:::.homogenizeDEA(dea)
  ## within Pipecomp, the truth should be passed along with the `dea` object, so
  ## we retrieve it here:
  if(is.null(truth)) truth <- metadata(dea)$truth
  dea <- cbind(dea, truth[row.names(dea),])
  ## we get rid of genes for which the truth is unknown:
  dea <- dea[!is.na(dea$expected.beta),]
  ## comparison of estimated and expected log2 folchanges:
  res <- c(logFC.pearson=cor(dea$logFC, dea$expected.beta, use = "pairwise"),
           logFC.spearman=cor(dea$logFC, dea$expected.beta, 
                              use = "pairwise", method="spearman"),
           logFC.mad=median(abs(dea$logFC-dea$expected.beta),na.rm=TRUE),
           ntested=sum(!is.na(dea$PValue) & !is.na(dea$FDR)))
  ## evaluation of singificance calls
  names(th) <- th
  res2 <- t(vapply( th, FUN.VALUE=vector(mode="numeric", length=6), 
                    FUN=function(x){
    ## for each significance threshold, calculate the various metrics
    called=sum(dea$FDR<x,na.rm=TRUE)
    P <- sum(dea$isDE)
    TP <- sum(dea$FDR<x & dea$isDE, na.rm=TRUE)
    c( TP=TP, FP=called-TP, TPR=TP/P, PPV=TP/called, FDR=1-TP/called, 
       FPR=(P-TP)/sum(!dea$isDE) )
  }))
  res2 <- cbind(threshold=as.numeric(row.names(res2)), as.data.frame(res2))
  row.names(res2) <- NULL
  list(logFC=res, significance=res2)
}

## -----------------------------------------------------------------------------
# we build a random DEA dataframe and truth:
dea <- data.frame( row.names=paste0("gene",1:10), logFC=rnorm(10) )
dea$PValue <- dea$FDR <- c(2:8/100, 0.2, 0.5, 1)
truth <- data.frame( row.names=paste0("gene",1:10), expected.beta=rnorm(10),
                    isDE=rep(c(TRUE,FALSE,TRUE,FALSE), c(3,1,2,4)) )
evaluateDEA(dea, truth)

## -----------------------------------------------------------------------------
step.filtering <- function(x, filt, minCount=10){
  # we apply the function `filt` to `x` with the parameter `minCount`
  get(filt)(x, minCount=minCount)
}

## -----------------------------------------------------------------------------
step.sva <- function(x, sva.method, k=1){
  # we create the model.matrix:
  mm <- stats::model.matrix(~condition, data=as.data.frame(colData(x)))
  # we apply the function `sva.method` to `x` with the parameter `k`
  get(sva.method)(x, k=k, mm=mm)
}

## -----------------------------------------------------------------------------
step.dea <- function(x, dea.method){
  # run the DEA method, and transform the results into a DataFrame:
  x2 <- DataFrame(get(dea.method)(x,mm))
  # attach the truth to the results:
  metadata(x2)$truth <- metadata(x)$truth
  x2
}

## -----------------------------------------------------------------------------
pip <- PipelineDefinition( list( filtering=step.filtering,
                                          sva=step.sva,
                                          dea=step.dea )
                                    )
pip

## -----------------------------------------------------------------------------
stepFn(pip, step="dea", type="evaluation") <- evaluateDEA
pip

## -----------------------------------------------------------------------------
def.filter <- function(x, minCounts=10){
  library(edgeR)
  minCounts <- as.numeric(minCounts)
  x[filterByExpr(assay(x), model.matrix(~x$condition), min.count=minCounts),]
}

sva.svaseq <- function(x, k, mm){
  k <- as.integer(k)
  if(k==0) return(x)
  library(sva)
  # run SVA
  sv <- svaseq(assay(x), mod=mm, n.sv=k)
  if(sv$n.sv==0) return(x)
  # rename the SVs and add them to the colData of `x`:
  colnames(sv$sv) <- paste0("SV", seq_len(ncol(sv$sv)))
  colData(x) <- cbind(colData(x), sv$sv)
  x
}

dea.edgeR <- function(x, mm){
  library(edgeR)
  dds <- calcNormFactors(DGEList(assay(x)))
  dds <- estimateDisp(dds, mm)
  fit <- glmFit(dds, mm)
  as.data.frame(topTags(glmLRT(fit, "condition"), Inf))
}

# we also define a function not doing anything:
none <- function(x, ...) x

## -----------------------------------------------------------------------------
alternatives <- list(
  filt=c("none","def.filter"),
  minCount=10,
  sva.method=c("none","sva.svaseq"),
  k=1:2,
  dea.method="dea.edgeR"
)

## ---- eval=FALSE--------------------------------------------------------------
#  datasets <- list.files("path/to/datasets", pattern="rds$", full.names=TRUE)
#  names(datasets) <- paste0("dataset",1:2)
#  datasets <- lapply(datasets, readRDS)

## ---- eval=FALSE--------------------------------------------------------------
#  # not run
#  stepFn(pip, type="initiation") <- function(x){
#    if(is.character(x) && length(x)==1) x <- readRDS(x)
#    x
#  }

## ---- eval=FALSE--------------------------------------------------------------
#  res <- runPipeline( datasets, alternatives, pipelineDef=pip, nthreads=4 )
#  
#  lapply(res$evaluation$dea, head)

## -----------------------------------------------------------------------------
data("exampleDEAresults", package = "pipeComp")
res <- exampleDEAresults

## -----------------------------------------------------------------------------
plotElapsed(res, agg.by="sva.method")
evalHeatmap( res, what=c("TPR","FDR","logFC.pearson"), 
             agg.by=c("sva.method", "dea.method"), row_split = "sva.method" )

## -----------------------------------------------------------------------------
evalHeatmap( res, what=c("TPR","FDR"), agg.by=c("sva.method", "dea.method"), 
             row_split="sva.method", filterExpr=threshold==0.05 )

## ---- fig.width=9, fig.height=4-----------------------------------------------
library(ggplot2)
dea_evalPlot_curve(res, agg.by=c("sva.method","dea.method"), 
                   colourBy="sva.method", shapeBy="dea.method")
dea_evalPlot_curve(res, agg.by=c("sva.method")) + 
  ggtitle("SVA methods, averaging across DEA methods")

## ---- fig.height=7, fig.width=5-----------------------------------------------
evalHeatmap(res, what=c("TPR","FDR"), agg.by=c("sva.method","k"), 
                 show_column_names=TRUE, anno_legend = FALSE, 
                 row_split="sva.method", filterExpr=threshold==0.05 )