inst/doc/bioqc-efficiency.R

## ----setup, include=FALSE-----------------------------------------------------
options(fig_caption=TRUE)
library(knitr)
opts_chunk$set(out.extra='style="display:block; margin: auto"', fig.align="center")

## ----lib, warning=FALSE, message=FALSE, results="hide", include=FALSE---------
library(testthat)
library(BioQC)
library(org.Hs.eg.db) ## to simulate an microarray expression dataset
library(lattice)
library(latticeExtra)
library(gridExtra)
library(gplots)
library(rbenchmark)

pdf.options(family="ArialMT", useDingbats=FALSE)

set.seed(1887)

## list human genes
humanGenes <- keys(revmap(org.Hs.egSYMBOL))

## read tissue-specific gene signatures
gmtFile <- system.file("extdata/exp.tissuemark.affy.roche.symbols.gmt",
                       package="BioQC")
gmt <- readGmt(gmtFile)

## ----time_benchmark, echo=FALSE-----------------------------------------------
randomMatrix <- function(rows=humanGenes, ncol=5) {
  nrow <- length(rows)
  mat <- matrix(rnorm(nrow*ncol),
                nrow=nrow, byrow=TRUE)
  rownames(mat) <- rows
  return(mat)
}
noSamples <- c(1, 5, 10, 20, 50, 100)
noBenchRep <- 100
tmRandomMats <- lapply(noSamples, function(x) randomMatrix(ncol=x))
tissueInds <- sapply(gmt, function(x) match(x$genes, humanGenes))

wmwTestR <- function(matrix, indices, alternative) {
  res <- apply(matrix, 2, function(x) {
    sapply(indices, function(index) {
      sub <- rep(FALSE, length(x))
      sub[index] <- TRUE
      wt <- wilcox.test(x[sub], x[!sub],
                        alternative=alternative,
                        exact=FALSE)
      return(wt$p.value)
      })
    })
  return(res)
  }

## WARNING: very slow (~1-2 hours)
benchmarkFile <- "simulation-benchmark.RData"
if(!file.exists(benchmarkFile)) {
  bioqcRes <- lapply(tmRandomMats, function(mat) {
    bench <- benchmark(wmwTestRes<- wmwTest(mat,
                                            tissueInds,
                                            valType="p.greater",
                                            simplify=TRUE),
                       replications=noBenchRep)
    elapTime <- c("elapsed"=bench$elapsed,
                  "user"=bench$user.self,
                  "sys"=bench$sys.self)/noBenchRep
    res <- list(elapTime=elapTime,
                wmwTestRes=wmwTestRes)
    return(res)
    })
  
  rRes <- lapply(tmRandomMats, function(mat) {
    elapTime <- system.time(wmwTestRes <- wmwTestR(mat, tissueInds, alternative="greater"))
    res <- list(elapTime=elapTime,
                wmwTestRes=wmwTestRes)
    return(res)
    })
  save(bioqcRes, rRes, file=benchmarkFile)
  } else {
    load(benchmarkFile)
 }
getWmwTestRes <- function(x) x$wmwTestRes
rNumRes <- lapply(rRes, getWmwTestRes)
bioqcNumRes <- lapply(bioqcRes, getWmwTestRes)

## ----time_benchmark_identical-------------------------------------------------
expect_equal(bioqcNumRes, rNumRes)

## ----trellis_prepare, echo=FALSE----------------------------------------------
op <- list(layout.widths = list(left.padding = 0, key.ylab.padding = 0.5,
                                ylab.axis.padding = 0.5, axis.right = 0.5, right.padding = 0),
           layout.heights = list(top.padding = 0, bottom.padding = 0,
                                 axis.top = 0, main.key.padding = 0.5, key.axis.padding = 0.5),
           axis.text=list(cex=1.2),
           par.xlab.text=list(cex=1.4),
           par.sub.text=list(cex=1.4),
           add.text=list(cex=1.4),
           par.ylab.text=list(cex=1.4))

## ----time_benchmark_vis, echo=FALSE, fig.width=8, fig.height=4.5, dev='svg', dev.args=list(pointsize=2.5), fig.cap="**Figure 2**: Time benchmark results of BioQC and R implementation of Wilcoxon-Mann-Whitney test. Left panel: elapsed time in seconds (logarithmic Y-axis). Right panel: ratio of elapsed time by two implementations. All results achieved by a single thread on in a RedHat Linux server."----
getTimeRes <- function(x) x$elapTime["elapsed"]
bioqcTimeRes <- sapply(bioqcRes, getTimeRes)
rTimeRes <- sapply(rRes, getTimeRes)
timeRes <- data.frame(NoSample=noSamples,
                      Time=c(bioqcTimeRes, rTimeRes),
                      Method=rep(c("BioQC", "NativeR"), each=length(noSamples)))

timeXY <- xyplot(Time ~ NoSample, group=Method, data=timeRes,  type="b",
                 auto.key=list(columns=2L),
                 xlab="Number of samples", ylab="Time [s]",
                 par.settings=list(superpose.symbol=list(cex=1.25, pch=16, col=c("black", "red")),
                                   superpose.line=list(col=c("black", "red"))),
                 scales=list(tck=c(1,0), alternating=1L,
                             x=list(at=noSamples),
                             y=list(log=2, at=10^c(-2, -1, 0,1,2,3, log10(2000)), labels=c(0.01, 0.1, 1, 10,100,1000, 2000))))
timeFactor <- with(timeRes,
                   tapply(1:nrow(timeRes),
                          list(NoSample),  function(x) {
                            bioqcTime <- subset(timeRes[x,], Method=="BioQC")$Time
                            rTime <- subset(timeRes[x,], Method=="NativeR")$Time
                            rTime/bioqcTime
                            }))
timeFactor.yCeiling <- max(ceiling(timeFactor/500))*500
timeFactorBar <- barchart(timeFactor ~ noSamples, horizontal=FALSE,
                          xlab="Number of samples", ylab="Ratio of elapsed time [R/BioQC]",
                          ylim=c(-20, timeFactor.yCeiling+50), col=colorRampPalette(c("lightblue", "navyblue"))(length(noSamples)),
                          scales=list(tck=c(1, 0), alternating=1L,
                                      y=list(at=seq(0,timeFactor.yCeiling, by=500)),
                                      x=list(at=seq(along=timeFactor), labels=noSamples)))

grid.arrange(timeXY, timeFactorBar, ncol=2)


## ----session_info-------------------------------------------------------------
sessionInfo()

Try the BioQC package in your browser

Any scripts or data that you put into this service are public.

BioQC documentation built on Nov. 8, 2020, 7:16 p.m.