In arcolombo/junk: qusage: Quantitative Set Analysis for Gene Expression

-- title: "Qusage: Speeding up AggregatePDF in RcppArmadillo" author: "Timothy J. Triche, Jr, Anthony R. Colombo" output: pdf_document: toc: true number_sections: true date: "r format(Sys.time(), '%d %B, %Y')"

SpeedSage Intro

qusage is published software that is slow for large runs, SpeedSage corrects for speed and efficiency at large orders. there is Bottlenecking of Functions Qusage can improve the speed of its algorithm by minimizing the cost of computaiton.

aggregatePDF Function

This test will perform the computations in the apply functions in c++. for makeComparisons and calcVIF, speedSage assumes the Welch's approximation for variances between groups being unequal. Welch's approximation is used for aggregateGeneSet function. Previously the qusage.single/makeComparison in Armadillo/speedSage has a 1.3-1.48X speed up, aggregateGeneSet is the second bottleneck that must be accelerated, it gained milliseconds, and moderate gains. the next bottlenect was the PDF generation and the multiConv function, this attempts to speed up the PDF. the speed gains is calculated by the differences of the sub-routeins called by the parent-routines between respective methods. it is linear.

library(inline)
library(microbenchmark)
library(Rcpp)
library(parallel)
library(speedSage)
library(qusage)
library(ggplot2)
eset<-system.file("extdata","eset.RData",package="speedSage")
load(eset)
labels<-c(rep("t0",134),rep("t1",134))
contrast<-"t1-t0"
colnames(eset)<-c(rep("t0",134),rep("t1",134))
fileISG<-system.file("extdata","c2.cgp.v5.1.symbols.gmt",package="speedSage")
ISG.geneSet<-read.gmt(fileISG)
geneSets<-ISG.geneSet[grepl("DER_IFN_GAMMA_RESPONSE_UP",names(ISG.geneSet))]


#cpp functions
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/sigmaArm.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/sigmaSingle.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/bayesEstimation.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/notbayesEstimation.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/calcVIFarm.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/calcVIFarmalt.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/calcVIFarm_nosdalphaalt.cpp")
sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregategsSumSigma.cpp")
source(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/qusageArm.R")
#sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregatePDF.cpp")


pairVector<-NULL
var.equal<-FALSE
filter.genes<-FALSE
n.points<-2^12
NumSDs<-abs(qt(10^-10,1:250))
NumSDs[NumSDs>750] = 750
armadillo<-TRUE



#setting up aggregateGeneSetArm call objects
results = makeComparisonArm(eset, labels, contrast, pairVector=pairVector,var.equal=var.equal)
nu = floor(min(results$dof,na.rm=T))
defaultAggregate = aggregateGeneSet(results, geneSets, silent=F, n.points=n.points)
geneResults<-results
 Means = geneResults$mean
 COLS = names(Means)

 if(is.vector(geneSets) & !is.list(geneSets)){
    n = deparse(substitute(geneSets))
    geneSets = list(geneSets)
    names(geneSets) = n
  }
  if(is.null(names(geneSets))){names(geneSets) = 1:length(geneSets)}
  geneSets = lapply(geneSets,function(x){
    if(is.numeric(x)){
      if(any(!(x %in% 1:length(COLS)))){stop("Numeric gene set indices out of bounds")}
      return(x)
    }
 which(COLS%in%x)
  })


#myAg<-aggregategsSumSigma(results$SD, results$dof,geneSets) 

 if ( armadillo=="TRUE"){
 SumSigma_MinDof<-aggregategsSumSigma(geneResults$SD,geneResults$dof,geneSets)
  SumSigma<-unlist(SumSigma_MinDof)[1:length(geneSets)]
  MinDof<-unlist(SumSigma_MinDof)[(length(geneSets)+1):length(unlist(SumSigma_MinDof))]
names(SumSigma)<-names(geneSets)
names(MinDof)<-names(geneSets)
 }


 MaxDiff<-pmax(NumSDs[MinDof]*SumSigma,1,na.rm=TRUE)

myPDF<-aggregatePDF(MaxDiff,geneSets,geneResults$sd,n.points,geneResults$dof)

source("/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregateGeneSet.R")
source("/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregateGeneSetArm.R")



defaultAggregate = aggregateGeneSet(results, geneSets, silent=F, n.points=n.points)
myAg<-aggregateGeneSetArm(results, geneSets, silent=F,n.points=n.points, armadillo=TRUE)
defAg<-aggregateGeneSetArm(results, geneSets, silent=F,n.points=n.points, armadillo=FALSE)

singleMb<-microbenchmark(
myAg<-aggregateGeneSetArm(results, geneSets, silent=F,n.points=n.points, armadillo=TRUE),
defaultAggregate = aggregateGeneSet(results, geneSets, silent=F, n.points=n.points),times=200)

#single gene set no speed pu
singleMb  


multiSets<-geneSets
multiSets[2]<-geneSets[1]
multiSets[3]<-geneSets[1]

multiMb<-microbenchmark(
myAg<-aggregateGeneSetArm(results, multiSets, silent=F,n.points=n.points, armadillo=TRUE),
defaultAggregate = aggregateGeneSet(results, multiSets, silent=F, n.points=n.points),times=1000)

multiMb
#the gain is only in terms of microseconds, although additive, it is not noticeable.


#the grouping of the PDF formation is also bottlenecking