In RGLab/MASTdata: This is the data used for the MAST paper

We look at GO enrichment of DE genes using MAST, with and without CDR and compare it to GO enrichment of DE genes using other RNASeq tools.

We compare the ratio of immune specific to non-specific enriched modules.

We'll use GO.db and org.Hs.eg.db for the MAIT cells.

We'll use the the hypergeometric for testing enrichment.

We load the results and databases.

library(data.table)
library(plyr)
library(GO.db)
library(stringr)
library(grid)
library(gridExtra)
library(ggplot2)
library(mHG)
library(org.Hs.eg.db)
library(org.Mm.eg.db)
#diffex = readRDS("../inst/extdata/method_comp_logfc_qvalue.rds")
diffex = readRDS("../inst/extdata/comb_fdr_logfc.rds")

g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)}

goterms = select(org.Hs.eg.db,diffex[,primerid],"GO","ENTREZID")
setDT(goterms)
goterms = goterms[ONTOLOGY%in%"BP"]
#only experimental evidence
goterms = goterms[EVIDENCE%in%c("EXP","IDA","IPI","IMP","IGI","IEP")]
unique.go = goterms[,unique(ENTREZID),.(GO)]
terms = select(GO.db,columns = c("GOID","TERM"),unique(goterms$GO))
setnames(unique.go,"GO","GOID")
setnames(unique.go,"V1","ENTREZID")
setDT(terms)
terms = unique(terms)
unique.go = unique(unique.go)
unique.go = merge(unique.go,terms,by="GOID")
unique.go[,N:=length(ENTREZID),.(GOID)]
offspring = GOBPOFFSPRING[["GO:0002376"]]

setnames(diffex,"primerid","ENTREZID")
thresholds = c(0.01,0.05,0.1)
prob_thresh=0.01
NN = length(unique(unique.go$ENTREZID))
methods = c("zlm_fdr","zlm_ng_fdr","zlm_fix_fdr","zlm_fix_ng_fdr","limma_fdr","limma_fdr","limma_ng_fdr","deseq_fdr","deseq_ng_fdr","edger_fdr","edger_ng_fdr","scde_fdr")
loops = expand.grid(thresholds,methods)
thresh=0.05

meth="zlm_ng_fdr"
results = apply(loops,1,function(looprow){
  thresh = looprow[1]
  meth = as.character(looprow[2])
  fc = gsub("fdr","log2fc",meth)
  res = unique.go[ENTREZID%in%diffex[get(meth)<thresh&abs(get(fc))>log2(1.25),ENTREZID]][,.(prob = 1-phyper(q=length(ENTREZID),m = N, n = NN-N,k = length(diffex[get(meth)<thresh,ENTREZID])),TERM,N,hit = length(ENTREZID)),.(GOID)]
 nr.imm = length(unique(res)[GOID%in%offspring&p.adjust(prob,"fdr")<prob_thresh,GOID]) 
nr.tot = length(unique(res)[p.adjust(prob,"fdr")<prob_thresh,GOID])
data.frame(thresh,meth,nr.imm,nr.tot,nr.imm/nr.tot)
})

Plot of the rate of detection of enriched immune GO terms vs FDR threshold for DE genes across different methods.

results_r = ldply(results)
setDT(results_r)
results_r[,meth:=gsub("zlm_fix","zlmfix",meth)]
results_r[,rate := nr.imm/nr.tot]
results_r[,thresh:=as.numeric(as.character(thresh))]
results_r[,method_global := str_split_fixed(meth,"_",3)[,1]]
results_r[,withng := factor(str_split_fixed(meth,"_",3)[,2],labels=c("No CDR control","CDR control"))]
p1 = ggplot(results_r[!(method_global%in%"zlmfix")])+aes(x=thresh,y=rate,color=method_global)+geom_point()+geom_line()+scale_x_continuous("FDR Threshold")+scale_y_continuous("Proportion of Immune\nSpecific GO Modules")+facet_wrap(~withng)+scale_color_brewer("Method",palette = 2,type="qual",limits=c("limma","zlm","deseq","edger","scde"),labels=c("Limma","MAST","DESeq","EdgeR","SCDE"))+theme_gray()
leg = g_legend(p1)
p1

mDC

diffex.mdc = readRDS("../inst/extdata/method_comp_logfc_qvalue_mDC.rds")
setnames(diffex.mdc,"primerid","SYMBOL")
diffex.mdc$SYMBOL=gsub("^(.)","\\U\\1",tolower(diffex.mdc$SYMBOL),perl=TRUE)
goterms = select(org.Mm.eg.db,diffex.mdc[,SYMBOL],"GO","SYMBOL")
setDT(goterms)
goterms = goterms[ONTOLOGY%in%"BP"]
#only experimental evidence
goterms = goterms[EVIDENCE%in%c("EXP","IDA","IPI","IMP","IGI","IEP")]
unique.go = goterms[,unique(SYMBOL),.(GO)]
terms = select(GO.db,columns = c("GOID","TERM"),goterms$GO)
setnames(unique.go,"GO","GOID")
setnames(unique.go,"V1","SYMBOL")
setDT(terms)
terms = unique(terms)
unique.go = unique(unique.go)
unique.go = merge(unique.go,terms,by="GOID")
unique.go[,N:=length(SYMBOL),.(GOID)]
NN=length(unique(unique.go[,SYMBOL]))

GO enrichment using hypergeometric for mDC.

thresholds = c(0.01,0.05,0.1)
prob_thresh=0.01
methods = c("fdr_lma_2_wo","fdr_lma_2_ng","fdr_zlm_2_wo","fdr_zlm_2_ng","fdr_lma_4_wo","fdr_lma_4_ng","fdr_zlm_4_wo","fdr_zlm_4_ng","fdr_lma_6_wo","fdr_lma_6_ng","fdr_zlm_6_wo","fdr_zlm_6_ng")
loops = expand.grid(thresholds,methods)
results.mdc = apply(loops,1,function(looprow){
  thresh = looprow[1]
  meth = as.character(looprow[2])
  fc = gsub("fdr","logFC",meth)
    res = unique.go[SYMBOL%in%diffex.mdc[get(meth)<thresh&abs(get(fc))>log2(1.25),SYMBOL]][,.(prob = 1-phyper(q=length(SYMBOL),m = N, n = NN-N,k = length(diffex.mdc[get(meth)<thresh,SYMBOL])),TERM,N,hit = length(SYMBOL)),.(GOID)]

nr.imm = length(unique(res)[GOID%in%offspring&p.adjust(prob,"fdr")<prob_thresh,GOID]) 

nr.tot = length(unique(res)[p.adjust(prob,"fdr")<prob_thresh,GOID])
data.frame(thresh,meth,nr.imm,nr.tot,nr.imm/nr.tot)
})

results.mdc_r = ldply(results.mdc)
setDT(results.mdc_r)
results.mdc_r[,thresh:=as.numeric(as.character(thresh))]
results.mdc_r[,method_global := str_split_fixed(meth,"_",4)[,2]]
results.mdc_r[,time := str_split_fixed(meth,"_",4)[,3]]
results.mdc_r[,withng := factor(str_split_fixed(meth,"_",4)[,4],labels=c("No CDR control","CDR control"))]
results.mdc_r = results.mdc_r[,rate := mean(nr.imm)/(nr.tot),]
results.mdc_r=results.mdc_r[,.(rate = mean(rate)),.(withng,method_global,thresh)]
p2 = ggplot(results.mdc_r)+aes(x=thresh,y=rate,color=method_global)+geom_point()+geom_line()+scale_x_continuous("FDR Threshold")+scale_y_continuous("Proportion of Immune\nSpecific GO Modules")+facet_grid(~withng)+scale_color_brewer("Method",palette = 2,type="qual",limits=c("lma","zlm"),labels=c("Limma","MAST"))+theme_gray()

library(grid)
p3 = grid.arrange(arrangeGrob(p1+theme(legend.position="none"),p2+theme(legend.position="none"),nrow=2),leg,nrow=1,widths=c(10,2))
pdf(file="../inst/extdata/output/SuppFig_10.pdf",width=6,height=8)
p3
grid.draw(p3)
grid.text(c("A","B"),x=c(0.01,0.01),y=c(0.95,0.55))
dev.off()

pdf(file="../inst/extdata/output/SuppFig_10_a.pdf",width=6,height=4)
p1
dev.off()

GSEA of GO BP Modules in mDC

library(RamiGO)
data("c5.go.mapping")
gsea_go = readRDS("../inst/extdata/gsea_result_mDC.rds")
gomap = melt(sapply(gsea_go$module,function(x)c5.go.mapping[match(x,c5.go.mapping[,1]),2]))
gomap = cbind(rownames(gomap),gomap)
colnames(gomap) =c("TERM","GOID")
setDT(gomap)
setnames(gsea_go,"module","TERM")
gsea_go = merge(gsea_go_ids,gsea_go,by="TERM")
setnames(gsea_go,gsub("zlm_fix","zlmfix",colnames(gsea_go)))
gsea_go = melt(gsea_go,id=c("TERM","GOID"))
gsea_go = cbind(gsea_go,data.table((gsea_go[,str_split_fixed(variable,"_",5)])))
setnames(gsea_go,c("V1","V2","V3","V4","V5"),c("V1","Time","signif","Method","ng"))
gsea_go[,V1:=NULL]
ggplot(ldply(lapply(c(0.01,0.05,0.1),function(x){
cbind(gsea_go[signif%in%"FDR",.(sum(GOID%in%offspring&value<substitute(eval(x)))/sum(value<substitute(eval(x) ))),.(Method,ng)],thresh=x)[,ng:=factor(ng,levels=c("","ng"),labels=c("No CDR","With CDR"))]
})))+aes(x=thresh,y=V1,color=Method)+facet_wrap(~ng)+geom_line()+geom_point()+scale_x_continuous("FDR Threshold")+scale_color_brewer("Method",palette = 2,type="qual",limits=c("limma","zlm","zlmfix"),labels=c("Limma","MAST","MAST (fixed threshold)"))+scale_y_continuous("Proportion of Enriched\nImmune Specific GO BP Modules")

GSEA of GO BP Modules Enriched in MAIT

gsea_go = readRDS("../inst/extdata/gsea_result.rds")
gomap = melt(sapply(gsea_go$module,function(x)c5.go.mapping[match(x,c5.go.mapping[,1]),2]))
gomap = cbind(rownames(gomap),gomap)
colnames(gomap) =c("TERM","GOID")
setDT(gomap)
setnames(gsea_go,"module","TERM")
gsea_go = merge(gsea_go_ids,gsea_go,by="TERM")
setnames(gsea_go,gsub("zlm_fix","zlmfix",colnames(gsea_go)))
gsea_go = melt(gsea_go,id=c("TERM","GOID"))
gsea_go = cbind(gsea_go,data.table((gsea_go[,str_split_fixed(variable,"_",3)])))
setnames(gsea_go,c("V1","V2","V3"),c("var","Method","ng"))
ggplot(ldply(lapply(c(0.01,0.05,0.1),function(x){
cbind(gsea_go[var%in%"FDR",.({foo=sum(GOID%in%offspring&value<substitute(eval(x)))/sum(value<substitute(eval(x) ));ifelse(is.nan(foo),0,foo)}),.(Method,ng)],thresh=x)[,ng:=factor(ng,levels=c("","ng"),labels=c("No CDR","With CDR"))]
})))+aes(x=thresh,y=V1,color=Method)+facet_wrap(~ng)+geom_line(alpha=0.7)+geom_point(alpha=0.7)+scale_x_continuous("FDR Threshold")+scale_color_brewer("Method",palette = 2,type="qual")+scale_y_continuous("Proportion of Enriched Immune\nSpecific GO BP Modules")

Simulation study

library(abind)
library(arm)
library(data.table)
library(ggplot2)
library(grid)
library(devtools)
#install_github("RGlab/MAST")
library(MAST)
library(mvtnorm)
library(GSEABase)
library(limma)
library(MCMCpack)
library(pheatmap)
library(GGally)
library(RColorBrewer)
library(reshape2)
library(AnnotationDbi)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

knitr::opts_chunk$set( fig.width = 14, 
                       echo      = FALSE, 
                       warning   = FALSE, 
                       message   = FALSE )

min_gene_in_module <- 5
FCTHRESHOLD        <- log2(1.5)
nrep           <- 10
options("mc.cores"=detectCores())

For portability, all the time consuming steps are not evaluated.

Please set ALL the eval flag in the knitr chunks to TRUE before running the code.

set.seed(123123)

#install.packages("MASTDataPackage",repos="http://gull.fhcrc.org")
library(MASTDataPackage)
data(MASTDataPackage)
sca_mait_original <- sca_mait
sca_mait          <- subset(sca_mait,nGeneOn>4000)
eid               <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,
                           keys=fData(sca_mait)$entrez,
                           keytype="GENEID",
                           columns=c("GENEID","TXNAME"))
ueid              <- unique(na.omit(eid)$GENEID)
sca_mait          <- sca_mait[,fData(sca_mait)$entrez%in%ueid]

MAST:::layer(sca_mait)   <- "tpm"
sca_mait_filtered <- sca_mait
unlogged          <- 2^exprs(sca_mait)-1
tt <- thresholdSCRNACountMatrix( unlogged              ,
                                 nbins          = 20   , 
                                 min_per_bin    = 30   ,
                                 return_log     = TRUE
                               )
#par(mfrow=c(6,5))
#plot(tt)

fd_filt                   <- fData(sca_mait)
fd_filt$cutpoint          <- tt$cutpoint[as.character(tt$bin)]

esetAll<-abind(sca_mait@.Data, atpm=tt$counts_threshold, along=3)

sca_mait_filtered<-FromMatrix('SingleCellAssay',esetAll,
                                cData(sca_mait_filtered),fd_filt)
dimnames(sca_mait_filtered)[[3]]<-dimnames(esetAll)[[3]]
MAST:::layer(sca_mait_filtered)<-"atpm"
sca_mait_filtered         <- sca_mait_filtered[,freq(sca_mait_filtered)>0.2]
sca                       <- sca_mait_filtered
MAST:::layer(sca)<-"atpm"

fit ZLM

MAST:::layer(sca)  <- 'atpm'
# without ngeneson
cData(sca)$condition<-factor(cData(sca)$condition,levels=c("Unstim","Stim"))
zlm_cond <- zlm.SingleCellAssay( ~ condition , sca,  
                                 method='bayesglm', ebayes = TRUE, 
                                 ebayesControl =list(method = "MLE", model = "H1") ,  
                                 hook=deviance_residuals_hook )
lrt_cond  <- lrTest( zlm_cond,CoefficientHypothesis("conditionStim"))
wald_cond <- waldTest(zlm_cond,CoefficientHypothesis("conditionStim"))
logFC_no  <- getLogFC(zlm_cond,c(1,0),c(1,1))
zlm_resid <- do.call(rbind,zlm_cond@hookOut)

# with ngeneson
zlm_cond_sh  <- zlm.SingleCellAssay( ~ condition + cngeneson, sca,  
                                     method='bayesglm', ebayes = TRUE, 
                                     ebayesControl =list(method = "MLE", model = "H1"), 
                                     hook=deviance_residuals_hook )
lrt_cond_sh  <- lrTest(  zlm_cond_sh,CoefficientHypothesis("conditionStim"))
wald_cond_sh <- waldTest(zlm_cond_sh,CoefficientHypothesis("conditionStim"))
logFC_mait   <- getLogFC(zlm_cond_sh,c(1,0,0),c(1,1,0))
zlm_resid_ng <- do.call(rbind,zlm_cond_sh@hookOut)

res_gene          <- data.table(melt(lrt_cond))
res_gene$primerid <- as.character(res_gene$primerid)
res_gene          <- merge(res_gene, fData(sca), by="primerid")
res_gene_hurdle   <- res_gene[metric=="Pr(>Chisq)" & test.type=="hurdle"]
res_gene_hurdle   <- res_gene_hurdle[,adj:=p.adjust(value,"fdr")]
lfc               <- getLogFC(zlm_cond)[contrast=="conditionStim"]
setkey(lfc,primerid)
setkey(res_gene_hurdle,primerid)
res_gene_hurdle   <- merge(lfc,res_gene_hurdle)

res_gene_ng       <- data.table(melt(lrt_cond_sh))
res_gene_ng$primerid <- as.character(res_gene_ng$primerid)
res_gene_ng       <- merge(res_gene_ng, fData(sca), by="primerid")
res_gene_ng_hurdle<- res_gene_ng[metric=="Pr(>Chisq)" & test.type=="hurdle"]
res_gene_ng_hurdle<-res_gene_ng_hurdle[,adj:=p.adjust(value,"fdr")]
lfc_ng               <- getLogFC(zlm_cond_sh)[contrast=="conditionStim"]
setkey(lfc_ng,primerid)
setkey(res_gene_ng_hurdle,primerid)
res_gene_ng_hurdle<-merge(lfc_ng,res_gene_ng_hurdle)

saveRDS( zlm_cond          ,    file = "zlm_cond.rds"   )
saveRDS( zlm_cond_sh       ,    file = "zlm_cond_sh.rds"   )
saveRDS( res_gene_hurdle   ,    file = "res_gene_hurdle.rds"   )
saveRDS( res_gene_ng_hurdle,    file = "res_gene_ng_hurdle.rds"   )

MAST:::layer(sca)  <- 'tpm'

read fit ZLM result

 zlm_cond          <-readRDS( file = "../inst/extdata/zlm_cond.rds"   )
 zlm_cond_sh       <-readRDS( file = "../inst/extdata/zlm_cond_sh.rds"   )
 res_gene_hurdle   <-readRDS( file = "../inst/extdata/res_gene_hurdle.rds"   )
 res_gene_ng_hurdle<-readRDS( file = "../inst/extdata/res_gene_ng_hurdle.rds"   )

The estimated parameters in MAIT

genes_signif<- res_gene_ng_hurdle[adj<0.01&abs(logFC)>FCTHRESHOLD][order(logFC)]#[1:100]
genes_notsig<- res_gene_ng_hurdle[adj>0.5 &abs(logFC)<FCTHRESHOLD][order(logFC)]#[1:900]

cc_sig_ng   <- coef(zlm_cond_sh,"C")[c(genes_signif$primerid,genes_notsig$primerid),]
cd_sig_ng   <- coef(zlm_cond_sh,"D")[c(genes_signif$primerid,genes_notsig$primerid),]

par(mfrow=c(6,2))
hist(cc_sig_ng[genes_signif$primerid,   "(Intercept)"]  ,main="signif  cont base")
hist(cc_sig_ng[genes_notsig$primerid,   "(Intercept)"]  ,main="not sig cont base")
hist(cc_sig_ng[genes_signif$primerid,   "conditionStim"],main="signif  cont stim")
hist(cc_sig_ng[genes_notsig$primerid,   "conditionStim"],main="not sig cont stim")
hist(cc_sig_ng[genes_signif$primerid,   "cngeneson"]    ,main="signif  cont cdr ")
hist(cc_sig_ng[genes_notsig$primerid,   "cngeneson"]    ,main="not sig cont cdr ")
hist(cd_sig_ng[genes_signif$primerid,   "(Intercept)"]  ,main="signif  disc base")
hist(cd_sig_ng[genes_notsig$primerid,   "(Intercept)"]  ,main="not sig disc base")
hist(cd_sig_ng[genes_signif$primerid,   "conditionStim"],main="signif  disc stim")
hist(cd_sig_ng[genes_notsig$primerid,   "conditionStim"],main="not sig disc stim")
hist(cd_sig_ng[genes_signif$primerid,   "cngeneson"]    ,main="signif  disc cdr ")
hist(cd_sig_ng[genes_notsig$primerid,   "cngeneson"]    ,main="not sig disc cdr ")

The estimated stimulation and cdr coefficient in MAIT

par(mfrow=c(1,2))
plot(zlm_cond_sh@coefD[,"conditionStim"],zlm_cond_sh@coefD[,"cngeneson"], main="D", xlab="stim",ylab="CDR")
plot(zlm_cond_sh@coefC[,"conditionStim"],zlm_cond_sh@coefC[,"cngeneson"], main="C", xlab="stim",ylab="CDR")

par(mfrow=c(1,2))
pairs(data.frame(zlm_cond_sh@coefD[genes_signif$primerid,"conditionStim"],zlm_cond_sh@coefD[genes_signif$primerid,"cngeneson"],zlm_cond_sh@coefC[genes_signif$primerid,"conditionStim"],zlm_cond_sh@coefC[genes_signif$primerid,"cngeneson"]))

par(mfrow=c(1,2))
pairs(data.frame(zlm_cond_sh@coefD[genes_notsig$primerid,"conditionStim"],zlm_cond_sh@coefD[genes_notsig$primerid,"cngeneson"],zlm_cond_sh@coefC[genes_notsig$primerid,"conditionStim"],zlm_cond_sh@coefC[genes_notsig$primerid,"cngeneson"]))

N  <- 200    # number of samples
N0 <- 100    # number of samples in group 0
N1 <- 100    # number of samples in group 1
P  <- 2500   # number of features
P0 <- 100    # number of DE features
P1 <- 2400   # number of not DE features

mait_cdr   <- rowMeans(exprs(sca)[,c(genes_signif$primerid,genes_notsig$primerid)]>0)
mait_cdr_us<- mait_cdr[cData(sca)$condition=="Unstim"]
mait_cdr_st<- mait_cdr[cData(sca)$condition=="Stim"]

ncdr    <- 3
mb_u    <- c(0.2,0.3,0.4)
sb_u    <- 20
mb_s    <- c(0.6,0.5,0.4)
sb_s    <- 20
rho     <- c(0,0.3,0.5,0.8)
nro     <- length(rho)
s       <- c(rep(0,N0),rep(1,N1))

th_c    <- 5
om_c    <- 2

th_d    <- 0
om_d    <- 1

nu_s_c  <- 1
nu_u_c  <- 0
tau_s_c <- 2
tau_u_c <- 1

nu_s_d  <- 3
nu_u_d  <- 0
tau_s_d <- 1 
tau_u_d <- 0.3 

rep_list_no      <-vector("list",nrep)
rep_list_cd      <-vector("list",nrep)
rep_list_limma_no<-vector("list",nrep)
rep_list_limma_cd<-vector("list",nrep)

for(ri in 1:nrep){
  cat("replication: ",ri,"\n")
  cdr_list <-vector("list",ncdr)
  names(cdr_list)<- paste("x",mb_u,mb_s,sep="_")
  for(i in 1:3){
    cdr_list[[i]]<-c(rbeta(N0,mb_u[i]*sb_u,(1-mb_u[i])*sb_u),rbeta(N1,mb_s[i]*sb_s,(1-mb_s[i])*sb_s))
  }

  sigma_d<- runif(P,0,1)
  sdnosh <- log(sqrt(zlm_cond_sh@dispersionNoshrink[c(genes_signif$primerid,genes_notsig$primerid),"C"]))

  varnosh <- zlm_cond_sh@dispersionNoshrink[c(genes_signif$primerid,genes_notsig$primerid),"C"]
  a0=(zlm_cond_sh@priorDOF)/2
  b0=(zlm_cond_sh@priorVar)*a0
  # pdf("dispersion.pdf",width=12,heigh=4)
  #   par(mfrow=c(1,3))
  #   x<-seq(0,20,by=1)
  #   hist(varnosh,probability=T)
  #   curve(dgamma(x,shape=3,rate=0.7),add=TRUE,col="blue")
  #   curve(dinvgamma(x,shape=a0,b0),add=TRUE,col="red")
  #   hist(1/rgamma(100,shape=a0,b0),probability=T,main="zlm estimate")
  #   hist(rgamma(100,shape=3,rate=0.7),probability=T,main="gamma(3,0.7)")
  # dev.off()

  sigma_c <-sqrt( 1/rgamma(P,shape=a0,rate=b0))
  mm_d    <- c( nu_s_d, nu_u_d)[c(rep(1,P0),rep(2,P1))]
  tt_d    <- c(tau_s_d,tau_u_d)[c(rep(1,P0),rep(2,P1))]
  mm_c    <- c( nu_s_c, nu_u_c)[c(rep(1,P0),rep(2,P1))]
  tt_c    <- c(tau_s_c,tau_u_c)[c(rep(1,P0),rep(2,P1))]

  coefs_c<- array(NA,c(P,2,nro))
  coefs_d<- array(NA,c(P,2,nro))
  dimnames(coefs_c)<-list(NULL,c("Stim","CDR"),paste("rho",rho,sep="_"))
  dimnames(coefs_d)<-list(NULL,c("Stim","CDR"),paste("rho",rho,sep="_"))
  for( j in 1:nro){
    Rd <- matrix(c(1,-rho[j],-rho[j],1),2,2)
    Rc <- matrix(c(1,-rho[j],-rho[j],1),2,2)
    for(k in 1:P){
      Dc<-diag(c(tt_c[k],10))
      Dd<-diag(c(tt_d[k],10))
      coefs_d[k,,j]=rmvnorm(1,c(mm_d[k],0),Dd%*%Rd%*%Dd)
      coefs_c[k,,j]=rmvnorm(1,c(mm_c[k],0),Dc%*%Rc%*%Dc)
    }
  }

  stim_d  <- coefs_d[,1,1]
  stim_c  <- coefs_c[,1,1]
  mu_du   <- rnorm( P, th_d, om_d )
  mu_ds   <- (mu_du + stim_d)
  temp    <- mu_du[81:100]
  mu_du[81:100]<- mu_ds[81:100]
  mu_ds[81:100]<- temp
  mu_c    <- rnorm( P, th_c, om_c )
  mu_mat  <- matrix( mu_c, nrow=P, ncol=N )
  stim_mat   <- t(t(matrix(stim_c,nrow=P,ncol=N))*s)
  mustim_mat <- mu_mat+stim_mat
  temp2                     <- mustim_mat[81:100,1:N0 ]
  mustim_mat[81:100,1:N0 ]  <- mustim_mat[81:100,(N0+1):(N0+N1)]
  mustim_mat[81:100,(N0+1):(N0+N1)]<- temp2
  cid    <- matrix(1,P,N)

  disc   <- matrix(NA,P,N)
  dt     <- matrix(NA,P,N)
  colnames(dt)<-paste("cell_",1:N,sep="")

  for(i in 1:P){
    disc[i,]    <- c(rbinom( N0, 1, cid[i,     1:N0     ]*invlogit( mu_du[i] ) ) ,
                     rbinom( N1, 1, cid[i,(N0+1):(N0+N1)]*invlogit( mu_ds[i] ) ) )
  }
  for(i in 1:N){
      dt[,i]    <- disc[,i] * rnorm( P, mustim_mat[,i] , sigma_c ) #sigma_c )
  }

  dt_cd=data.table(mb_u=c(NA,rep(mb_u,nro)),mb_s=c(NA,rep(mb_s,nro)),
            rho=c(NA,rep(rho,each=ncdr)),dt_idx=seq(1,nro*ncdr+1,by=1))
  dt_cd        <- dt_cd[1:4,] # running conditions with no correlation in stimulation and CDR coef
  nsim         <- nrow(dt_cd)
  dt_list      <- vector("list",nrow(dt_cd))
  dt_list[[1]] <- dt
  for(icc in 2:nsim){
    cat("sim: ",icc,"\n")
    te=matrix(NA,P,N)
    colnames(te)<-paste("cell_",1:N,sep="")
    dt_list[[dt_cd[icc,]$dt_idx]]=te
    disc_cd<- matrix(NA,P,N)
    cdr    <- cdr_list[[paste("x",dt_cd$mb_u[icc],dt_cd$mb_s[icc],sep="_")]]
    #for(j in 1:nro){
      rn           <- paste("rho",dt_cd$rho[icc],sep="_")
    stim_d       <- coefs_d[,1,rn]
    stim_c       <- coefs_c[,1,rn]
    beta_cdr_d   <- coefs_d[,2,rn]
    beta_cdr_c   <- coefs_c[,2,rn]
    mu_du        <- rnorm( P, th_d, om_d )
    mu_ds        <- (mu_du + stim_d)
    temp         <- mu_du[81:100]
    mu_du[81:100]<- mu_ds[81:100]
    mu_ds[81:100]<- temp
    mu_c         <- rnorm( P, th_c, om_c )
    mu_mat       <- matrix( mu_c, nrow=P, ncol=N )
    stim_mat     <- t(t(matrix(stim_c,nrow=P,ncol=N))*s)
    mustim_mat   <- mu_mat+stim_mat
    temp2                     <- mustim_mat[81:100,1:N0 ]
    mustim_mat[81:100,1:N0 ]  <- mustim_mat[81:100,(N0+1):(N0+N1)]
    mustim_mat[81:100,(N0+1):(N0+N1)]<- temp2
    for(i in 1:P){
      disc_cd[i,] <- c(rbinom( N0, 1, cid[i,1:N0 ]*invlogit( mu_du[i] + beta_cdr_d[i]*cdr[1:N0] ) ) ,
                       rbinom( N1, 1, cid[i,(N0+1):(N0+N1)]*invlogit( mu_ds[i] + beta_cdr_d[i]*cdr[(N0+1):(N0+N1)]) ) )
    }
    for(i in 1:N){
        dt_list[[dt_cd[icc,]$dt_idx]][,i] <- disc_cd[,i] * rnorm( P, mustim_mat[,i]+beta_cdr_c*cdr[i], sigma_c ) 

    }
    #}
  }

  zlm_sim_no_rho   <- vector( "list", nsim )  
  lrt_sim_no_rho   <- vector( "list", nsim )
  lfc_sim_no_rho   <- vector( "list", nsim )
  zlm_sim_cd_rho   <- vector( "list", nsim )
  lrt_sim_cd_rho   <- vector( "list", nsim )
  lfc_sim_cd_rho   <- vector( "list", nsim )
  fdr_no_rho       <- vector( "list", nsim ) 
  fdr_cd_rho       <- vector( "list", nsim ) 
  fdr_no_fc_rho    <- vector( "list", nsim ) 
  fdr_cd_fc_rho    <- vector( "list", nsim ) 
  limma_no_rho     <- vector( "list", nsim ) 
  limma_cd_rho     <- vector( "list", nsim ) 
  reslimma_no_rho  <- vector( "list", nsim ) 
  reslimma_cd_rho  <- vector( "list", nsim ) 

  neglog10alpha    <- rev(seq(0,500,length.out=1000))
  alpha            <- 10^(-neglog10alpha)
  truth            <- rep(c(0,1),c(P0,P1))
  truth_mat        <- matrix(truth,P,P)
  for(j in 1:nsim){
    cat("creating sca object\n")
    fdat    <- data.frame(primerid=1:P, entrez=1:P, symbolid=1:P, stringsAsFactors=FALSE)
    esetAll <- abind(tpm=t(dt_list[[j]]), rev.along=0)
    cdr_est <- colMeans(dt_list[[j]]>0)
    if( j >= 2 ) cdr_est <- cdr_list[[paste("x",dt_cd$mb_u[j],dt_cd$mb_s[j],sep="_")]]
    cdat    <- data.frame(wellKey=colnames(dt_list[[j]]), condition=factor(s), 
                ncells=1, ngeneson=cdr_est, cngeneson=cdr_est - mean(cdr_est), 
                stringsAsFactors=FALSE)
    rownames(cdat)<-colnames(dt_list[[j]])
    sca_sim     <- FromMatrix('SingleCellAssay', esetAll, cdat, fdat)
    name_vec    <- (1:P)  
    true_count  <- c(P0,P1)
    cat("fitting zlm\n")
    zlm_sim_no_rho[[j]]  <- zlm.SingleCellAssay( ~ condition , sca_sim,  
                                         method='bayesglm', ebayes = TRUE, 
                                         ebayesControl =list(method = "MLE", model = "H1"), 
                                         hook=deviance_residuals_hook )
    lrt_sim_no_rho[[j]]  <- lrTest(  zlm_sim_no_rho[[j]],CoefficientHypothesis("condition1"))
    lfc_sim_no_rho[[j]]  <- getLogFC(zlm_sim_no_rho[[j]],c(1,0),c(1,1))

    zlm_sim_cd_rho[[j]]  <- zlm.SingleCellAssay( ~ condition + cngeneson, sca_sim,  
                                         method='bayesglm', ebayes = TRUE, 
                                         ebayesControl =list(method = "MLE", model = "H1"), 
                                         hook=deviance_residuals_hook )
    lrt_sim_cd_rho[[j]]  <- lrTest(  zlm_sim_cd_rho[[j]],CoefficientHypothesis("condition1"))
    lfc_sim_cd_rho[[j]]  <- getLogFC(zlm_sim_cd_rho[[j]],c(1,0,0),c(1,1,0))
    #saveRDS( zlm_sim_no_rho[[j]]          ,    file = paste("zlm_no",ri,j,".rds",sep="_")   )
    #saveRDS( zlm_sim_cd_rho[[j]]          ,    file = paste("zlm_cd",ri,j,".rds",sep="_")   )

    fdr_no_rho[[j]]    <-sapply(neglog10alpha,function(x)by(-log10(lrt_sim_no_rho[[j]][,"hurdle","Pr(>Chisq)"])>x,truth,sum)/true_count)
    fdr_cd_rho[[j]]    <-sapply(neglog10alpha,function(x)by(-log10(lrt_sim_cd_rho[[j]][,"hurdle","Pr(>Chisq)"])>x,truth,sum)/true_count)
    dt_pos             <- t(exprs(sca_sim))#dt_list[[j]]
    tpm                <- 2^dt_pos
    count_matrix       <- round( t((t(tpm)/colSums(tpm))*1000000))
    rownames(count_matrix)<-name_vec

    cat("fitting limma\n")
    x                <- count_matrix
    new_set          <- ExpressionSet( assayData = as.matrix(x)+1 )

      # without ngeneson
      design           <- model.matrix(~ condition, data=cData(sca_sim))
      colnames(design) <- make.names(colnames(design))
      cont_matrix      <- makeContrasts( "condition1",
                                          levels = design)
      voom.data        <- voom( new_set, design = design )
      voom.data$genes  <- rownames(x)
      voom.fitlimma    <- lmFit( voom.data, design = design )
      voom.fitbayes    <- contrasts.fit(voom.fitlimma,cont_matrix)
      voom.fitbayes    <- eBayes(voom.fitbayes)
      voom.pvalues     <- voom.fitbayes$p.value
      reslimma_no_rho[[j]] <- topTable( voom.fitbayes, adjust.method="fdr", number=Inf, sort.by="none" )

      # with ngeneson
      design_ng           <- model.matrix(~ condition + cngeneson, data=cData(sca_sim))
      colnames(design_ng) <- make.names( colnames(design_ng) )
      cont_matrix_ng      <- makeContrasts( "condition1",
                                             levels = design_ng )
      voom.data_ng        <- voom( new_set, design = design_ng )
      voom.data_ng$genes  <- rownames(x)
      voom.fitlimma_ng    <- lmFit( voom.data_ng, design = design_ng )
      voom.fitbayes_ng    <- contrasts.fit(voom.fitlimma_ng, cont_matrix_ng)
      voom.fitbayes_ng    <- eBayes(voom.fitbayes_ng)
      reslimma_cd_rho[[j]]<- topTable(voom.fitbayes_ng, adjust.method="fdr", number=Inf, sort.by="none" )

    limma_no_rho[[j]]   <- sapply( neglog10alpha, function(x) by(-log10(reslimma_no_rho[[j]]$P.Value) > x, truth, sum )/true_count )
    limma_cd_rho[[j]]   <- sapply( neglog10alpha, function(x) by(-log10(reslimma_cd_rho[[j]]$P.Value) > x, truth, sum )/true_count )
  }
  rep_list_no[[ri]]      <- fdr_no_rho
  rep_list_cd[[ri]]      <- fdr_cd_rho
  rep_list_limma_no[[ri]]<- limma_no_rho
  rep_list_limma_cd[[ri]]<- limma_cd_rho
}

saveRDS( rep_list_no,          file ="rep_list_no.rds"  )
saveRDS( rep_list_cd,          file ="rep_list_cd.rds"  )
saveRDS( rep_list_limma_no,    file ="rep_list_limma_no.rds"  )
saveRDS( rep_list_limma_cd,    file ="rep_list_limma_cd.rds"  )

# dtcdr<-melt(data.frame(abind(cdr_list,along=2),stimulation=rep(c("Unstim","Stim"),each=N0)))
# dtcdr$variable<-factor(dtcdr$variable,labels=c("strong confounding", "moderate confounding", "no confounding"))
# pdf("cdr_confounding.pdf",width=12,heigh=4)
# ggplot(dtcdr)+geom_density()+aes(x=value,color=stimulation)+facet_grid(.~variable)+xlab("CDR") + scale_colour_brewer(palette="Set1")
# dev.off()

rep_list_no        <- readRDS( file = "../inst/extdata/rep_list_no.rds") 
rep_list_cd        <- readRDS( file = "../inst/extdata/rep_list_cd.rds" ) 
rep_list_limma_no  <- readRDS( file = "../inst/extdata/rep_list_limma_no.rds" ) 
rep_list_limma_cd  <- readRDS( file = "../inst/extdata/rep_list_limma_cd.rds" ) 

# rep_list_no        <- readRDS(file ="rep_list_no.rds"  )
# rep_list_cd        <- readRDS(file ="rep_list_cd.rds"  )
# rep_list_limma_no  <- readRDS(file ="rep_list_limma_no.rds"  )
# rep_list_limma_cd  <- readRDS(file ="rep_list_limma_cd.rds"  )

ROC

conditions=c("no CDR in TRUTH", "strong confounding", "moderate confounding", "no confounding")
dttmp=NULL
for(j in 1:4){
dttmp=rbind(dttmp, 
    rbind(data.frame(t(abind(lapply(rep_list_no,function(x)x[[j]]),along=2))[,c(2,1)],
              model="MAST no CDR", condition=conditions[j]),
      data.frame(t(abind(lapply(rep_list_cd,function(x)x[[j]]),along=2))[,c(2,1)],
            model="MAST with CDR", condition=conditions[j]),
      data.frame(t(abind(lapply(rep_list_limma_no,function(x)x[[j]]),along=2))[,c(2,1)],
            model="limma no CDR", condition=conditions[j]),
      data.frame(t(abind(lapply(rep_list_limma_cd,function(x)x[[j]]),along=2))[,c(2,1)],
            model="limma with CDR", condition=conditions[j])
      )
    )
}
names(dttmp)[1:2]<-c("FP","TP")
dttmp$condition = factor(dttmp$condition)
#pdf("simulation_ROC_with_without_CDR.pdf",width=16,height=4)
ggplot(dttmp)+aes(x=FP,y=TP,group=model,color=model)+facet_grid(.~condition)+geom_abline(intercept=0,slope=1,linetype="dotted")+ stat_quantile(method = "rqss",quantiles = c(0.05, 0.95),linetype="dashed", lambda = 1)+ stat_quantile(method = "rqss",quantiles = c(0.5), lambda = 1)+ylim(0,1)+xlim(0,1)
#dev.off()

RGLab/MASTdata documentation built on May 8, 2019, 5:52 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

RGLab/MASTdata
This is the data used for the MAST paper

In RGLab/MASTdata: This is the data used for the MAST paper

Plot of the rate of detection of enriched immune GO terms vs FDR threshold for DE genes across different methods.

mDC

GO enrichment using hypergeometric for mDC.

GSEA of GO BP Modules in mDC

GSEA of GO BP Modules Enriched in MAIT

Simulation study

For portability, all the time consuming steps are not evaluated.

Please set ALL the eval flag in the knitr chunks to TRUE before running the code.

fit ZLM

read fit ZLM result

The estimated parameters in MAIT

The estimated stimulation and cdr coefficient in MAIT

ROC

R Package Documentation

Browse R Packages

We want your feedback!

RGLab/MASTdata This is the data used for the MAST paper

In RGLab/MASTdata: This is the data used for the MAST paper

Plot of the rate of detection of enriched immune GO terms vs FDR threshold for DE genes across different methods.

mDC

GO enrichment using hypergeometric for mDC.

GSEA of GO BP Modules in mDC

GSEA of GO BP Modules Enriched in MAIT

Simulation study

For portability, all the time consuming steps are not evaluated.

Please set ALL the eval flag in the knitr chunks to TRUE before running the code.

fit ZLM

read fit ZLM result

The estimated parameters in MAIT

The estimated stimulation and cdr coefficient in MAIT

ROC

R Package Documentation

Browse R Packages

We want your feedback!

RGLab/MASTdata
This is the data used for the MAST paper