Edgar Steiger 2018
work in progress
This document shows and explains how to use the dogss package and how to reproduce Figures 14 and 15 from our paper Sparse-Group Bayesian Feature Selection Using Expectation Propagation for Signal Recovery and Network Reconstruction.
First we need to load some packages that are required for comparisons and plotting (please install if not available on your machine):
library(dogss) # our method for sparse-group Bayesian feature selection with EP
library(glmnet) # standard lasso
library(gglasso) # group lasso
library(SGL) # sparse-group lasso
library(MBSGS) # Bayesian feature selection with Gibbs sampling
library(ggplot2) # for nice plots
library(ggthemes) # for even nicer plots
library(grid); library(gridExtra) # to arrange plots pleasantly
library(reshape2) # to melt data into "tidy" long-format
library(DescTools) # for area computations (AUROC, AUPR)
library("foreach")
library("doParallel")
library(parallel)
Furthermore we need to load three R files with additional code:
source("../auxiliary_rfunctions/my_cvSGL.R") # proper cross validation for SGL package
source("../auxiliary_rfunctions/my_theme.R") # functions to adjust ggplots
Finally, we provide all of the results on our simulated data to reconstruct the plots from the publication. If you wish to re-do all of the simulations/calculations, change the following parameter to TRUE
(only do this if you have access to multiple cores):
selfcompute <- FALSE
B <- 100 # number of simulations
ncores <- 50 # number of cores used for parallelization
nobs_train <- c(100, 300, 500)
geneexpr <- as.matrix(read.table(file="/project/dogss/datasets/dream5/net3_expression_data.tsv", header=TRUE))
tfs <- read.table(file="/project/dogss/datasets/dream5/net3_transcription_factors.tsv", colClasses="character")$V1
truth <- read.table(file="/project/dogss/datasets/dream5/DREAM5_NetworkInference_GoldStandard_Network3", header=FALSE, colClasses=c("character", "character", "integer"))
truth <- truth[truth$V3==1, c(1,2)]
load("/project/dogss/datasets/dream5/G_firstrun.RData")
load("/project/dogss/datasets/dream5/wisdomG.RData")
#####
# net3_expression_data <- read.delim("/run/user/1000/gvfs/sftp:host=geniux.molgen.mpg.de,user=steige_e/project/dogss/datasets/dream5/net3_expression_data.tsv")
# geneexpr <- net3_expression_data
# net3_transcription_factors <- read.table("/run/user/1000/gvfs/sftp:host=geniux.molgen.mpg.de,user=steige_e/project/dogss/datasets/dream5/net3_transcription_factors.tsv", quote="\"", comment.char="", stringsAsFactors=FALSE)
# tfs <- net3_transcription_factors$V1
# DREAM5_NetworkInference_GoldStandard_Network3 <- read.delim("/run/user/1000/gvfs/sftp:host=geniux.molgen.mpg.de,user=steige_e/project/dogss/datasets/dream5/DREAM5_NetworkInference_GoldStandard_Network3", header=FALSE, stringsAsFactors=FALSE)
# truth <- DREAM5_NetworkInference_GoldStandard_Network3[DREAM5_NetworkInference_GoldStandard_Network3$V3==1, c(1,2)]
# load("/run/user/1000/gvfs/sftp:host=geniux.molgen.mpg.de,user=steige_e/project/dogss/datasets/dream5/G_firstrun.RData")
# load("/run/user/1000/gvfs/sftp:host=geniux.molgen.mpg.de,user=steige_e/project/dogss/datasets/dream5/wisdomG.RData")
geneexpr_center <- scale(geneexpr, scale=FALSE)
geneexpr_scale <- scale(geneexpr)
geneexpr_center_full <- geneexpr_center
geneexpr_scale_full <- geneexpr_scale
myG <- as.integer(Gold)
names(myG) <- tfs
genes <- colnames(geneexpr)
wisdomG <- moduleG[tfs]
names(wisdomG) <- tfs
myrandomG <- sample(myG, length(myG))
names(myrandomG) <- tfs
myclusterG <- kmeans(t(geneexpr_scale_full[, tfs]), centers=length(unique(myG)))$cluster
wisdomrandomG <- sample(wisdomG, length(wisdomG))
names(wisdomrandomG) <- tfs
wisdomclusterG <- kmeans(t(geneexpr_scale_full[, tfs]), centers=length(unique(wisdomG)))$cluster
Gs <- cbind(myG, myclusterG, myrandomG, wisdomG, wisdomclusterG, wisdomrandomG)
nfeats <- dim(geneexpr)[2]
nobs <- dim(geneexpr)[1]
ntfs <- length(tfs)
neffs <- dim(truth)[1]
truth_matrix <- matrix(0, nrow=nfeats, ncol=nfeats, dimnames=list(genes1=colnames(geneexpr), genes2=colnames(geneexpr)))
for (i in 1:neffs) {
truth_matrix[truth[i, 1], truth[i, 2]] <- 1
truth_matrix[truth[i, 2], truth[i, 1]] <- 1
}
neffs <- sum(truth_matrix)/2
for (j in seq(nobs_train)) {
cat("\n nobs: ", nobs_train[j], "\n")
for (run in 1:10) {
cat("\n run: ", run, "\n")
train <- sample(seq(nobs), nobs_train[j])
test <- seq(nobs)[-train]
geneexpr_center_train <- geneexpr_center_full[train, ]
geneexpr_center_test <- geneexpr_center_full[test, ]
geneexpr_scale_train <- geneexpr_scale_full[train, ]
geneexpr_scale_test <- geneexpr_scale_full[test, ]
save(train, test, file=paste0("/project/dogss/tests_thesis/03_dream5/train_test_run_", run, "_nobs_", nobs_train[j], "_dream5.RData"))
ncl <- detectCores()
ncl <- min(ncl, ncores)
myCluster <- makeCluster(ncl, outfile="/project/dogss/tests_thesis/dream5.txt")
registerDoParallel(myCluster)
results_dream5 <- foreach (g=genes, .packages=c("dogss2", "SGL", "glmnet", "gglasso", "Matrix")) %dopar% { # "MSGS"
cat("\n", g, "\n")
Y <- as.vector(geneexpr_center_train[, g])
scores <- vector("list", 6)
results <- vector("list", 6)
for (grouping in 1:6) {
if (!(g %in% tfs)) {
index <- Gs[, grouping]
x <- geneexpr_scale_train[, tfs]
} else {
index <- Gs[!(tfs %in% g), grouping]
x <- geneexpr_scale_train[, tfs][, !(tfs %in% g)]
}
groups <- unique(index)
ngroups <- length(groups)
nG <- ngroups
nngroups <- sapply(groups, function(g) sum(index==g))
while (max(groups) != ngroups) { # this is for gglasso which needs consecutively numbered groups :-/
del <- max(groups)
index[index==del] <- which.min(1:nG %in% groups) # this is superweird :D
groups <- unique(index)
ngroups <- length(groups)
nngroups <- sapply(groups, function(g) sum(index==g))
}
res_dogss <- cv_dogss(x, Y, G=index, tol=1e-03)
res_ssep <- cv_dogss(x, Y, G=NULL, tol=1e-03)
res_sgl <- my_cvSGL(data=list(x=x, y=Y), index=index, standardize=FALSE, thresh=1e-03, maxit=1000)
res_gglasso <- cv.gglasso(x=x, y=Y, group=index, nfolds=10, intercept=FALSE, eps=1e-03, maxit=1000)
res_lasso <- cv.glmnet(x, Y, standardize=FALSE, intercept=FALSE, thresh=1e-03, maxit=1000)
res_sgl$lambda <- res_sgl$fit$lambdas
if (is.na(res_gglasso$lambda.1se)) {
whichmin <- which.min(res_gglasso$cvm)
res_gglasso$lambda.1se <- res_gglasso$lambda[which.max(res_gglasso$cvm<=res_gglasso$cvm[whichmin]+res_gglasso$cvsd[whichmin])]
}
new_res_gglasso <- list(lambda=res_gglasso$lambda, gglasso.fit=list(beta=as(res_gglasso$gglasso.fit$beta, "dgCMatrix")), lambda.1se=res_gglasso$lambda.1se)
new_res_lasso <- list(lambda=res_lasso$lambda, glmnet.fit=list(beta=res_lasso$glmnet.fit$beta), lambda.1se=res_lasso$lambda.1se)
new_res_sgl <- list(lambda=res_sgl$lambda, beta=res_sgl$beta, sgl.fit=list(beta=as(res_sgl$fit$beta, "dgCMatrix")), lambda.1se=res_sgl$lambda_1se)
res_gglasso <- new_res_gglasso
res_lasso <- new_res_lasso
res_sgl <- new_res_sgl
res_sgl$mylambda <- apply(res_sgl$sgl.fit$beta, 1, function(b) res_sgl$lambda[which(b!=0)[1]])
res_gglasso$mylambda <- apply(res_gglasso$gglasso.fit$beta, 1, function(b) res_gglasso$lambda[which(b!=0)][1])
res_lasso$mylambda <- apply(res_lasso$glmnet.fit$beta, 1, function(b) res_lasso$lambda[which(b!=0)[1]])
scores[[grouping]] <- cbind(dogss=res_dogss$p_final, ssep=res_ssep$p_final, sgl=res_sgl$mylambda, gglasso=res_gglasso$mylambda, lasso=res_lasso$mylambda) # ...$areascore # ...$p_final # ,
row.names(scores[[grouping]]) <- names(index)
results[[grouping]] <- list(res_dogss=res_dogss, res_ssep=res_ssep, res_sgl=res_sgl, res_gglasso=res_gglasso, res_lasso=res_lasso)
}
scoresresults <- list(scores=scores, results=results)
save(scoresresults, file=paste0("/project/dogss/tests_thesis/03_dream5/scoresresults_gene_", g, "_run_", run, "_nobs_", nobs_train[j], ".RData"))
scoresresults
}
stopCluster(myCluster)
save.image(paste0("/project/dogss/tests_thesis/03_dream5/finalresults_run_", run, "_nobs_", nobs_train[j], ".RData"))
}
}
load("/project/dogss/tests_thesis/03_dream5_analysis/average_dream5_results_300_1.RData")
# FPR <- FPR[, 1:4]
# TPR <- TPR[, 1:4]
# PREC <- PREC[, 1:4]
colnames(FPR) <- methods #[1:4]
colnames(TPR) <- methods #[1:4]
colnames(PREC) <- methods #[1:4]
N <- dim(FPR)[1]
mygrid <- c(1:2000, 2:1450*1000+1)
myranks <- c(0:2000, 2000+1000*1:18)
data <- data.frame(method=melt(FPR[mygrid, ])[, 2], fpr=melt(FPR[mygrid, ])[, 3], tpr=melt(TPR[mygrid, ])[, 3], precision=melt(PREC[mygrid, ])[, 3])
data <- rbind(data, data.frame(method="random", fpr=c(0,0.25,1), tpr=c(0,0.25,1), precision=c(0.0014, 0.0014, 0.0014)))
data$method <- factor(data$method, levels = c(methods, "random"), ordered = TRUE)
colnames(PREDERROR) <- methods
rownames(PREDERROR) <- myranks
data_pred <- melt(PREDERROR)
data_pred$method <- factor(data_pred$method, levels = methods, ordered = TRUE)
Ed_palette_full <- Ed_palette
Ed_palette <- Ed_palette[c(1:5, 7)]
myROC <- ggplot(data, aes(x=fpr, y=tpr, color=method)) +
geom_line(aes(linetype=method), size=1) +
theme_Ed() +
scale_colour_Ed() +
scale_linetype_manual(values=c(rep("solid", 5), "dashed")) +
# geom_segment(aes(x=0, y=0, xend=1, yend=1, colour="random"), linetype="dashed", colour=Ed_palette[7]) +
labs(title= "ROC curve", x = "FPR", y = "TPR")
myPR <- ggplot(data, aes(x=tpr, y=precision, color=method)) +
geom_line(aes(linetype=method), size=1) +
theme_Ed() +
scale_colour_Ed() +
scale_linetype_manual(values=c(rep("solid", 5), "dashed")) +
# geom_segment(aes(x=0, y=2055/N, xend=1, yend=2055/N, colour="random"), linetype="dashed", colour=Ed_palette[7]) +
ylim(0,1) +
labs(title= "Precision-Recall curve", x = "TPR", y = "Precision")
myPR_zoom <- ggplot(data, aes(x=tpr, y=precision, color=method)) +
geom_line(aes(linetype=method), size=1) +
theme_Ed() +
scale_colour_Ed() +
scale_linetype_manual(values=c(rep("solid", 5), "dashed")) +
# geom_segment(aes(x=0, y=2055/N, xend=0.25, yend=2055/N, colour="random"), linetype="dashed", colour=Ed_palette[5]) +
ylim(0, 0.5) +
xlim(0, 0.25) +
labs(title= "PR curve: zoomed", x = "TPR", y = "Precision")
myPred <- ggplot(data_pred, aes(x=rank, y=value, color=method)) +
geom_line(size=1) +
theme_Ed() +
scale_colour_Ed() +
ylim(0, max(PREDERROR)) +
labs(title= "Prediction on test set", x = "number of edges", y = "Prediction error")
grid_arrange_shared_legend(myROC, myPR, myPR_zoom, myPred, ncol = 2, nrow = 2) #
Ed_palette <- Ed_palette_full
myranks <- c(0:2000, 2000+1000*1:18)
methods <- c("dogss", "ssep", "sgl", "gglasso", "lasso") #
mynobs_train <- 300 # c(100, 300, 500)
groupings <- 1:4 # 1:6
for (run in 3:10) {
cat("run", run, "\n")
#for (mynobs in seq(mynobs_train)) {
cat("nobs", mynobs_train, "\n")
# load(paste0("/project/dogss/tests_thesis/03_dream5/nobs_100/train_test_run_", run, "_nobs_100_dream5.RData"))
load(paste0("/project/dogss/tests_thesis/03_dream5/nobs_", mynobs_train, "/finalresults_run_", run, "_nobs_", mynobs_train, ".RData"))
for (gr in groupings) {
cat("grouping", gr, "\n")
allscores <- matrix(0, nrow=nfeats*ntfs-ntfs, ncol=5)
pairs <- matrix("", nrow=nfeats*ntfs-ntfs, ncol=2)
mia <- 1
mie <- 0
betabeta <- vector("list")
for (g in 1:length(genes)) {
mie <- mie+dim(results_dream5[[g]]$scores[[gr]])[1]
pairs[mia:mie, ] <- cbind(rep(genes[g], dim(results_dream5[[g]]$scores[[gr]])[1]), as.character(rownames(results_dream5[[g]]$scores[[gr]])))
allscores[mia:mie, ] <- results_dream5[[g]]$scores[[gr]]
mia <- mia+dim(results_dream5[[g]]$scores[[gr]])[1]
betas <- matrix(0, ncol=5, nrow=length(tfs), dimnames=list(tfs=tfs, method=methods))
betas[!(tfs %in% genes[g]), "dogss"] <- results_dream5[[g]]$results[[gr]]$res_dogss$m_final
betas[!(tfs %in% genes[g]), "ssep"] <- results_dream5[[g]]$results[[gr]]$res_ssep$m_final
betas[!(tfs %in% genes[g]), "sgl"] <- results_dream5[[g]]$results[[gr]]$res_sgl$beta
betas[!(tfs %in% genes[g]), "gglasso"] <- results_dream5[[g]]$results[[gr]]$res_gglasso$gglasso.fit$beta[, which.max(results_dream5[[g]]$results[[gr]]$res_gglasso$lambda==results_dream5[[g]]$results[[gr]]$res_gglasso$lambda.1se)]
betas[!(tfs %in% genes[g]), "lasso"] <- results_dream5[[g]]$results[[gr]]$res_lasso$glmnet.fit$beta[, which.max(results_dream5[[g]]$results[[gr]]$res_lasso$lambda==results_dream5[[g]]$results[[gr]]$res_lasso$lambda.1se)]
betabeta[[genes[g]]] <- betas
}
save(allscores, run, gr, file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/allscores_", run, "_", mynobs_train, "_", gr, ".RData"))
K <- (nfeats-ntfs)*ntfs+(ntfs-1)*ntfs/2
P <- sum(truth_matrix)/2
N <- K-P
ranks <- apply(allscores, 2, function(r) rank(1-r, ties.method="min"))
colnames(ranks) <- methods
recov_matrices <- list()
for (m in methods) recov_matrices[[m]] <- matrix(0, nrow=dim(truth_matrix)[1], ncol=dim(truth_matrix)[2], dimnames=list(from=rownames(truth_matrix), to=colnames(truth_matrix)))
if (gr==1) {
for (m in methods) {
for (j in seq(dim(pairs)[1])) {
if (recov_matrices[[m]][pairs[j,2], pairs[j,1]]==0 | recov_matrices[[m]][pairs[j,2], pairs[j,1]] > ranks[j, m]) {
recov_matrices[[m]][pairs[j,2], pairs[j,1]] <- ranks[j, m]
recov_matrices[[m]][pairs[j,1], pairs[j,2]] <- ranks[j, m]
}
}
}
} else {
for (m in methods[c(1,3,4)]) {
for (j in seq(dim(pairs)[1])) {
if (recov_matrices[[m]][pairs[j,2], pairs[j,1]]==0 | recov_matrices[[m]][pairs[j,2], pairs[j,1]] > ranks[j, m]) {
recov_matrices[[m]][pairs[j,2], pairs[j,1]] <- ranks[j, m]
recov_matrices[[m]][pairs[j,1], pairs[j,2]] <- ranks[j, m]
}
}
}
}
save(ranks, run, gr, file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/ranks_", run, "_", mynobs_train, "_", gr, ".RData"))
M <- length(methods)
tpr <- matrix(0, K, M)
fpr <- matrix(0, K, M)
precision <- matrix(0, K, M)
logicalK <- matrix(FALSE, nrow=nfeats, ncol=nfeats)
logicalK[1:ntfs, (ntfs+1):nfeats] <- TRUE
logicalK[1:ntfs, 1:ntfs][upper.tri(logicalK[1:ntfs, 1:ntfs])] <- TRUE
for (m in 1:M) {
truth_ordered <- truth_matrix[logicalK][order(recov_matrices[[m]][logicalK])]
tpr[, m] <- cumsum(truth_ordered)/P
fpr[, m] <- cumsum(!truth_ordered)/N
precision[, m] <- tpr[, m]*P/(tpr[, m]*P + fpr[, m]*N)
}
AUROC <- sapply(1:length(methods), function(m) AUC(c(0,fpr[, m],1), c(0,tpr[, m],1)))
AUPR <- sapply(1:length(methods), function(m) AUC(c(0,tpr[, m],1), c(1,precision[, m],0)))
save(AUROC, AUPR, tpr, fpr, precision, run, gr, file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/aucs_", run, "_", mynobs_train, "_", gr, ".RData"))
# prediction
Y <- geneexpr_center_test[, !(colnames(geneexpr_center_test)%in%tfs)]
sumY2 <- sum(Y^2)
X <- geneexpr_scale_test[, tfs]
BETA <- vector("list")
RANKS <- vector("list")
prederror <- matrix(0, nrow=length(myranks), ncol=5, dimnames=list(rank=myranks, method=methods))
if (gr==1) {
for (m in methods) {
cat(m,"\n")
BETA[[m]] <- do.call(cbind, lapply(betabeta, function(b) b[, m]))
BETA[[m]] <- BETA[[m]][, !(colnames(geneexpr_center_test)%in%tfs)]
RANKS[[m]] <- matrix(max(ranks)+1, nrow=ntfs, ncol=nfeats)
RANKS[[m]][-c(0:(ntfs-1)*(ntfs+1)+1)] <- ranks[, m]
RANKS[[m]] <- RANKS[[m]][, !(colnames(geneexpr_center_test)%in%tfs)]
cl <- makeCluster(no_cores)
clusterExport(cl, c("Y", "X", "RANKS", "myranks", "BETA", "m", "sumY2"))
prederror[, m] <- parSapply(cl, seq(myranks), function(r) sum((Y-X%*%ifelse(RANKS[[m]]>myranks[r], 0, BETA[[m]]))^2)/sumY2)
stopCluster(cl)
}
}
save(prederror, run, gr, file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/prederror_", run, "_", mynobs_train, "_", gr, ".RData"))
}
# }
}
##### average the curves
#for (mynobs in seq(mynobs_train)) {
cat("averages nobs", mynobs_train, "\n")
for (gr in groupings) {
cat("averages grouping", gr, "\n")
load(file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/aucs_", 1, "_", mynobs_train, "_", gr, ".RData"))
load(file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/prederror_", 1, "_", mynobs_train, "_", gr, ".RData"))
TPR <- tpr
FPR <- fpr
PREC <- precision
PREDERROR <- prederror
allAUROC <- matrix(0, nrow=10, ncol=5)
allAUPR <- matrix(0, nrow=10, ncol=5)
allAUROC[1, ] <- AUROC
allAUPR[1, ] <- AUPR
for (run in 2:10) {
load(file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/aucs_", run, "_", mynobs_train, "_", gr, ".RData"))
load(file=paste0("/project/dogss/tests_thesis/03_dream5_analysis/prederror_", run, "_", mynobs_train, "_", gr, ".RData"))
allAUROC[run, ] <- AUROC
allAUPR[run, ] <- AUPR
TPR <- TPR + tpr
FPR <- FPR + fpr
PREC <- PREC + precision
PREDERROR <- PREDERROR + prederror
}
TPR <- TPR/10
FPR <- FPR/10
PREC <- PREC/10
PREDERROR <- PREDERROR/10
meaned_auroc <- sapply(1:length(methods), function(m) AUC(FPR[, m], TPR[, m]))
meaned_aupr <- sapply(1:length(methods), function(m) AUC(TPR[, m], PREC[, m]))
save(TPR, FPR, PREC, PREDERROR, allAUPR, allAUROC, meaned_aupr, meaned_auroc, methods, file="/project/dogss/tests_thesis/03_dream5_analysis/average_dream5_results_", mynobs_train, "_", gr, ".RData")
}
#}
cat("done\n")
load("/project/dogss/testing/withpred_dream5_withpred_diffgroups_whole_final.RData")
old_FPR <- FPR[, 1]
old_TPR <- TPR[, 1]
old_PREC <- PREC[, 1]
load("~/Programme/dogss/testing/dream5_withpred_diffgroups_whole_averaged_10.RData")
FPR <- cbind(old_FPR, FPR)
TPR <- cbind(old_TPR, TPR)
PREC <- cbind(old_PREC, PREC)
methods <- c("co-binding", "unif. random", "perm. groups", "kmeans")
colnames(FPR) <- methods
colnames(TPR) <- methods
colnames(PREC) <- methods
N <- dim(FPR)[1]
mygrid <- c(1:2000, 2:1500*1000+1)
data <- data.frame(method=melt(FPR[mygrid, ])[, 2], fpr=melt(FPR[mygrid, ])[, 3], tpr=melt(TPR[mygrid, ])[, 3], precision=melt(PREC[mygrid, ])[, 3])
myROC <- ggplot(data, aes(x=fpr, y=tpr, color=method)) +
geom_line(size=1.3) +
theme_Ed() +
scale_colour_Ed() +
geom_segment(aes(x=0, y=0, xend=1, yend=1, colour="random"), linetype="dashed", colour=Ed_palette[5]) +
labs(title= "ROC curve", x = "FPR", y = "TPR")
myPR_zoom <- ggplot(data, aes(x=tpr, y=precision, color=method)) +
geom_line(size=1.3) +
theme_Ed() +
scale_colour_Ed() +
geom_segment(aes(x=0, y=2066/N, xend=0.25, yend=2066/N, colour="random"), linetype="dashed", colour=Ed_palette[5]) +
ylim(0, 0.5) +
xlim(0, 0.25) +
labs(title= "PR curve: zoomed", x = "TPR", y = "Precision")
grid_arrange_shared_legend(myROC, myPR_zoom, ncol = 2, nrow = 1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.