In insilico/STIR:

Install STIR and privateEC:

knitr::opts_chunk$set(echo = TRUE, warning=FALSE)
rm(list = ls())

if (!("devtools" %in% installed.packages()[,"Package"])){
  install.packages("devtools", repos = "http://cran.us.r-project.org", dependencies = TRUE)
}
library(devtools)
if (!("privateEC" %in% installed.packages()[,"Package"])){
  devtools::install_github("insilico/privateEC", build_vignettes = TRUE)
}
if (!("stir" %in% installed.packages()[,"Package"])){
  devtools::install_github("insilico/stir", build_vignettes = TRUE)
}
library(privateEC)  # used to simulate data
library(stir)

# load other helper packages
packages <- c("ggplot2", "reshape2", "dplyr")
check.packages(packages)  # helper function from STIR

Load variance filtered RNA-Seq data for major depressive disorder (MDD) and healthy controls (HC)

Trang T. Le, et al. “Identification and replication of RNA-Seq gene network modules associated with depression severity,” Translational Psychiatry. 2018.

class.lab <- "Diag"  # diagnosis, MDD/HC
writeResults <- F
data(mdd.RNAseq)

# ls()
# "covs.short"   157 x 39 covariates
# "my_subjs"     list of 157 subject ids   
# "num.genes"    5912  
# "phenos"       factor with levels MDD and HC phenotypes     
# "rnaSeq"       157 x 5912 expression levels

# 39 covariates, but we focus on MDD/HC status 
pheno.df <- mdd.RNAseq$covs.short[, "Diag", drop = F]  # Diagnosis dataframe column, MDD/HC
# create gene expression data set
gene.exp.dat <- merge(mdd.RNAseq$rnaSeq, pheno.df, by = "row.names", sort = F)
rownames(gene.exp.dat) <- gene.exp.dat$Row.names
dat <- gene.exp.dat[, -1] # remove columne of subject ids

predictors.mat <- dat[, - which(colnames(dat) == class.lab)]
dat[, class.lab] <- as.factor(dat[, class.lab]) 
pheno.class <- dat[, class.lab]
attr.names <- colnames(predictors.mat)
num.samp <- nrow(dat)

Run STIR-multiSURF:

RF.method = "multisurf"
metric <- "manhattan"
# let k=0 because multisurf does not use k
system.time(neighbor.idx.observed <- find.neighbors(predictors.mat, pheno.class, k = 0, method = RF.method))
system.time(results.list <- stir(predictors.mat, neighbor.idx.observed, k = k, metric = metric, method = RF.method))
t_sorted_multisurf <- results.list$STIR_T[, -3]  # remove cohen-d
colnames(t_sorted_multisurf) <- paste(c("t.stat", "t.pval", "t.pval.adj"), "stir", sep=".")

STIR-significant genes:

stir_msurf.sig <- as.character(row.names(t_sorted_multisurf[t_sorted_multisurf$t.pval.adj.stir<0.05,]))
# p-adj < .05 STIR-multiSURF:
t_sorted_multisurf_short <- t_sorted_multisurf[stir_msurf.sig, ]
(t_sorted_multisurf_short)
# optional write genes with t.pval.adj.stir < 0.05:
if (writeResults) write.csv(t_sorted_multisurf_short, file = "stirGenes.csv")
t_sorted_multisurf$attribute <- rownames(t_sorted_multisurf) # adds a column for merge

Run STIR-ReliefF constant $k=\lfloor(m-1)/6\rfloor$:

ReliefF with $k=\lfloor(m-1)/6\rfloor$ (where m is the number of samples) is similar to multiSURF:

RF.method = "relieff"
k <- floor(num.samp/6)  # k=m/6 should be similar to MultiSURF
neighbor.idx.observed <- find.neighbors(predictors.mat, pheno.class, k = k, method = RF.method)
results.list <- stir(predictors.mat, neighbor.idx.observed, k = k, metric = metric, method = RF.method)
t_sorted_relieff <- results.list$STIR_T[, -3]
colnames(t_sorted_relieff) <- paste(c("t.stat", "t.pval", "t.pval.adj"), k, sep=".")
(t_sorted_relieff[1:32,])
t_sorted_relieff$attribute <- rownames(t_sorted_relieff)

Run STIR-ReliefF constant $k=\lfloor(m-1)/2\rfloor$:

ReliefF with $k=\lfloor(m-1)/2\rfloor$ (where $m$ is the number of samples) is the maximum $k$, which is more myopic like a t-test:

t_sorted_relieff_kmax <- list()
i <- 0
RF.method = "relieff"
# k=(m-1)/2 should be similar to a t-test
# there is a slight imblance, so subtract 1 from smaller class size to get kmax
k <- min(c(sum(mdd.RNAseq$phenos=="MDD"), sum(mdd.RNAseq$phenos=="HC")))-1  # 77
neighbor.idx.observed <- find.neighbors(predictors.mat, pheno.class, k = k, method = RF.method)
results.list <- stir(predictors.mat, neighbor.idx.observed, k = k, metric = metric, method = RF.method)
t_sorted_relieff_kmax <- results.list$STIR_T[, -3]
colnames(t_sorted_relieff_kmax) <- paste(c("t.stir", "p.stir", "p.adj"), k, sep=".")
(t_sorted_relieff_kmax[1:32,]) # top 32 genes from STIR-kmax
t_sorted_relieff_kmax$attribute <- rownames(t_sorted_relieff_kmax)

Standard t-test:

regular.ttest.results <- sapply(1:ncol(predictors.mat), regular.ttest.fn, dat = dat)
names(regular.ttest.results) <- colnames(predictors.mat)
regular.ttest.sorted <- sort(regular.ttest.results)
regular.t.padj <- data.frame(regT.padj = p.adjust(regular.ttest.sorted))
top32t <- rownames(regular.t.padj)[1:32]
(regular.t.padj[top32t, , drop = F]) # top 32 genes from t-test
regular.t.padj$attribute <- rownames(regular.t.padj)
# intersection between the top 32 genes from STIR-multiSURF and top 32 genes from t-test:
if (writeResults){
  write.csv(intersect(top32t, stir_msurf.sig), file = "tAndSTIR.csv")
  write.csv(setdiff(top32t, stir_msurf.sig), file = "tMinusSTIR.csv")
  write.csv(setdiff(stir_msurf.sig, top32t), file = "STIRMinust.csv")
}

t.stir <- merge(regular.t.padj, t_sorted_multisurf, by = "attribute")
t.stir$nlogp.t <- -log10(t.stir$regT.padj)
t.stir$nlogp.stir <- -log10(t.stir$t.pval.adj.stir)
t.stir$sig.genes <- NA
t.stir[t.stir$t.pval.adj.stir < 0.05, "sig.genes"] <- t.stir[t.stir$t.pval.adj.stir < 0.05, "attribute"] 
t.stir$t.sig <- as.factor((t.stir$regT.padj <0.05) + (t.stir$t.pval.adj.stir <0.05))

my.cols <- c("#009E73", "#CC79A7")

pboth <- ggplot(t.stir, aes(x = nlogp.t, y = nlogp.stir)) + geom_point(alpha = 0.7, shape = 21, size = 2.5, aes(fill = t.sig)) + theme_bw() +
  geom_hline(yintercept = -log10(0.05), linetype = 2, alpha = 0.8, color = my.cols[2]) + 
  geom_vline(xintercept = -log10(0.05), linetype = 2, alpha = 0.8, color = my.cols[1]) + 
  geom_abline(slope = 1, intercept = 0, alpha = 0.5, linetype = 3) +
  annotate("text", 4, -log10(0.05) + 0.1, label = "STIR 0.05 FDR", 
           size = 3, color = my.cols[2]) +
  annotate("text", -log10(0.05) - 0.1, 6.5, label = "t-test 0.05 FDR", 
           size = 3, color = my.cols[1], angle = 90) +
  scale_x_continuous(limits = c(0, 4.5)) +
  labs(x = bquote('t-test ('~-log[10]~'p'[adj]~')'),
       y = bquote('STIR-multiSURF ('~-log[10]~'p'[adj]~')')) +
  geom_text(aes(label = sig.genes, size = t.sig), check_overlap = TRUE, 
            hjust=-0.12, vjust=1.5, fontface = "italic") + 
  scale_fill_manual(values = c("white", my.cols[2], "grey30")) +
  guides(size=FALSE, fill = F) + coord_fixed(ratio = 1) +
  theme(axis.title = element_text(size = 8),
        axis.text = element_text(size = 7)) +
  scale_size_discrete(range = c(2,2))
pboth

if (writeResults) ggsave("tstir.pdf", pboth, width = 4, height = 6)

Aggregate results of STIR with ReliefF and STIR with MultiSURF and regular t-test:

t_sorted_relieff_list <- list(t_sorted_relieff, t_sorted_multisurf, regular.t.padj)
final.mat <- Reduce(function(x, y) merge(x, y, by = "attribute", sort = F), t_sorted_relieff_list)
if (writeResults) write.csv(final.mat,file="final.mat.csv")
head(final.mat)