knitr::opts_chunk$set(echo = TRUE, warning=FALSE) rm(list = ls()) if (!("devtools" %in% installed.packages()[,"Package"])){ install.packages("devtools", repos = "http://cran.us.r-project.org", dependencies = TRUE) } library(devtools) if (!("privateEC" %in% installed.packages()[,"Package"])){ devtools::install_github("insilico/privateEC", build_vignettes = TRUE) } if (!("stir" %in% installed.packages()[,"Package"])){ devtools::install_github("insilico/stir", build_vignettes = TRUE) } library(privateEC) # used to simulate data library(stir) # load other helper packages packages <- c("ggplot2", "reshape2", "dplyr") check.packages(packages) # helper function from STIR
Trang T. Le, et al. “Identification and replication of RNA-Seq gene network modules associated with depression severity,” Translational Psychiatry. 2018.
class.lab <- "Diag" # diagnosis, MDD/HC writeResults <- F data(mdd.RNAseq) # ls() # "covs.short" 157 x 39 covariates # "my_subjs" list of 157 subject ids # "num.genes" 5912 # "phenos" factor with levels MDD and HC phenotypes # "rnaSeq" 157 x 5912 expression levels # 39 covariates, but we focus on MDD/HC status pheno.df <- mdd.RNAseq$covs.short[, "Diag", drop = F] # Diagnosis dataframe column, MDD/HC # create gene expression data set gene.exp.dat <- merge(mdd.RNAseq$rnaSeq, pheno.df, by = "row.names", sort = F) rownames(gene.exp.dat) <- gene.exp.dat$Row.names dat <- gene.exp.dat[, -1] # remove columne of subject ids predictors.mat <- dat[, - which(colnames(dat) == class.lab)] dat[, class.lab] <- as.factor(dat[, class.lab]) pheno.class <- dat[, class.lab] attr.names <- colnames(predictors.mat) num.samp <- nrow(dat)
RF.method = "multisurf" metric <- "manhattan" # let k=0 because multisurf does not use k system.time(neighbor.idx.observed <- find.neighbors(predictors.mat, pheno.class, k = 0, method = RF.method)) system.time(results.list <- stir(predictors.mat, neighbor.idx.observed, k = k, metric = metric, method = RF.method)) t_sorted_multisurf <- results.list$STIR_T[, -3] # remove cohen-d colnames(t_sorted_multisurf) <- paste(c("t.stat", "t.pval", "t.pval.adj"), "stir", sep=".")
STIR-significant genes:
stir_msurf.sig <- as.character(row.names(t_sorted_multisurf[t_sorted_multisurf$t.pval.adj.stir<0.05,])) # p-adj < .05 STIR-multiSURF: t_sorted_multisurf_short <- t_sorted_multisurf[stir_msurf.sig, ] (t_sorted_multisurf_short) # optional write genes with t.pval.adj.stir < 0.05: if (writeResults) write.csv(t_sorted_multisurf_short, file = "stirGenes.csv") t_sorted_multisurf$attribute <- rownames(t_sorted_multisurf) # adds a column for merge
ReliefF with $k=\lfloor(m-1)/6\rfloor$ (where m is the number of samples) is similar to multiSURF:
RF.method = "relieff" k <- floor(num.samp/6) # k=m/6 should be similar to MultiSURF neighbor.idx.observed <- find.neighbors(predictors.mat, pheno.class, k = k, method = RF.method) results.list <- stir(predictors.mat, neighbor.idx.observed, k = k, metric = metric, method = RF.method) t_sorted_relieff <- results.list$STIR_T[, -3] colnames(t_sorted_relieff) <- paste(c("t.stat", "t.pval", "t.pval.adj"), k, sep=".") (t_sorted_relieff[1:32,]) t_sorted_relieff$attribute <- rownames(t_sorted_relieff)
ReliefF with $k=\lfloor(m-1)/2\rfloor$ (where $m$ is the number of samples) is the maximum $k$, which is more myopic like a t-test:
t_sorted_relieff_kmax <- list() i <- 0 RF.method = "relieff" # k=(m-1)/2 should be similar to a t-test # there is a slight imblance, so subtract 1 from smaller class size to get kmax k <- min(c(sum(mdd.RNAseq$phenos=="MDD"), sum(mdd.RNAseq$phenos=="HC")))-1 # 77 neighbor.idx.observed <- find.neighbors(predictors.mat, pheno.class, k = k, method = RF.method) results.list <- stir(predictors.mat, neighbor.idx.observed, k = k, metric = metric, method = RF.method) t_sorted_relieff_kmax <- results.list$STIR_T[, -3] colnames(t_sorted_relieff_kmax) <- paste(c("t.stir", "p.stir", "p.adj"), k, sep=".") (t_sorted_relieff_kmax[1:32,]) # top 32 genes from STIR-kmax t_sorted_relieff_kmax$attribute <- rownames(t_sorted_relieff_kmax)
regular.ttest.results <- sapply(1:ncol(predictors.mat), regular.ttest.fn, dat = dat) names(regular.ttest.results) <- colnames(predictors.mat) regular.ttest.sorted <- sort(regular.ttest.results) regular.t.padj <- data.frame(regT.padj = p.adjust(regular.ttest.sorted)) top32t <- rownames(regular.t.padj)[1:32] (regular.t.padj[top32t, , drop = F]) # top 32 genes from t-test regular.t.padj$attribute <- rownames(regular.t.padj) # intersection between the top 32 genes from STIR-multiSURF and top 32 genes from t-test: if (writeResults){ write.csv(intersect(top32t, stir_msurf.sig), file = "tAndSTIR.csv") write.csv(setdiff(top32t, stir_msurf.sig), file = "tMinusSTIR.csv") write.csv(setdiff(stir_msurf.sig, top32t), file = "STIRMinust.csv") }
t.stir <- merge(regular.t.padj, t_sorted_multisurf, by = "attribute") t.stir$nlogp.t <- -log10(t.stir$regT.padj) t.stir$nlogp.stir <- -log10(t.stir$t.pval.adj.stir) t.stir$sig.genes <- NA t.stir[t.stir$t.pval.adj.stir < 0.05, "sig.genes"] <- t.stir[t.stir$t.pval.adj.stir < 0.05, "attribute"] t.stir$t.sig <- as.factor((t.stir$regT.padj <0.05) + (t.stir$t.pval.adj.stir <0.05)) my.cols <- c("#009E73", "#CC79A7")
pboth <- ggplot(t.stir, aes(x = nlogp.t, y = nlogp.stir)) + geom_point(alpha = 0.7, shape = 21, size = 2.5, aes(fill = t.sig)) + theme_bw() + geom_hline(yintercept = -log10(0.05), linetype = 2, alpha = 0.8, color = my.cols[2]) + geom_vline(xintercept = -log10(0.05), linetype = 2, alpha = 0.8, color = my.cols[1]) + geom_abline(slope = 1, intercept = 0, alpha = 0.5, linetype = 3) + annotate("text", 4, -log10(0.05) + 0.1, label = "STIR 0.05 FDR", size = 3, color = my.cols[2]) + annotate("text", -log10(0.05) - 0.1, 6.5, label = "t-test 0.05 FDR", size = 3, color = my.cols[1], angle = 90) + scale_x_continuous(limits = c(0, 4.5)) + labs(x = bquote('t-test ('~-log[10]~'p'[adj]~')'), y = bquote('STIR-multiSURF ('~-log[10]~'p'[adj]~')')) + geom_text(aes(label = sig.genes, size = t.sig), check_overlap = TRUE, hjust=-0.12, vjust=1.5, fontface = "italic") + scale_fill_manual(values = c("white", my.cols[2], "grey30")) + guides(size=FALSE, fill = F) + coord_fixed(ratio = 1) + theme(axis.title = element_text(size = 8), axis.text = element_text(size = 7)) + scale_size_discrete(range = c(2,2)) pboth if (writeResults) ggsave("tstir.pdf", pboth, width = 4, height = 6)
t_sorted_relieff_list <- list(t_sorted_relieff, t_sorted_multisurf, regular.t.padj) final.mat <- Reduce(function(x, y) merge(x, y, by = "attribute", sort = F), t_sorted_relieff_list) if (writeResults) write.csv(final.mat,file="final.mat.csv") head(final.mat)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.