The first section, Analysis has the code used in the computational analsyis of [@QSPpaper]. Intermediate data files are stored throughout the notebook as commented out RDS
files. These are typically saved when creating the intermediate data is time consuming and/or used at branch points within the analysis. Some of the code chunks rely on functions from my personal packages which are not on CRAN
or BioConductor
installed from github.
The second section Results contain the code used to create the tables, figures, and data files for this paper. Figures and tables created by hand are not included.
knitr::opts_chunk$set(eval = FALSE)
require(devtools) # My package(s) my_pkgs <- c("delutils") if(!any(my_pkgs %in% row.names(installed.packages()))){ devtools::install_github(paste0("lefeverde/", my_pkgs)) }
Checks for missing packages and then installs if not found.
inst_packages <- row.names(installed.packages()) cran_packages <- c("tidyverse", "reshape2", "ggplot2", "matrixStats", "dendextend") miss_cran <- cran_packages[!cran_packages %in% inst_packages] if(length(miss_cran) > 0){ install.packages(miss_cran) } if(!require("BiocManager", quietly = TRUE)){ install.packages("BiocManager") } bioc_packages <- c("biomaRt", "limma", "sva", "GSVA", "cmapR") miss_bioc <- bioc_packages[!bioc_packages %in% inst_packages] if(length(miss_bioc) > 0){ BiocManager::install(bioc_packages) }
gene_dir <- 'DiStefano_raw_data/kallisto/' # Getting ENS ids from 1st tsv ens_trans <- paste0(gene_dir, list.files(gene_dir)[[1]]) %>% read_delim(., "\t") %>% pull(target_id) %>% str_split(., "\\.") %>% map(1) %>% unlist attributes <- c("ensembl_gene_id", "ensembl_transcript_id") ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", version=94) tx2gene <- getBM(attributes, "ensembl_transcript_id", ens_trans, ensembl) tx2gene <- tx2gene[,c(2,1)] # saveRDS(tx2gene, "tx2gene.rds") attributes <- c("ensembl_gene_id", "external_gene_name", "gene_biotype", "description") ens_genes <- unique(tx2gene$ensembl_gene_id) gene_annots <- getBM(attributes, "ensembl_gene_id", ens_genes, ensembl) #save(gene_annots, file = "gene_annots.RData")
library(tximport) tx2gene <- readRDS("tx2gene.rds") p <- readRDS('DiStefano_raw_data/pdata.rds') files <- paste0(p$file_name, '.gz') %>% paste0('DiStefano_raw_data/kallisto/', .) names(files) <- row.names(p) txi <- tximport(files, type = 'kallisto', tx2gene = tx2gene, ignoreTxVersion = T, countsFromAbundance = 'lengthScaledTPM') #saveRDS(list(p, txi), file='DiStefano_txi_and_p_192.rds')
sex_linked_genes <- c('ENSG00000131002', 'ENSG00000183878', 'ENSG00000067048') txi_n192 <- readRDS('DiStefano_txi_and_p_192.rds') p192 <- txi_n192[[1]] txi_n192 <- txi_n192[[2]] cnts <- txi_n192$counts cnts_per_million <- cnts[rowMedians(cnts) > 1,] %>% data.frame(.) %>% cpm(., log = FALSE) plot_data <- data.frame(cnts_per_million[sex_linked_genes[1],], sex=p192$sex, sample_id=p192$dcl_patient_id) %>% melt(.) %>% dplyr::select( -one_of('variable')) %>% arrange(., value)
Since this gene is Y-linked, I expect the males to have much higher expression than the females. I think the non-zero expression in the females can be attributed to multi-mapping reads. Kallisto does mapping a little differently, and so these are not discarded by default like with STAR and HISAT2.
female_cutoff <- plot_data %>% dplyr::filter(., sex =='Female') %>% arrange(., value) %>% pull() %>% quantile(., probs = .95) male_cutoff <- plot_data %>% dplyr::filter(., sex =='Male') %>% arrange(., value) %>% pull() %>% quantile(., probs = .05) suspicious_samples <- list( dplyr::filter(plot_data[plot_data$sex == 'Male',], value < male_cutoff), dplyr::filter(plot_data[plot_data$sex == 'Female',], value > female_cutoff) ) %>% lapply(., function(x){ x$sample_id }) %>% do.call(c, .)
suspicious_samples <- c("DLDR_0189","DLDR_0031","DLDR_0022","DLDR_0017","DLDR_0131", "DLDR_0130","DLDR_0140","DLDR_0122","DLDR_0123","DLDR_0111") load('tx2gene.RData') p192 <- readRDS('DiStefano_raw_data/pdata.rds') p <- p192 %>% dplyr::filter(., !dcl_patient_id %in% suspicious_samples) row.names(p) <- p$dcl_patient_id files <- paste0(p$file_name, '.gz') %>% paste0('DiStefano_raw_data/kallisto/', .) names(files) <- row.names(p) txi <- tximport(files, type = 'kallisto', tx2gene = tx2gene, ignoreTxVersion = T, countsFromAbundance = 'lengthScaledTPM') # saveRDS(list(p, txi), file='DiStefano_txi_and_p_182.rds')
load('gene_annots.RData') txi_list <- readRDS('DiStefano_txi_and_p_182.rds') p <- txi_list[[1]] txi <- txi_list[[2]] mod <- delutils::better_model_matrix(~0 + diagnosis, data=p) mod0 <- model.matrix(~1, data=p ) dge <- edgeR::DGEList(counts = txi$counts, samples = p, genes = gene_annots[row.names(txi$counts),]) dge <- dge[edgeR::filterByExpr(dge, group = dge$samples$diagnosis),] dge <- dge[(rowSums(dge$counts > 1) >= .75*dim(dge)[2]),] dge <- dge[!is.na(dge$genes$entrezgene),] v <- voom(dge, mod, normalize.method = 'quantile') svobj <- sva(v$E, mod=mod, mod0=mod0) # saveRDS(list(v, svobj), 'DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')
library(cowplot) v <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[1]] svobj <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[2]] p <- v$targets mod <- better_model_matrix(~0 + group, data=p) mod0 <- model.matrix(~1, data=p) e_rm <- removeBatchEffect(v$E, covariates = svobj$sv, design = mod) plt1 <- pca_plotter(v$E, p['group']) + labs( title="Unadjusted") + theme(plot.title = element_text(hjust = 0.5, face="bold", size=20)) ggsave("unadjusted_pca_plot.png", plt1, width = 6, height = 6, dpi = 600) plt2 <- pca_plotter(e_rm, p['group']) + labs( title="SVA adjusted") + theme(plot.title = element_text(hjust = 0.5, face="bold", size=20), aspect.ratio = 1) leg <- get_legend(plt2)
These files are not going to be included because of licensing issues with KEGG. The MSigDB pathways can be found at this link.
# This file wont be included due to licensing but can be obtained from here: # https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/MSigDB_v7.0_Release_Notes # The uncommented code has example data. pth_data <- read_rds('../../reference_files/msigdb_v7.0/msigdb_v7.0_all_tibble.rds') %>% # read_rds("example_msigdb_v7.0_all_tibble.rds") filter(SUB_CATEGORY_CODE == "CP:KEGG") %>% dplyr::select(STANDARD_NAME,EXACT_SOURCE) %>% setNames(., c('name','id')) # These pathways were causing issues and so they've been removed from the analysis pths_to_remove <- c("KEGG_TAURINE_AND_HYPOTAURINE_METABOLISM", "KEGG_FOLATE_BIOSYNTHESIS") pth_data <- pth_data %>% filter(!name %in% pths_to_remove) # This file was created by compiling the KEGG hiearchy files: "br08901.json","br08902.json", "br08904.json", and "br08907.json". # These files can be found at the following link https://www.genome.jp/kegg-bin/get_htext?br08902 by using the search function # and downloading and parsing the jsons. The uncommented code has example data. hier_kegg <- read_rds('../../thesisAnalysis/data-raw/kegg_pathway_hierarchy.rds') %>% # read_rds("example_kegg_pathway_hierarchy.rds") %>% left_join(pth_data, .) %>% arrange(category_level_1) # File can be directly downloaded from the link: https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/MSigDB_v7.0_Release_Notes # cp_kegg <- '../../reference_files/msigdb_v7.0/c2_kegg_entrez_msigdb_v7.0.gmt' %>% # "example_c2_kegg_entrez_msigdb_v7.0.gmt" %>% clusterProfiler::read.gmt(.) %>% filter(term %in% hier_kegg$name) %>% split(., .$term) %>% lapply(., function(x){ x$gene }) c2_kegg <- "../../reference_files/msigdb_v7.0/c2_kegg_entrez_msigdb_v7.0.gmt" %>% clusterProfiler::read.gmt(.) %>% group_by(term) %>% summarise(entrezgene=paste(gene, collapse = "|")) %>% rename(name="term") %>% inner_join(hier_kegg, .) # saveRDS(c2_kegg, file = "c2_kegg_entrez_msigdb_v7.0_database.rds") kegg_annots <- hier_kegg$category_level_1 %>% set_names(., hier_kegg$name)
Since I'm going run GSVA, I'm removing the batch effects predicted by SVA instead of including them in the downstream deferentially enriched pathway analysis. I'm also uniquefying the genes by variance. In other words, of the genes which match to the same entrez gene ID, I'm choosing the one that has the most variance.
load('gene_annots.RData') v <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[1]] svobj <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[2]] p <- v$targets e_rm <- removeBatchEffect(v, covariates = svobj$sv, design = v$design) e_rm <- uniquefy_by_variance(e_rm, gene_annots, 'entrezgene') row.names(e_rm) <- gene_annots[row.names(e_rm),]$entrezgene v_e <- uniquefy_by_variance(v$E, gene_annots, 'entrezgene') row.names(v_e) <- gene_annots[row.names(v_e),]$entrezgene
I'm converting the gene x patient expression matrix into a pathway x patient enrichment matrix via GSVA. I then perform a clustering analysis on the pathway x patient enrichment matrix.
library(GSVA) gsva_res_sva <- gsva(e_rm, cp_kegg, method='gsva', min.sz=1, max.sz=100000) gsva_res_sva <- gsva_res_sva[names(kegg_annots),row.names(p)] # saveRDS(gsva_res_sva, "gsva_enrichment_matrix_sva.rds") tmp_data <- gsva_res_sva # Scaling row wise tmp_data <- t(scale(t(tmp_data))) tmp_data <- as.data.frame(tmp_data, check.names=FALSE) col_clust <- as.dist(1 - cor(tmp_data, method = 'pearson')) %>% hclust(., "ward.D2") coefs_from_clusters <- dendextend::cutree(col_clust, 3, order_clusters_as_data=FALSE) %>% paste0("c", .) coefs_from_clusters <- tibble(sample_id=col_clust$labels, cluster_idx=coefs_from_clusters) # saveRDS(coefs_from_clusters, file = "coefs_from_clusters.rds")
This chunk just creates a dendrogram of the clustering, like the one in figure 2. First, I create a color map of the different diagnoses, then the actual plot itself
library(dendextend) library(RColorBrewer) colours <- brewer.pal(n = 7, name = "Paired") # Ok so creating the color map was a bit involved. I started with # a palette of discrete colors, 2 for each group (e.g., Fibrosis) # and then interpolated them so that I had darker colors for the # advanced grade/stage within each group. groups <- p[,c('group', 'diagnosis')] %>% distinct() %>% dplyr::slice(-1) %>% arrange(diagnosis) col_map <- tibble(idx=sort(rep(1:3, 2)),colours=colours[1:6]) %>% split(., .$idx) %>% lapply(., function(x){ x$colours }) col_map2 <- lapply(seq_along(col_map), function(i){ x <- unique(groups$group)[i] tmp_lvls <- groups %>% filter(group==x) %>% pull(diagnosis) %>% droplevels %>% as.character out_cols <- colorRampPalette(col_map[[i]])(length(tmp_lvls)) %>% setNames(., tmp_lvls) }) %>% do.call(c, .) col_map <- c(NORMAL=colours[length(colours)], col_map2) # Getting the diagnosis for the labels in the dendrogram ordered_colors <- p$diagnosis[match(labels(col_clust), row.names(p))] ordered_colors <- col_map[ordered_colors] dend <- as.dendrogram(col_clust) labels_colors(dend) <- ordered_colors # Finally, the plot par(cex=.5) plot.new() plot(dend) # save(dend, col_map, tmp_data,file = "data_for_figure2_heatmap.RData")
These 2 sections use cluster defined patient groupings identified in [GSVA Cluster analysis section]{#cluster}. Basically, limma-voom is performed the normal way except that the groupings were identified by the cluster analysis.
gsva_res_sva <- read_rds("gsva_enrichment_matrix_sva.rds") group_cont <- c("c2-c1", "c3-c1", "c3-c2") coefs_from_clusters <- read_rds("coefs_from_clusters.rds") mod <- better_model_matrix(~0 + cluster_idx, data = coefs_from_clusters) cont_mod <- makeContrasts(contrasts = group_cont, levels=data.frame(mod, check.names = FALSE)) fit <- t(scale(t(gsva_res_sva))) %>% # Row scaling lmFit(., data.frame(mod)) %>% contrasts.fit(., cont_mod) %>% eBayes res <- get_limma_results(fit, group_cont) names(res)[2] <- 'name' res <- right_join( hier_kegg, res) # saveRDS(res, 'clust_gsva_path_res.rds')
Same cluster based groups identified in the [GSVA Cluster analysis section]{#cluster} are used here to identify differentially expressed genes.
fit <- lmFit(v, data.frame(mod)) %>% contrasts.fit(., cont_mod) %>% eBayes res <- get_limma_results(fit, group_cont) # saveRDS(res, "clust_gene_res.rds")
These analyses are performed using the clinical groupings defined in the patient metadata.
p <- v$targets mod <- better_model_matrix(~0 + group, data = p) gsva_res_sva <- read_rds("gsva_enrichment_matrix_sva.rds") group_cont <- c('Lob-(NORMAL+STEATOSIS)/2','Fibrosis-(NORMAL+STEATOSIS)/2','Fibrosis-Lob') cont_mod <- makeContrasts(contrasts = group_cont, levels=data.frame(mod, check.names = FALSE)) fit <- t(scale(t(gsva_res_sva))) %>% # Row scaling lmFit(., data.frame(mod)) %>% contrasts.fit(., cont_mod) %>% eBayes res <- get_limma_results(fit, group_cont) names(res)[2] <- 'name' res <- right_join( hier_kegg, res) # saveRDS(res, "clin_gsva_path_res.rds")
v <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[1]] svobj <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[2]] p <- v$targets group_cont <- c('Lob-(NORMAL+STEATOSIS)/2','Fibrosis-(NORMAL+STEATOSIS)/2','Fibrosis-Lob') mod <- better_model_matrix(~0 + group, data = p) modSV <- cbind(mod, svobj$sv) cont_mod <- makeContrasts(contrasts = group_cont, levels=data.frame(modSV, check.names = FALSE)) fit_sva <- lmFit(v, data.frame(modSV)) %>% contrasts.fit(., cont_mod) %>% eBayes res_sva <- get_limma_results(fit_sva, group_cont) # saveRDS(res_sva, "clin_gene_res.rds")
library(Biobase) Sys.setenv(VROOM_CONNECTION_SIZE=1310720) filt_esets <- c("GSE48452", "GSE49541", "GSE89632") %>% map(function(x){ cur_geo <- GEOquery::getGEO(x, AnnotGPL = TRUE)[[1]] }) nm_to_fix <- which(grepl("group", names(pData(filt_esets[[1]])))) names(pData(filt_esets[[1]]))[nm_to_fix] <- "group" pData(filt_esets[[1]])$group <- sub(" ", "_", pData(filt_esets[[1]])$group) %>% str_to_upper nm_to_fix <- which(grepl("Stage", names(pData(filt_esets[[2]])))) names(pData(filt_esets[[2]]))[nm_to_fix] <- "group" pData(filt_esets[[2]])$group <- str_split(pData(filt_esets[[2]])$group, ' ') %>% map(1) %>% unlist nm_to_fix <- which(names(pData(filt_esets[[3]])) == "characteristics_ch1.1") names(pData(filt_esets[[3]]))[nm_to_fix] <- "group" pData(filt_esets[[3]])$group <- str_split(pData(filt_esets[[3]])$group, ': ') %>% map(2) %>% unlist cols_to_keep <- c("Gene symbol", "Gene ID") filt_esets[1:2] <- filt_esets[1:2] %>% map(function(x){ f <- fData(x) f <- f[,cols_to_keep] names(f) <- c("external_gene_name", "entrezgene") f[f == ""] <- NA fData(x) <- f return(x) }) f <- fData(filt_esets[[3]]) f <- f[,c("ILMN_Gene", "Entrez_Gene_ID")] names(f) <- c("external_gene_name", "entrezgene") f$entrezgene <- as.character(f$entrezgene) fData(filt_esets[[3]]) <- f coef_list <- list(c("NASH-HEALTHY_OBESE"), c("advanced-mild" ), c("NASH-SS")) micro_gene_res <- seq_along(filt_esets) %>% map(function(i){ x <- filt_esets[[i]] group_cont <- coef_list[[i]] x <- x[rowMin(exprs(x)) > quantile(exprs(x), probs=.1),] mod <- better_model_matrix(~0 + group, data=pData(x)) cont_mod <- makeContrasts(contrasts = group_cont, levels=data.frame(mod, check.names = FALSE)) fit <- lmFit(x, data.frame(mod)) %>% contrasts.fit(., cont_mod) %>% eBayes gene_res <- get_limma_results(fit, group_cont) }) %>% bind_rows cp_kegg <- '../../reference_files/msigdb_v7.0/c2_kegg_entrez_msigdb_v7.0.gmt' %>% clusterProfiler::read.gmt(.) kegg_objct <- split(micro_gene_res, micro_gene_res$coefficient) %>% map(function(x){ x <- x %>% group_by(entrezgene) %>% arrange(desc(abs(t))) %>% dplyr::slice(1) %>% ungroup x <- x$t %>% setNames(., x$entrezgene) %>% sort(., decreasing = TRUE) clusterProfiler::GSEA(x, TERM2GENE = cp_kegg, pvalueCutoff = 1, minGSSize=1) }) pths <- kegg_objct %>% map(as.data.frame) %>% rbind_named_df_list(., "coefficient") names(pths)[2] <- "name"
tmp_fun <- function(x) { length(unique(na.omit(x))) == 1 } tmp_fun2 <- function(x) { sign(sum(na.omit(x))) } micro_sub <- pths %>% filter(p.adjust < 0.05) %>% mutate(pth_sign = sign(NES)) %>% select(coefficient, name, pth_sign) %>% distinct table(micro_sub$coefficient, micro_sub$pth_sign) conc_pths <- micro_sub %>% group_by(name) %>% summarise(is_concordant = tmp_fun(pth_sign), pth_sign = tmp_fun2(pth_sign)) %>% filter(is_concordant) sig_pth_res <- read_rds("clust_gsva_path_res.rds") %>% filter(adj.P.Val < .001) tmp_data <- sig_pth_res %>% mutate(in_microarrays = (name %in% conc_pths$name)) micro_pths <- conc_pths[,-2] names(micro_pths) <- str_to_lower(names(micro_pths)) tmp_data <- left_join(sig_pth_res, micro_pths) %>% mutate(conc_w_microarray=(sign(logFC) == pth_sign)) sum_tbl <- tmp_data %>% group_by(coefficient) %>% # select(-disease_category) %>% distinct() %>% summarise(n_pths = n(), n_missing = sum(is.na(conc_w_microarray)), n_conc = sum(na.omit(conc_w_microarray)), n_disc = sum(!na.omit(conc_w_microarray))) %>% mutate(total=n_conc + n_disc) %>% mutate(cnc_prct=n_conc/total, dsc_prct=n_disc/total)
03/24/22
library(eulerr) clust_tmp <- split(sig_pth_res, sig_pth_res$coefficient) %>% map(2) %>% map(unique) coef_tbl <- tibble(sig_idx=1:3, cluster_comparison=c("PLI vs PN&S", "PF vs PN&S", "PF vs PLI"), micro_comp=rep("Microarrays", 3)) micro_tmp <- unique(micro_pths$name) out_list <- 1:3 %>% map(function(i){ tmp_list <- list(clust_tmp[[i]], micro_tmp) tmp_nms <- coef_tbl[i,2:3] %>% unlist %>% as.character # Euler doesn't work with names that have & # so going to that by hand names(tmp_list) <- c("clust", "micro") # tmp_list <- rev(tmp_list) return(tmp_list) }) plot_list <- out_list %>% map(function(x){ euler(x) %>% plot(., quantities = list(cex = 1.125), labels = FALSE, adjust_labels=TRUE) # plot(., quantities = list(cex = 1.125), labels = list(cex = 1.25), adjust_labels=TRUE) }) for(i in 1:3){ # fn <- paste0("msig_kegg_clust_v_microarray_gsea_", i, "_03-24-22.png") png(fn, width = 2.5, height = 2.5, units = "in", res=500) print(plot_list[[i]]) dev.off() }
cat_pths <- read_delim("NAFLD_progression2path_03-09-2021.txt", "\t", col_names = FALSE) %>% set_names(., c("nafld_category", "id")) kegg_db <- read_rds("c2_kegg_entrez_msigdb_v7.0_database.rds") %>% separate_rows(entrezgene, sep = '\\|') %>% select(name, id, entrezgene) sig_pth_res <- read_rds("clust_gsva_path_res.rds") %>% filter(adj.P.Val < .001) cols_to_keep <- c("coefficient","name","id", "logFC", "adj.P.Val" ) sig_pth_res <- sig_pth_res[,cols_to_keep] names(sig_pth_res)[4:5] <- paste0("pathway_", names(sig_pth_res)[4:5]) pths_to_keep <- cat_pths %>% filter(nafld_category %in% 1:4) %>% pull(id) sig_pth_res <- sig_pth_res %>% filter(id %in% pths_to_keep) sig_pth_res <- inner_join(sig_pth_res, cat_pths) sig_pth_res <- inner_join(sig_pth_res, kegg_db) gene_res <- read_rds('clust_gene_res.rds') %>% filter(adj.P.Val < .001) gene_res$entrezgene <- as.character(gene_res$entrezgene) cols_to_keep <- c("coefficient","external_gene_name", "entrezgene", "logFC", "adj.P.Val") gene_res <- gene_res[,cols_to_keep] names(gene_res)[4:5] <- paste0("gene_", names(gene_res)[4:5]) gene_pth_filt <- inner_join(gene_res, sig_pth_res) upreg_genes <- gene_pth_filt %>% filter(gene_logFC > 0) %>% group_by(coefficient, nafld_category) %>% summarise(up_genes_entrez=paste(unique(entrezgene), collapse = ";"), n_upreg=length(unique(entrezgene))) downreg_genes <- gene_pth_filt %>% filter(gene_logFC < 0) %>% group_by(coefficient, nafld_category) %>% summarise(down_genes_entrez=paste(unique(entrezgene), collapse = ";"), n_downreg=length(unique(entrezgene))) gene_sig <- left_join(upreg_genes, downreg_genes) %>% arrange(coefficient, nafld_category) %>% ungroup # saveRDS(gene_sig, file = "cluster_gene_signatures.rds")
cat_pths <- read_delim("NAFLD_progression2path_03-09-2021.txt", "\t", col_names = FALSE) %>% set_names(., c("nafld_category", "id")) kegg_db <- read_rds("c2_kegg_entrez_msigdb_v7.0_database.rds") %>% separate_rows(entrezgene, sep = '\\|') %>% select(name, id, entrezgene) sig_pth_res <- read_rds("clin_gsva_path_res.rds") %>% filter(adj.P.Val < .001) cols_to_keep <- c("coefficient","description","id", "logFC", "adj.P.Val" ) sig_pth_res <- sig_pth_res[,cols_to_keep] names(sig_pth_res)[4:5] <- paste0("pathway_", names(sig_pth_res)[4:5]) pths_to_keep <- cat_pths %>% filter(nafld_category %in% 1:4) %>% pull(id) sig_pth_res <- sig_pth_res %>% filter(id %in% pths_to_keep) sig_pth_res <- inner_join(sig_pth_res, cat_pths) sig_pth_res <- inner_join(sig_pth_res, kegg_db) gene_res <- read_rds('clin_gene_res.rds') gene_res$entrezgene <- as.character(gene_res$entrezgene) sig_gene_res <- gene_res %>% filter(adj.P.Val < .001) cols_to_keep <- c("coefficient","external_gene_name", "entrezgene", "logFC", "adj.P.Val") sig_gene_res <- sig_gene_res[,cols_to_keep] names(sig_gene_res)[4:5] <- paste0("gene_", names(sig_gene_res)[4:5]) gene_pth_filt <- inner_join(sig_gene_res, sig_pth_res) upreg_genes <- gene_pth_filt %>% filter(gene_logFC > 0) %>% group_by(coefficient, nafld_category) %>% summarise(up_genes_entrez=paste(unique(entrezgene), collapse = ";"), n_upreg=length(unique(entrezgene))) downreg_genes <- gene_pth_filt %>% filter(gene_logFC < 0) %>% group_by(coefficient, nafld_category) %>% summarise(down_genes_entrez=paste(unique(entrezgene), collapse = ";"), n_downreg=length(unique(entrezgene))) gene_sig <- left_join(upreg_genes, downreg_genes) %>% arrange(coefficient, nafld_category) %>% ungroup gene_sig$coefficient <- factor(gene_sig$coefficient, levels = levels(sig_pth_res$coefficient)) gene_sig <- gene_sig %>% arrange(coefficient, nafld_category) gene_sig$sig_idx <- 1:12 gene_sig <- gene_sig %>% relocate(sig_idx) # saveRDS(gene_sig, file = "clin_gene_signatures.rds")
cols_to_keep <- c("pert_id", "pert_iname", "pert_type", "is_touchstone", "inchi_key_prefix", "inchi_key", "canonical_smiles", "pubchem_cid") # This file was created by joining the metadata files listed here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742 tmp_map <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% distinct tmp_map <- tmp_map[,cols_to_keep] # This is example of the data. The full DrugBank database can be downloaded from here: # https://go.drugbank.com/releases/5-1-4/release_notes drugbank_data <- read_rds("example_drugbank_v5.1.4_data.rds") %>% separate_rows(., Synonyms, sep = '\\|') names(drugbank_data) <- str_to_lower(names(drugbank_data)) dd1 <- drugbank_data[,-3] names(dd1)[2] <- 'drug_name' dd2 <- drugbank_data[,-2] names(dd2)[2] <- 'drug_name' tmp_db_data <- rbind(dd1, dd2) %>% distinct names(tmp_db_data)[which(names(tmp_db_data) == 'smiles')] <- 'canonical_smiles' names(tmp_db_data)[which(names(tmp_db_data) == 'pubchem_compound_id')] <- 'pubchem_cid' tmp_db_data$pert_iname <- str_to_lower(tmp_db_data$drug_name) cols_to_keep <- c('canonical_smiles', 'pubchem_cid', 'pert_iname') db_cmap_matches <- lapply(cols_to_keep, function(x){ db_sub <- tmp_db_data[,c('drugbank_id', x)] %>% na.omit %>% distinct inner_join(tmp_map, db_sub, na_matches = "never") }) %>% do.call(rbind, .) # Removing extra columns and then re-adding based on trt_cps file db_cmap_matches <- db_cmap_matches %>% select(pert_id, pert_iname, drugbank_id) %>% left_join(., drugbank_data[,c("drugbank_id", "name")]) %>% distinct man_annots <- read_rds("manual_lincs_drugbank_drug_annots.rds") # these were matches done manually db_cmap_matches <- db_cmap_matches %>% filter(!pert_id %in% man_annots$pert_id) db_cmap_matches <- rbind(db_cmap_matches, man_annots) dbids_to_remove <- c("DB07159", "DB02507", "DB02731", "DB00452", "DB00624") # These are duplicates with extraneous DB IDs db_cmap_matches <- db_cmap_matches %>% filter(!drugbank_id %in% dbids_to_remove) # saveRDS(db_cmap_matches, file = 'GSE92742_perts_in_DrugBank_03-30-21.rds')
The gctx
files can be downloaded following the directions clue.io. They are very large and resource intensive. For this portion of the notebook, I used a machine with anywhere from 30-40 cores and 200-300 gb of memory. It still took many hours to run.
The 2020 metadata (which was downloaded from here [clue.io]) was matched to DrugBank by common name only.
# See clue.io to obtain the 2020 metadata drb_sigs <- read_rds("cmap2020_drubank_overlap_sig_info_01-24-22.rds") %>% select(1:4) pert_info <- drb_sigs[,2:4] %>% distinct pert_info2 <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") %>% select(-name) %>% distinct pert_info <- rbind(pert_info, pert_info2) %>% distinct rm(pert_info2) trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% select(sig_id, cell_id, pert_id) %>% distinct %>% inner_join(., pert_info) names(trt_cp)[2] <- "cell_iname"
library(cmapR) # See clue.io to obtain file gctx_file <- '/zfs1/dtaylor/cmap_data/clue_test_data/sig_score_trt_cp_n204975x10174.gctx' cids <- read_gctx_meta(gctx_file,dim = "col") %>% pull gids <- read_gctx_meta(gctx_file,dim = "row") %>% pull cid_chunks <- chunker(cids, 5) clust_gene_sig <- read_rds("cluster_gene_signatures.rds") qunt_thresh <- 0 p_thresh <- 1 gene_sets <- clust_gene_sig %>% select(sig_idx, up_genes_entrez, down_genes_entrez) n_perm <- 5e+05 ptm <- proc.time() clust_res_sub <- map(cid_chunks, function(y){ sig_list <- gctx_to_sigList(gctx_file, y, decreasing_order = TRUE) # Sorts in descending order (big to smol) out_res <- multi_broad_wrapper(gene_sets, sig_list, 10) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() return(out_res) }) %>% bind_rows ptm <- proc.time() gsva_random_res_sub <- map(1:nrow(gene_sets), function(i){ x <- gene_sets[i,] print(i) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() up_genes <- str_split(x[,2], pattern = ';') %>% unlist up_genes <- up_genes[up_genes %in% gids] down_genes <- str_split(x[,3], pattern = ';') %>% unlist down_genes <- down_genes[down_genes %in% gids] random_cs <- get_random_cmap_scores(up_genes, down_genes, gctx_file, cids, n_perm= n_perm, num_chunks = 10, n_cores = 14) random_cs$r_up_genes <- NULL random_cs$r_down_genes <- NULL return(random_cs) }) # These cols were included for debugging purposes but are not neccessary clust_res_sub$down_genes <- NULL clust_res_sub$up_genes <- NULL names(clust_res_sub)[1] <- "sig_idx" gc() clust_res_sub <- split(clust_res_sub, clust_res_sub$sig_idx) plan(multisession, workers=13) clust_res_sub <- add_p_vals_to_cmap_para(clust_res_sub, gsva_random_res_sub) # saveRDS(clust_res_sub, "cluster_sigs_cmap_lincs17_res.rds") lincs_top20_table <- lincs2017_score_top20(pert_score) ol <- list(best_score_mat(tmp_cmap_sub), pert_score) %>% setNames(., c("best_score","prct_67th_score")) # saveRDS(ol, "cluster_sigs_cmap_lincs17_pert_sums.rds")
clust_res_l2017 <- read_rds("cluster_sigs_cmap_lincs17_res.rds") trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% select(sig_id, cell_id, pert_id, pert_iname) %>% distinct clust_res_l2017 <- inner_join(trt_cp, clust_res_l2017) names(clust_res_l2017)[2] <- "cell_iname" drg_db <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") clust_res_l2017 <- inner_join(drg_db, clust_res_l2017) %>% calculate_ncs() best_score_table <- best_score_top20(best_score_mat(clust_res_l2017)) pert_score <- calculate_pert_score(calculate_pert_cell_score(clust_res_l2017)) pert_score <- inner_join(pert_score, drg_db) ol <- list(best_score_mat(clust_res_l2017), pert_score) %>% setNames(., c("best_score","prct_67th_score")) # saveRDS(ol, "cluster_sigs_cmap_lincs17_pert_sums.rds")
library(cmapR) library(QSPpaper) library(tidyverse) nperm <- 1+07 # Figuring out which perts to analyze meta_cols_to_keep <- c("sig_id", "pert_id", "pert_iname") trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% distinct trt_cp <- trt_cp[,meta_cols_to_keep] drgbnk_db <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") # BRD-A60245366 is the AS-601245 compound perts_to_keep <- c("BRD-A60245366", drgbnk_db$pert_id) trt_cp <- trt_cp %>% filter(pert_id %in% perts_to_keep) %>% left_join(., drgbnk_db) gctx_file <- '/zfs1/dtaylor/cmap_data/clue_test_data/sig_score_trt_cp_n204975x10174.gctx' cids <- read_gctx_meta(gctx_file,dim = "col") %>% pull gids <- read_gctx_meta(gctx_file,dim = "row") %>% pull # Filters all but sigs with DrugBank pert cids <- cids[cids %in% trt_cp$sig_id] cid_chunks <- chunker(cids, 6) gsva_gene_sig <- readRDS("clin_gene_signatures.rds") gene_sets <- gsva_gene_sig %>% select(sig_idx, up_genes_entrez, down_genes_entrez) # For debugging purposes # cid_chunks <- cid_chunks[1] # cid_chunks[[1]] <- cid_chunks[[1]][1:50] # gene_sets <- gene_sets[1,] gsva_res_sub <- map(cid_chunks, function(y){ sig_list <- gctx_to_sigList(gctx_file, y, decreasing_order = FALSE) out_res <- multi_broad_wrapper(gene_sets, sig_list, 14) }) %>% bind_rows ptm <- proc.time() gsva_random_res_sub <- map(1:nrow(gene_sets), function(i){ x <- gene_sets[i,] print(i) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() up_genes <- str_split(x[,2], pattern = ';') %>% unlist up_genes <- up_genes[up_genes %in% gids] down_genes <- str_split(x[,3], pattern = ';') %>% unlist down_genes <- down_genes[down_genes %in% gids] # random_cs <- get_random_cmap_scores(up_genes, down_genes, gctx_file, cids, n_perm=1e+07, num_chunks = 6, n_cores = 25) random_cs <- get_random_cmap_scores(up_genes, down_genes, gctx_file, cids, n_perm= nperm, num_chunks = 6, n_cores = 29) random_cs$r_up_genes <- NULL random_cs$r_down_genes <- NULL return(random_cs) }) gsva_res_sub$down_genes <- NULL gsva_res_sub$up_genes <- NULL names(gsva_res_sub)[1] <- "sig_idx" gc() gsva_res_sub <- split(gsva_res_sub, gsva_res_sub$sig_idx) plan(multisession, workers=13) gsva_res_sub <- add_p_vals_to_cmap_para(gsva_res_sub, gsva_random_res_sub) # saveRDS(gsva_res_sub, "clinical_sigs_cmap_lincs17_res.rds")
trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% select(sig_id, cell_id, pert_id, pert_iname) %>% distinct gsva_cmap17 <- read_rds("clinical_sigs_cmap_lincs17_res.rds") gsva_cmap17 <- left_join(gsva_cmap17, trt_cp) gsva_cmap17 <- inner_join(trt_cp, gsva_cmap17) names(gsva_cmap17)[2] <- "cell_iname" drg_db <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") gsva_cmap17 <- inner_join(drg_db, gsva_cmap17) %>% calculate_ncs() best_score_table <- best_score_top20(best_score_mat(gsva_cmap17)) pert_score <- calculate_pert_score(calculate_pert_cell_score(gsva_cmap17)) pert_score <- inner_join(pert_score, drg_db) lincs_top20_table <- lincs2017_score_top20(pert_score) ol <- list(best_score_mat(gsva_cmap17), pert_score) %>% setNames(., c("best_score","prct_67th_score")) # saveRDS(ol, "clinical_sigs_cmap_lincs17_pert_sums.rds.rds")
The same CMap analysis using the cluster gene signatures and clincal label based signatures are performed here using the 2020 LINCS database. These files can be obtained following the instructions listed on LINCS site. This was about an order of magnitude larger than the 2017 database, so it's even more resource intensive.
gctx_file <- "/zfs1/dtaylor/cmap_data/cmap_lincs_2020/level5_beta_trt_cp_n720216x10174.gctx" sig_info <- readRDS("cmap2020_drubank_overlap_sig_info_01-24-22.rds") cids <- read_gctx_meta(gctx_file,dim = "col") %>% pull cids <- cids[cids %in% sig_info$sig_id] gids <- read_gctx_meta(gctx_file,dim = "row") %>% pull cid_chunks <- chunker(cids, 5) clust_gene_sig <- read_rds("cluster_gene_signatures.rds") qunt_thresh <- 0 p_thresh <- 1 gene_sets <- clust_gene_sig %>% select(sig_idx, up_genes_entrez, down_genes_entrez) n_perm <- 5e+05 # n_perm <- 5 ptm <- proc.time() clust_res_sub <- map(cid_chunks, function(y){ sig_list <- gctx_to_sigList(gctx_file, y, decreasing_order = TRUE) # Sorts in descending order (big to smol) out_res <- multi_broad_wrapper(gene_sets, sig_list, 10) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() return(out_res) }) %>% bind_rows ptm <- proc.time() gsva_random_res_sub <- map(1:nrow(gene_sets), function(i){ x <- gene_sets[i,] print(i) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() up_genes <- str_split(x[,2], pattern = ';') %>% unlist up_genes <- up_genes[up_genes %in% gids] down_genes <- str_split(x[,3], pattern = ';') %>% unlist down_genes <- down_genes[down_genes %in% gids] random_cs <- get_random_cmap_scores(up_genes, down_genes, gctx_file, cids, n_perm= n_perm, num_chunks = 10, n_cores = 14) random_cs$r_up_genes <- NULL random_cs$r_down_genes <- NULL return(random_cs) }) clust_res_sub <- split(clust_res_sub, clust_res_sub$sig_idx) clust_gsva_cmap_res_w_p_value <- add_p_vals_to_cmap_para(clust_res_sub, gsva_random_res_sub) # saveRDS(clust_gsva_cmap_res_w_p_value, "cluster_sigs_cmap_lincs20_res.rds")
The siginfo_beta.txt
file can be obtained from the LINCS site.
library(flextable) cs_mat <- read_rds("cluster_sigs_cmap_lincs20_res.rds") cs_mat$up_genes <- NULL cs_mat$down_genes <- NULL sig_info <- read_delim("siginfo_beta.txt") %>% select(pert_id, cell_iname, sig_id) %>% distinct cs_mat <- inner_join(cs_mat, sig_info) drb_sigs <- read_rds("cmap2020_drubank_overlap_sig_info_01-24-22.rds") %>% select(1:4) pert_info <- drb_sigs[,2:4] %>% distinct trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% select(sig_id, cell_id, pert_id) %>% distinct clust_res_l2017 <- read_rds("cluster_sigs_cmap_lincs17_res.rds") clust_res_l2017 <- inner_join(trt_cp, clust_res_l2017) names(clust_res_l2017)[2] <- "cell_iname" tmp1 <- clust_res_l2017 %>% filter(!sig_id %in% cs_mat$sig_id) tmp_cmap <- rbind(cs_mat, tmp1) tmp_cmap <- calculate_ncs(tmp_cmap) tmp_cmap_sub <- inner_join(tmp_cmap, pert_info) best_score_table <- best_score_top20(best_score_mat(tmp_cmap_sub)) pert_score <- calculate_pert_score(calculate_pert_cell_score(tmp_cmap_sub)) pert_score <- inner_join(pert_score, pert_info) ol <- list(best_score_mat(tmp_cmap_sub), pert_score) %>% setNames(., c("best_score","prct_67th_score")) # saveRDS(ol, "cluster_sigs_cmap_lincs20_pert_sums.rds")
The
gctx_file <- "/zfs1/dtaylor/cmap_data/cmap_lincs_2020/level5_beta_trt_cp_n720216x10174.gctx" sig_info <- readRDS("cmap2020_drubank_overlap_sig_info_01-24-22.rds") cids <- read_gctx_meta(gctx_file,dim = "col") %>% pull cids <- cids[cids %in% sig_info$sig_id] gids <- read_gctx_meta(gctx_file,dim = "row") %>% pull cid_chunks <- chunker(cids, 5) gsva_gene_sig <- readRDS("clinical_sigs_cmap_lincs20_res.rds.rds") # gsva_gene_sig <- gsva_gene_sig %>% filter(sig_idx == 1) qunt_thresh <- unique(gsva_gene_sig$abs_quantile_thresh) p_thresh <- unique(gsva_gene_sig$fdr_p_value_thresh) gene_sets <- gsva_gene_sig %>% select(sig_idx, up_genes_entrez, down_genes_entrez) n_perm <- 5e+05 # n_perm <- 5 ptm <- proc.time() gsva_res_sub <- map(cid_chunks, function(y){ sig_list <- gctx_to_sigList(gctx_file, y, decreasing_order = TRUE) # Sorts in descending order (big to smol) out_res <- multi_broad_wrapper(gene_sets, sig_list, 10) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() return(out_res) }) %>% bind_rows ptm <- proc.time() gsva_random_res_sub <- map(1:nrow(gene_sets), function(i){ x <- gene_sets[i,] print(i) tot_time <- proc.time() - ptm print(tot_time) ptm <- proc.time() up_genes <- str_split(x[,2], pattern = ';') %>% unlist up_genes <- up_genes[up_genes %in% gids] down_genes <- str_split(x[,3], pattern = ';') %>% unlist down_genes <- down_genes[down_genes %in% gids] random_cs <- get_random_cmap_scores(up_genes, down_genes, gctx_file, cids, n_perm= n_perm, num_chunks = 10, n_cores = 14) random_cs$r_up_genes <- NULL random_cs$r_down_genes <- NULL return(random_cs) }) gsva_res_sub <- split(gsva_res_sub, gsva_res_sub$sig_idx) plan(multisession, workers=13) gsva_res_sub <- add_p_vals_to_cmap_para(gsva_res_sub, gsva_random_res_sub) # saveRDS(gsva_res_sub, "clinical_sigs_cmap_lincs20_res.rds")
library(officer) library(flextable) cs_mat <- read_rds("clinical_sigs_cmap_lincs20_res.rds") cs_mat$up_genes <- NULL cs_mat$down_genes <- NULL sig_info <- read_delim("siginfo_beta.txt") %>% select(pert_id, cell_iname, sig_id) %>% distinct cs_mat <- inner_join(cs_mat, sig_info) drb_sigs <- read_rds("cmap2020_drubank_overlap_sig_info_01-24-22.rds") %>% select(1:4) pert_info <- drb_sigs[,2:4] %>% distinct trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% select(sig_id, cell_id, pert_id) %>% distinct l2017_res <- read_rds("clinical_sigs_cmap_lincs17_res.rds") l2017_res$cmap_score <- l2017_res$cmap_score*-1 l2017_res <- inner_join(trt_cp, l2017_res) names(l2017_res)[2] <- "cell_iname" tmp1 <- l2017_res %>% filter(!sig_id %in% cs_mat$sig_id) tmp_cmap <- rbind(cs_mat, tmp1) tmp_cmap <- calculate_ncs(tmp_cmap) tmp_cmap_sub <- inner_join(tmp_cmap, pert_info) best_score_table <- best_score_top20(best_score_mat(tmp_cmap_sub)) pert_score <- calculate_pert_score(calculate_pert_cell_score(tmp_cmap_sub)) pert_score <- inner_join(pert_score, pert_info) lincs_top20_table <- lincs2017_score_top20(pert_score) ol <- list(best_score_mat(tmp_cmap_sub), pert_score) %>% setNames(., c("best_score","prct_67th_score")) # saveRDS(ol, "clinical_sigs_cmap_lincs20_pert_sums.rds")
The python code for this portion of the analysis can be downloaded from this repo. I ran everything in a local directory with the package. The BioSnap PPI network can be downloaded from here.
# This file is just an edgelist created by subsetting just the liver. I'm not including it # because I'm not sure about the license. # biosnap <- read_rds("PPT-Ohmnet_edgelist_liver.rds") biosnap <- read_rds("example_PPT-Ohmnet_edgelist_liver.rds") bsp_genes <- do.call(c, biosnap[,1:2]) %>% unique nafld_associated_pths <- c("Non-alcoholic fatty liver disease (NAFLD)", "TNF signaling pathway", "Insulin signaling pathway", "Type II diabetes mellitus", "PI3K-Akt signaling pathway", "Adipocytokine signaling pathway", "PPAR signaling pathway", "Fatty acid biosynthesis", "Protein processing in endoplasmic reticulum", "Oxidative phosphorylation", "Apoptosis") kegg_data <- read_rds("kegg_pathway_entrezgene.rds") %>% setNames(., c("id", "entrezgene")) kegg_data <- left_join(kegg_data, read_rds("kegg_pathway_hierarchy.rds")) gene_res <- read_rds("clust_gene_res.rds") %>% filter(adj.P.Val < .001) nap_genes <- kegg_data %>% filter(description %in% nafld_associated_pths) %>% pull(entrezgene) %>% unique() degs_to_keep <- gene_res %>% filter(entrezgene %in% nap_genes) %>% pull(entrezgene) %>% unique() %>% as.character degs_to_keep <- degs_to_keep[degs_to_keep %in% bsp_genes] #write(degs_to_keep, file = "updated_degs_in_network.txt")
drug_list <- read_rds("cmap2020_pert_drugbank_overlap_01-18-22.rds") drug_list2 <- read_rds("manual_lincs_drugbank_drug_annots.rds") %>% select(-name) names(drug_list2)[2] <- "drug_name" drug_list2 <- drug_list2 %>% relocate(drug_name) drug_list3 <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") %>% select(-name) %>% rename(drug_name=pert_iname) %>% relocate(drug_name) drug_list <- do.call(rbind, list(drug_list, drug_list2, drug_list3)) %>% distinct drugbank_data <- read_rds("drugbank_db_metadata_v5.1.4.rds") %>% select(drugbank_id, targets) %>% separate_rows(targets, sep ="\\|") names(drugbank_data)[2] <- "gene_name" drugbank_data$gene_name <- gsub('\\-', '', drugbank_data$gene_name) xx <- as.data.frame(org.Hs.eg.db::org.Hs.egALIAS2EG) %>% setNames(., c("entrezgene", "gene_name")) drugbank_data <- left_join(drugbank_data, xx) %>% na.omit drug_list <- left_join(drug_list, drugbank_data) tmp_files <- list.files(".", "_pert_sums") idx <- 1 nms <- str_split(tmp_files, '_p') %>% map(1) %>% unlist %>% gsub('sig_', "", .) lincs_score_out <- map(seq_along(tmp_files), function(idx){ x <- read_rds(paste0('.', tmp_files[[idx]]))[[2]] t20_tbl <- lincs2017_score_top20(x) pert_map <- x %>% select(pert_iname, pert_id) %>% distinct t20_tbl <- left_join(t20_tbl, pert_map) %>% left_join(., drug_list) %>% na.omit t20_tbl <- t20_tbl %>% group_by(pert_iname) %>% summarise(targets=paste(unique(entrezgene), collapse = ';')) %>% setNames(., c("drug_name", "targets")) }) %>% setNames(., nms) %>% rbind_named_df_list(.) %>% distinct %>% mutate(score_method="prct_67th_score") best_score_out <- map(seq_along(tmp_files), function(idx){ x <- read_rds(paste0('./', tmp_files[[idx]]))[[1]] t20_tbl <- best_score_top20(x) pert_map <- x %>% select(pert_iname, pert_id) %>% distinct t20_tbl <- left_join(t20_tbl, pert_map) %>% left_join(., drug_list) %>% na.omit t20_tbl <- t20_tbl %>% group_by(pert_iname) %>% summarise(targets=paste(unique(entrezgene), collapse = ';')) %>% setNames(., c("drug_name", "targets")) }) %>% setNames(., nms) %>% rbind_named_df_list(.) %>% distinct %>% mutate(score_method="best_score") long_t20_mat <- rbind(best_score_out,lincs_score_out ) long_t20_mat$col_from_list <- gsub("lincs", "", long_t20_mat$col_from_list) nms <- limma::strsplit2(long_t20_mat$col_from_list, "_") %>% as.data.frame %>% setNames(., c("signature_set", "lincs_db")) long_t20_mat <- cbind(nms, long_t20_mat) %>% select(-col_from_list) %>% mutate(lincs_db=as.numeric(lincs_db)) out_mat <- rbind(best_score_out,lincs_score_out ) %>% select(drug_name, targets) %>% distinct # write_csv(out_mat, "targets_from_top_cmap_drugs.csv")
The python code can be found here. I executed in this in the top-level dir of this repo.
import random import toolbox from toolbox import wrappers # Loading the data drug_targets = [i.strip().split(',') for i in open("targets_from_top_cmap_drugs.csv")] drug_targets = drug_targets[1:] network = wrappers.get_network("biosnap_liver_ppi_network.txt", only_lcc = True) degs = [i.strip() for i in open("updated_degs_in_network.txt")] # Running analysis and writing results with open("network_proximity_results.csv", "w+") as f: hd = "db_id,d,z,mean,sd\n" f.write(hd) for i in drug_targets: db_id = i[0] targets = i[1].split(';') print(i) try: d, z, tmp_val = wrappers.calculate_proximity(network, targets, degs, min_bin_size=90) out_line = [db_id, d, z, tmp_val[0], tmp_val[1]] out_line = [str(i) for i in out_line] out_line = ','.join(out_line) f.write(out_line + '\n') except: continue
library(tximport) library(limma) library(edgeR) library(delutils) load('gene_annots.RData') load('tx2gene.RData') p <- readRDS("NAFLD_model/pdata.rds") files <- paste0('NAFLD_model/kallisto/', p$file_name) names(files) <- p$sample_id txi <- tximport(files, type = 'kallisto', tx2gene = tx2gene, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM') # saveRDS(txi, "NAFLD_model/nafld_model_txi.rds") txi <- read_rds("NAFLD_model/nafld_model_txi.rds") dge <- edgeR::DGEList(counts = txi$counts, samples = p[colnames(txi$counts),], genes = gene_annots[row.names(txi$counts),]) # saveRDS(dge, "NAFLD_model/nafld_model_dge.rds") dge <- dge[edgeR::filterByExpr(dge, group = dge$samples$treatment),] dge <- dge[(rowSums(dge$counts > 1) >= .75*dim(dge)[2]),] dge <- dge[!is.na(dge$genes$entrezgene),] # dge <- calcNormFactors(dge) mod <- delutils::better_model_matrix(~0 + treatment + time_point, data=p) v <- voom(dge, mod, normalize.method = "quantile") # saveRDS(v, "NAFLD_model/lamps_nafld_qn_voom_07-08-21.rds") trt_conts <- c("EMS-NF", "LMS-NF", "LMS-EMS") cont_mod <- makeContrasts(contrasts = trt_conts, levels=mod) fit <- lmFit(v, mod) %>% contrasts.fit(., cont_mod) %>% eBayes(.) res <- get_limma_results(fit, coefs = trt_conts) res$coefficient <- factor(res$coefficient, levels = trt_conts) # saveRDS(res, "NAFLD_model/treatment_gene_res_07-21-21.rds") # write_csv(res, file = "Data_file_S8_LAMPS_DEGs.csv")
p_val_thresh <- .01 cols_to_keep <- c("coefficient","gene_id", "entrezgene", "external_gene_name","logFC") hu_coefs <- c("Lob-(NORMAL+STEATOSIS)/2","Fibrosis-(NORMAL+STEATOSIS)/2", "Fibrosis-Lob" ) hum_res <- read_rds("clin_gene_res.rds") hu_tmp <- hum_res %>% filter(adj.P.Val < p_val_thresh) hu_tmp <- hu_tmp %>% select(cols_to_keep) %>% mutate(hum_sign=sign(logFC)) %>% select(-logFC) %>% distinct hu_tmp$coefficient <- factor(hu_tmp$coefficient, levels = hu_coefs) hu_tmp$num_coef <- as.numeric(hu_tmp$coefficient) lamps_res <- read_rds("NAFLD_model/treatment_gene_res_07-21-21.rds") lamps_tmp <- lamps_res %>% filter(adj.P.Val < p_val_thresh) lamps_tmp <- lamps_tmp %>% select(cols_to_keep) %>% mutate(lamps_sign=sign(logFC)) %>% select(-logFC) %>% distinct lamps_tmp$entrezgene <- as.character(lamps_tmp$entrezgene) lamps_tmp$num_coef <- as.numeric(lamps_tmp$coefficient) gene_overlap <- inner_join(lamps_tmp[,-1], hu_tmp[,-1]) %>% relocate(num_coef) gene_overlap$sgn_prd <- gene_overlap$lamps_sign * gene_overlap$hum_sign # saveRDS(gene_overlap, file = "NAFLD_model/lamps_human_overlap_degs_08-21-21.rds") cncdnt_genes <- gene_overlap %>% filter(sgn_prd > 0) disc_genes <- gene_overlap %>% filter(sgn_prd < 0)
library(clusterProfiler) cp_kegg <- "msigdb_v7.0/c2_kegg_entrez_msigdb_v7.0.gmt" %>% clusterProfiler::read.gmt(.) pth_data <- read_rds("msigdb_v7.0_all_tibble.rds") %>% filter(SUB_CATEGORY_CODE == "CP:KEGG") %>% dplyr::select(STANDARD_NAME,EXACT_SOURCE, DESCRIPTION_BRIEF) %>% setNames(., c('name','ID', "Description")) cols_to_keep <- c("coefficient","ID","Description","NES", "p.adjust") hu_kegg_objct <- split(hum_res, hum_res$coefficient) %>% map(function(x){ x <- x$t %>% setNames(., x$entrezgene) %>% sort(., decreasing = TRUE) GSEA(x, TERM2GENE = cp_kegg, pvalueCutoff = 1, minGSSize=1) }) # KEGG_TAURINE_AND_HYPOTAURINE_METABOLISM and KEGG_FOLATE_BIOSYNTHESIS are not included # because they do not make the default cut off of min size 10 genes. They have more, # but the input genes vec only includes a subset of the genes of these pathways. I # chnaged this to have a set of 1 as min size. hu_pths <- hu_kegg_objct %>% map(as.data.frame) %>% rbind_named_df_list(., "coefficient") hu_pths$coefficient <- factor(hu_pths$coefficient, levels = hu_coefs) names(hu_pths)[2] <- "name" hu_pths$Description <- NULL hu_pths <- left_join(hu_pths, pth_data) %>% relocate(ID, Description, .after=name) lamps_kegg_objct <- split(lamps_res, lamps_res$coefficient) %>% map(function(x){ x <- x$t %>% setNames(., x$entrezgene) %>% sort(., decreasing = TRUE) GSEA(x, TERM2GENE = cp_kegg, pvalueCutoff = 1, minGSSize=1) }) lamps_pths <- lamps_kegg_objct %>% map(as.data.frame) %>% rbind_named_df_list(., "coefficient") lamps_pths$coefficient <- factor(lamps_pths$coefficient, levels(lamps_res$coefficient)) names(lamps_pths)[2] <- "name" lamps_pths$Description <- NULL lamps_pths <- left_join(lamps_pths, pth_data) %>% relocate(ID, Description, .after=name) tmp_out <- lamps_pths %>% filter(p.adjust < .05) # write_csv(tmp_out, file = "Data_file_S9_LAMPS_pathways.csv") # save(hu_pths, hu_kegg_objct, lamps_pths, lamps_kegg_objct, file = "NAFLD_model/msig_gsea_pathway_results_09-07-21.RData") load("NAFLD_model/msig_gsea_pathway_results_09-07-21.RData") cols_to_keep <- c("coefficient","ID","Description","NES", "p.adjust") hu_pths_tmp <- hu_pths %>% arrange(coefficient) %>% filter(p.adjust < .05) %>% select(cols_to_keep) %>% mutate(hum_sign=sign(NES), num_coef=as.numeric(coefficient)) %>% select(-NES, -p.adjust) lamps_pths_tmp <- lamps_pths %>% arrange(coefficient) %>% filter(p.adjust < .05) %>% select(cols_to_keep) %>% mutate(lamps_sign=sign(NES), num_coef=as.numeric(coefficient)) %>% select(-NES, -p.adjust) pth_overlap <- inner_join(lamps_pths_tmp[,-1], hu_pths_tmp[,-1]) %>% relocate(num_coef) pth_overlap$sgn_prd <- pth_overlap$lamps_sign * pth_overlap$hum_sign # saveRDS(pth_overlap,file = "NAFLD_model/msig_lamps_human_overlap_pths_09-07-21.rds") cncdnt_pths <- pth_overlap %>% filter(sgn_prd > 0) disc_pths <- pth_overlap %>% filter(sgn_prd < 0)
library(eulerr) hu_pth_list <- hu_pths_tmp %>% split(., .$num_coef) %>% map(function(x){ unique(x$ID) }) lamps_pth_list <- lamps_pths_tmp %>% split(., .$num_coef) %>% map(function(x){ unique(x$ID) }) out_list <- 1:3 %>% map(function(i){ tmp_list <- list(hu_pth_list[[i]], lamps_pth_list[[i]]) tmp_nms <- coef_map[i,2:3] %>% unlist %>% as.character names(tmp_list) <- tmp_nms # Reverses order so LAMPS comes first tmp_list <- rev(tmp_list) return(tmp_list) }) plot_list <- out_list %>% map(function(x){ euler(x) %>% plot(., quantities = list(cex = 1.125), labels = FALSE, adjust_labels=TRUE) # plot(., quantities = list(cex = 1.125), labels = list(cex = 1.25), adjust_labels=TRUE) }) for(i in 1:3){ # fn <- paste0("./figures/msig_kegg_venn_comparison_", i, "_09-08-21.png") png(fn, width = 2.5, height = 2.5, units = "in", res=500) print(plot_list[[i]]) dev.off() } ol <-1:3 %>% map_dbl(function(i){ overlap_ratio(hu_pth_list[[i]], lamps_pth_list[[i]]) }) sum_tbl <- pth_overlap %>% group_by(num_coef) %>% summarise(cnc_cnt=sum(sgn_prd == 1), dsc_cnt=sum(sgn_prd == -1)) %>% mutate(total=cnc_cnt + dsc_cnt) sum_tbl$prct_cnc <- sum_tbl$cnc_cnt/(sum_tbl$cnc_cnt + sum_tbl$dsc_cnt) sum_tbl$prct_dsc <- 1 - sum_tbl$prct_cnc
library(matrixStats) library(sva) library(glmnet) n_genes <- 7500 nfolds <- 20 v <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[1]] svobj <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[2]] p <- v$targets x_0 <- v$E x_0 <- x_0[order(rowVars(x_0), decreasing = TRUE)[1:n_genes],] lamps_v <- read_rds("NAFLD_model/lamps_nafld_qn_voom_07-08-21.rds") y_0 <- lamps_v$E y_0 <- y_0[order(rowVars(y_0), decreasing = TRUE)[1:n_genes],] genes_to_keep <- intersect(row.names(y_0), row.names(x_0)) y_0 <- y_0[genes_to_keep,] mod <- delutils::better_model_matrix(~0 + treatment, data=lamps_v$targets) mod0 <- model.matrix(~1, data=lamps_v$targets) lamps_svobj <- sva(y_0, mod, mod0) y_0 <- removeBatchEffect(y_0, covariates = lamps_svobj$sv, design = mod) mod <- delutils::better_model_matrix(~treatment, data=lamps_v$targets) x_0 <- x_0[genes_to_keep,] mlenet <- cv.glmnet(scale(t(x_0)), p$group, statndardize=FALSE, family = "multinomial", type.multinomial = "grouped", alpha = .95, nfolds = nfolds, intercept = FALSE) # saveRDS(mlenet, "NAFLD_model/mlenet_model.rds") table(lamps_v$targets$treatment, predict(mlenet, scale(t(y_0)), type="class", s=mlenet$lambda.1se, exact=TRUE))
library(ComplexHeatmap) load("data_for_figure2_heatmap.RData") col_tits <- c("Normal &\nSteatosis\n(PN&S)", "Predominately\nLobular\nInflammation\n(PLI)", "Predominately\nFibrosis\n(PF)") cluster_colors <- c("#6a3d9a","#ff7f00", "#b15928") v <- read_rds('DiStefano_diagnosis_n182_voom_qn_and_svobj.rds')[[1]] p <- v$targets cols_to_keep <- c("diagnosis", "bmi_surg", "sex", "age","icd_250") annot_map <- p[,cols_to_keep] %>% rownames_to_column %>% map_if(is.factor, as.character) %>% bind_cols %>% as.data.frame row.names(annot_map) <- annot_map$rowname annot_map <- annot_map[,-1] colnames(annot_map) <- str_to_title(colnames(annot_map)) colnames(annot_map)[ncol(annot_map)] <- "T2D" sex <- c('palevioletred1', 'royalblue') %>% setNames(., c('Female', 'Male')) col_map <- list(Sex=sex, Diagnosis=col_map) age_plot <- anno_points(annot_map['Age'], size = unit(1.5, "mm"), axis_param =list(side="right", at=c(30, 60), gp=gpar(fontsize = 4.5, fontface='bold'))) bmi <- anno_barplot(annot_map['Bmi_surg'], axis_param = list(side="right", at=c(30, 90 ), gp=gpar(fontsize = 4.5, fontface='bold'))) tmp_ht <- Heatmap(tmp_data, # Column args show_column_names = FALSE, cluster_columns= col_dend, column_title = col_tits, column_title_gp = gpar(fontsize=12, fontface="bold", col=cluster_colors), # column_dend_reorder = TRUE, column_split = 3, column_gap = unit(2, "mm"), # Row args cluster_rows = TRUE, cluster_row_slices = FALSE, show_row_names = FALSE, show_row_dend = FALSE, row_split = kegg_annots, row_title_gp = gpar(fontsize=12, fontface="bold"), row_title_rot = 0, # Annotations top_annotation = ha, # Dimensions height = unit(.025*nrow(tmp_data), 'inch'), width = unit(.025*ncol(tmp_data), 'inch'), # Misc parameters show_heatmap_legend = TRUE, heatmap_legend_param = list(title = "ES", just = "right", legend_width = unit(.0125*ncol(tmp_data), 'inch'), grid_width =unit(.75, 'cm'), direction = "horizontal", title_position = "lefttop") ) ht_height <- ComplexHeatmap:::height(draw(tmp_ht, heatmap_legend_side="bottom")) %>% convertUnit(., 'inch') %>% ceiling ht_width <- ComplexHeatmap:::width(draw(tmp_ht, heatmap_legend_side="bottom")) %>% convertUnit(., 'inch') %>% ceiling # pdf(file = "figure2_heatmap.pdf", # height = ht_height, # width = ht_width, # bg = "transparent") # draw(tmp_ht, heatmap_legend_side="bottom") draw(tmp_ht, show_annotation_legend = FALSE, show_heatmap_legend=FALSE) dev.off()
sig_pth_res <- read_rds("clust_gsva_path_res.rds") %>% filter(adj.P.Val < .001) coef_nms <- c("PLI vs. N&S", "PF vs. N&S", "PF vs. PLI" ) %>% setNames(.,unique(sig_pth_res$coefficient)) sig_pth_res$coefficient <- factor(coef_nms[sig_pth_res$coefficient], levels = coef_nms) cat_pths <- read_delim("NAFLD_progression2path_03-09-2021.txt", "\t", col_names = FALSE) %>% set_names(., c("disease_category", "id")) cat_pths$disease_category <- paste0("C", cat_pths$disease_category) sig_pth_res <- left_join(sig_pth_res, cat_pths)
library(scales) library(cowplot) plot_data <- sig_pth_res %>% select(coefficient, name, category_level_1) %>% distinct %>% group_by(coefficient, category_level_1) %>% summarise(cnt=n()) %>% mutate(prct=cnt/sum(cnt)) %>% ungroup %>% mutate(category_categorization="KEGG Groups") names(plot_data)[2] <- "pathway_category" plot_data2 <- sig_pth_res %>% select(coefficient, name, disease_category) %>% distinct %>% group_by(coefficient, disease_category) %>% summarise(cnt=n()) %>% mutate(prct=cnt/sum(cnt)) %>% ungroup %>% mutate(category_categorization="NAFLD Categories") names(plot_data2)[2] <- "pathway_category" plot_data <- rbind(plot_data, plot_data2) rm(plot_data2) plot_data$category_categorization <- factor(plot_data$category_categorization, levels = c("KEGG Groups", "NAFLD Categories")) lvls <- c(unique(sig_pth_res$category_level_1), unique(cat_pths$disease_category)) %>% rev plot_data$pathway_category <- factor(plot_data$pathway_category, levels=lvls) plot_data$prct <- label_percent(accuracy = .1)(plot_data$prct) plt <- ggplot(data=plot_data, aes(x=pathway_category, y=cnt)) + geom_bar(stat='identity',aes(fill=category_categorization)) + theme(panel.border = element_rect(colour = "black", fill=NA, size=1), text=element_text(face="bold"), axis.title.y = element_blank(), axis.title.x = element_text(face="bold", size=12, colour="black"), axis.text = element_text(face="bold", size=12, colour="black"), aspect.ratio = 1, strip.text = element_text(size =12), panel.spacing = unit(1.5, "lines"), legend.position = "none") + facet_grid(category_categorization ~ coefficient , scales = "free_y") + scale_y_continuous(expand = expansion(mult = c(0, .4))) + coord_flip() plt <- plt + geom_text(aes(x=pathway_category, y=cnt, label=prct),size=3.5, hjust = -.075) + labs(y="Number of differentially enriched pathways") # ggsave("updated_fig3_pathway_summary.pdf",plt, height = 4.5, width = 8.75)
library(readr) library(cowplot) cats_to_keep <- c("C1", "C2", "C3", "C4") pth_data <- read_csv("Data_file_S2_Differentially_enriched_pathways.csv") %>% separate_rows(nafld_categories, sep = '\\|') plot_data <- pth_data %>% # select(-2) %>% mutate(pathway_neg_log10_pval=-log(adj.P.Val)) %>% distinct plot_data <- plot_data %>% group_by(comparison) %>% arrange(desc(pathway_neg_log10_pval), .by_group=FALSE) %>% dplyr::slice(1:10) %>% ungroup plot_data$comparison <- factor(plot_data$comparison, levels = c("PLI vs. N&S", "PF vs. N&S","PF vs. PLI" )) # plot_data <- plot_data %>% arrange(comparison, desc(pathway_logFC)) #plot_data <- plot_data %>% arrange(comparison, pathway_logFC) plot_data <- plot_data %>% arrange(comparison, pathway_neg_log10_pval) plot_data$tmp_pth_name <- paste(plot_data$comparison, plot_data$pathway_name, sep="_") %>% factor(., levels = .) plot_data$fill_factor <- factor(if_else(plot_data$pathway_logFC < 0, 1, 2)) plt <- ggplot(data=plot_data, aes(x=tmp_pth_name, y=pathway_neg_log10_pval, # alpha=pathway_neg_log10_pval, fill=pathway_logFC)) + geom_bar(stat="identity") + coord_flip() + # scale_x_discrete(labels=plot_data$pathway_name) + facet_wrap(~comparison, nrow=3, scales = "free_y") + scale_x_discrete(label=function(x){ x %>% str_split(., '_') %>% map_chr(2) }) + scale_fill_gradient2(low = "blue",mid = "white", high = "red") plt$labels$fill <- "" plt <- plt + theme(legend.title = element_text(face="bold"), legend.text = element_text(face="bold")) # ggsave("log2fc_legend_for_top10_pathways.pdf", get_legend(plt), width = .75, height = 1.5) plt <- plt + #labs(x = '', y="Pathway log2 fold change") + guides(fill=FALSE) + theme_cowplot() + theme(legend.position = "none", plot.margin = unit(c(0, 0, 0, 0), "cm"), axis.title.x = element_blank(), axis.title.y = element_blank(), text=element_text(face="bold")) # ggsave("top_pathways_bar_plot.pdf", aligned_plots[[1]], height = 6.5, width = 7)
library(settleR) cat_list <- pth_data %>% select(disease_category, pathway_name) %>% distinct %>% filter(disease_category %in% cats_to_keep) cat_list$disease_category <- names(cats_to_keep)[match(cat_list$disease_category, cats_to_keep)] # cat_list <- split(cat_list, cat_list$disease_category) %>% # map(function(x) unique(x$pathway_name)) cat_list <- split(cat_list, cat_list$pathway_name) %>% map(function(x) unique(x$disease_category)) cat_mat <- sets_to_matrix(cat_list) cat_mat[cat_mat == 0] <- NA cat_mat <- melt(cat_mat) %>% na.omit cat_mat <- cat_mat[,-3] names(cat_mat) <- c("disease_category", "pathway_name") plot_data <- left_join(plot_data, cat_mat) grid_data <- plot_data %>% split(., .$tmp_pth_name) %>% map(function(x) as.character(x$disease_category)) %>% sets_to_matrix(.) grid_data <- grid_data[sort(row.names(grid_data)),] %>% melt %>% setNames(., c("intersect_id","set_names","observed")) grid_data$observed <- as.logical(grid_data$observed) grid_data$comparison <- str_split(grid_data$set_names, '_') %>% map_chr(1) %>% factor(., levels = levels(plot_data$comparison)) v_max <- max(1, length(unique(grid_data$intersect_id)) -1) v_breaks <- seq(1, v_max, by = 1)+ .5 h_max <- max(1, length(unique(grid_data$set_names)) - 1) h_breaks <- seq(1, h_max, by = 1)+ .5 dot_size <- 4.25 p <- ggplot(data=grid_data, aes(x=intersect_id, y=set_names)) + geom_point(size=dot_size, alpha=.25) + theme_empty() # just plots empty dots # p <- p + # geom_line(data=grid_data[grid_data$observed,], # adds the lines # aes(x=intersect_id, y=set_names, group=set_names), # size=dot_size/3, # inherit.aes = FALSE) p <- p + geom_point(data=grid_data[grid_data$observed,], # Add `filled in points` aes(x=intersect_id, y=set_names), size=dot_size, inherit.aes = FALSE) p <- p + theme(axis.text.y = element_blank(), plot.margin = unit(c(0, 0, 0, 0), "cm")) + geom_vline(xintercept = v_breaks) + geom_hline(yintercept = h_breaks) p <- p + facet_wrap(~comparison, nrow=3, scales = "free_y") aligned_plots <- cowplot::align_plots(plt, p, align="hv", axis="tblr") ggsave("disease_category_grid_plot.pdf", aligned_plots[[2]], height = 6.5, width = 6)
sig_pth_res <- read_rds("clin_gsva_path_res.rds") %>% filter(adj.P.Val < .001) cat_pths <- read_delim("NAFLD_progression2path_01-14-2022.txt", "\t", col_names = FALSE) %>% set_names(., c("disease_category", "id")) sig_pth_res <- left_join(sig_pth_res, cat_pths) levels(sig_pth_res$coefficient) <- c("Lob vs N&S", "Fib vs N&S", "Fib vs Lob") pth_map <- read_delim("pth_map.txt", ";", col_names = FALSE) %>% setNames(., c("nafld_category", "disease_category"))
library(scales) library(cowplot) plot_data <- sig_pth_res %>% select(coefficient, name, category_level_1) %>% distinct %>% group_by(coefficient, category_level_1) %>% summarise(cnt=n()) %>% mutate(prct=cnt/sum(cnt)) %>% ungroup %>% mutate(category_categorization="KEGG Groups") names(plot_data)[2] <- "pathway_category" plot_data2 <- sig_pth_res %>% select(coefficient, name, disease_category) %>% distinct %>% group_by(coefficient, disease_category) %>% summarise(cnt=n()) %>% mutate(prct=cnt/sum(cnt)) %>% ungroup %>% mutate(category_categorization="NAFLD Categories") names(plot_data2)[2] <- "pathway_category" plot_data2$pathway_category <- pth_map$disease_category[plot_data2$pathway_category] plot_data <- rbind(plot_data, plot_data2) rm(plot_data2) plot_data$category_categorization <- factor(plot_data$category_categorization, levels = c("KEGG Groups", "NAFLD Categories")) lvls <- c(sort(unique(sig_pth_res$category_level_1)), pth_map$disease_category) %>% rev lvls <- c(sort(unique(sig_pth_res$category_level_1)), c(pth_map$disease_category, "Uncharacterized")) %>% rev plot_data$pathway_category <- factor(plot_data$pathway_category, levels=lvls) plot_data$prct <- label_percent(accuracy = .1)(plot_data$prct) plt <- ggplot(data=plot_data, aes(x=pathway_category, y=cnt)) + geom_bar(stat='identity',aes(fill=category_categorization)) + theme(panel.border = element_rect(colour = "black", fill=NA, size=1), text=element_text(face="bold"), axis.title.y = element_blank(), axis.title.x = element_text(face="bold", size=12, colour="black"), axis.text = element_text(face="bold", size=12, colour="black"), aspect.ratio = 1, strip.text = element_text(size =12), panel.spacing = unit(.5, "lines"), legend.position = "none") + facet_grid(category_categorization ~ coefficient , scales = "free_y") + scale_y_continuous(limits=c(0, 56),expand=c(0,0)) + # 02/22/22 kludge to make same scale as prev version of fig coord_flip() plt <- plt + geom_text(aes(x=pathway_category, y=cnt, label=prct),size=3.5, hjust = -.075) + labs(y="Number of differentially enriched pathways") plt <- set_panel_size(plt, height = unit(4,"cm"), width = unit(5,"cm")) # ggsave("clinical_rowscaled_fig3_pathway_summary_03-03-22.png", plt, height = 4.5, width = 12, dpi=600)
library(readr) library(cowplot) plot_data <- sig_pth_res %>% filter(disease_category %in% 1:4) %>% select(-disease_category) %>% mutate(pathway_neg_log10_pval=-log(adj.P.Val)) %>% distinct plot_data <- plot_data %>% group_by(coefficient) %>% arrange(desc(pathway_neg_log10_pval), .by_group=FALSE) %>% dplyr::slice(1:10) %>% ungroup plot_data <- plot_data %>% arrange(coefficient, pathway_neg_log10_pval, abs(logFC)) # updated 02/22/22 to add additional sort by NES plot_data$tmp_pth_name <- paste(plot_data$coefficient, plot_data$description, sep="_") %>% factor(., levels = .) plot_data$fill_factor <- factor(if_else(plot_data$logFC < 0, 1, 2)) plt <- ggplot(data=plot_data, aes(x=tmp_pth_name, y=pathway_neg_log10_pval, fill=logFC)) + geom_bar(stat="identity") + coord_flip() + facet_wrap(~coefficient, nrow=3, scales = "free_y") + scale_x_discrete(label=function(x){ x %>% str_split(., '_') %>% map_chr(2) }) + scale_fill_gradient2(low = "blue",mid = "white", high = "red", limits=c(-2,2)) scale_y_continuous(limits=c(0, 60)) # updated 02/22/22 to make limits same as in prev fig plt$labels$fill <- "" plt <- plt + theme(legend.title = element_text(face="bold"), legend.text = element_text(face="bold")) # ggsave("clin_label_gsea_nes_legend_for_top10_pathways.png", get_legend(plt), width = .75, height = 1.5, dpi=600) plt <- plt + guides(fill=FALSE) + theme_cowplot() + theme(legend.position = "none", plot.margin = unit(c(0, 0, 0, 0), "cm"), axis.title.x = element_blank(), axis.title.y = element_blank(), text=element_text(face="bold"))
library(settleR) library(reshape2) cat_list <- sig_pth_res %>% select(disease_category, name) %>% distinct %>% filter(disease_category %in% 1:4) cat_list <- split(cat_list, cat_list$name) %>% map(function(x) paste0("C", unique(x$disease_category))) cat_mat <- sets_to_matrix(cat_list) cat_mat[cat_mat == 0] <- NA cat_mat <- melt(cat_mat) %>% na.omit cat_mat <- cat_mat[,-3] names(cat_mat) <- c("disease_category", "name") plot_data <- left_join(plot_data, cat_mat) grid_data <- plot_data %>% split(., .$tmp_pth_name) %>% map(function(x) as.character(x$disease_category)) %>% sets_to_matrix(.) grid_data <- grid_data[sort(row.names(grid_data)),] %>% melt %>% setNames(., c("intersect_id","set_names","observed")) grid_data$observed <- as.logical(grid_data$observed) grid_data$coefficient <- str_split(grid_data$set_names, '_') %>% map_chr(1) grid_data$coefficient <- factor(grid_data$coefficient, levels = levels(plot_data$coefficient)) v_max <- max(1, length(unique(grid_data$intersect_id)) -1) v_breaks <- seq(1, v_max, by = 1)+ .5 h_max <- max(1, length(unique(grid_data$set_names)) - 1) h_breaks <- seq(1, h_max, by = 1)+ .5 dot_size <- 4.25 p <- ggplot(data=grid_data, aes(x=intersect_id, y=set_names)) + geom_point(size=dot_size, alpha=.25) + theme_empty() # just plots empty dots p <- p + geom_point(data=grid_data[grid_data$observed,], # Add `filled in points` aes(x=intersect_id, y=set_names), size=dot_size, inherit.aes = FALSE) p <- p + theme(axis.text.y = element_blank(), plot.margin = unit(c(0, 0, 0, 0), "cm")) + geom_vline(xintercept = v_breaks) + geom_hline(yintercept = h_breaks) p <- p + facet_wrap(~coefficient, nrow=3, scales = "free_y") out_plt <- plot_grid(plt, NULL, p, rel_widths = c(7.5, .1, 1.5), align = "h", axis='t', nrow = 1) tmp_plt <- set_panel_size(plt, height = unit(4.5,"cm"), width = unit(6,"cm")) # ggsave("clinical_scaled_gsva_logfc_bar_top10_pathways_03-03-22.png", tmp_plt, width = 8, height = 6.5, dpi=600) tmp_plt <- set_panel_size(p, height = unit(4.5,"cm"), width = unit(3.5,"cm")) # ggsave("clinical_scaled_gsva_logfc_grid_top10_pathways_03-03-22.png", tmp_plt, width = 1.25, height = 6.5, dpi=600)
library(eulerr) clust_tmp <- read_rds("clust_gsva_path_res.rds") clust_tmp$coefficient <- factor(clust_tmp$coefficient) clust_tmp <- clust_tmp %>% filter(adj.P.Val < .001) %>% mutate(sig_idx=as.numeric(coefficient)) %>% select(sig_idx, name) %>% split(., .$sig_idx) %>% map(function(x){ unique(x$name) }) clin_tmp <- read_rds("clin_gsva_path_res.rds") %>% filter(adj.P.Val < .001) %>% mutate(sig_idx=as.numeric(coefficient)) %>% select(sig_idx, name) %>% split(., .$sig_idx) %>% map(function(x){ unique(x$name) }) out_list <- 1:3 %>% map(function(i){ tmp_list <- list(clust_tmp[[i]], clin_tmp[[i]]) tmp_nms <- coef_tbl[i,2:3] %>% unlist %>% as.character # Euler doesn't work with names that have & # so going to that by hand names(tmp_list) <- c("clust", "clin") # tmp_list <- rev(tmp_list) return(tmp_list) }) plot_list <- out_list %>% map(function(x){ euler(x) %>% plot(., quantities = list(cex = 1.125), labels = FALSE, adjust_labels=TRUE) # plot(., quantities = list(cex = 1.125), labels = list(cex = 1.25), adjust_labels=TRUE) }) for(i in 1:3){ # fn <- paste0("msig_kegg_clust_v_clin_comparisons_", i, "_03-03-22.png") png(fn, width = 2.5, height = 2.5, units = "in", res=500) print(plot_list[[i]]) dev.off() }
See the Microarray meta-analysis section
See the Machine learning section
See the Pathway concordance section
library(igraph) library(visNetwork) biosnap <- read_rds("PPT-Ohmnet_edgelist_liver.rds") bsp_genes <- do.call(c, biosnap[,1:2]) %>% unique nafld_associated_pths <- c("Non-alcoholic fatty liver disease (NAFLD)", "TNF signaling pathway", "Insulin signaling pathway", "Type II diabetes mellitus", "PI3K-Akt signaling pathway", "Adipocytokine signaling pathway", "PPAR signaling pathway", "Fatty acid biosynthesis", "Protein processing in endoplasmic reticulum", "Oxidative phosphorylation", "Apoptosis") kegg_data <- read_rds("kegg_pathway_entrezgene.rds") %>% setNames(., c("id", "entrezgene")) kegg_data <- left_join(kegg_data, read_rds("kegg_pathway_hierarchy.rds")) gene_res <- read_rds("clust_gene_res.rds") %>% filter(adj.P.Val < .001) nap_genes <- kegg_data %>% filter(description %in% nafld_associated_pths) %>% pull(entrezgene) %>% unique() degs_to_keep <- gene_res %>% filter(entrezgene %in% nap_genes) %>% pull(entrezgene) %>% unique() %>% as.character degs_to_keep <- degs_to_keep[degs_to_keep %in% bsp_genes] mapped_degs <- degs_to_keep[degs_to_keep %in% bsp_genes] g <- graph_from_data_frame(biosnap, directed = FALSE) nds_to_remove <- !V(g)$name %in% mapped_degs g_filt <- delete_vertices(g, nds_to_remove) g_filt <- simplify(g_filt) xx <- as.data.frame(org.Hs.eg.db::org.Hs.egSYMBOL) %>% as.data.frame %>% setNames(., c("entrezgene", "gene_name")) n <- V(g_filt)$name %>% data.frame(id=.) n$label <- xx$gene_name[match(n$id, xx$entrezgene)] e <- as_edgelist(g_filt) %>% as.data.frame %>% setNames(., c("from", "to")) vnet <- visNetwork(n,e) %>% visIgraphLayout() %>% visOptions(highlightNearest = list(enabled = T, degree = 1, hover = T)) # visSave(vnet, file = "network.html")
See
library(flextable) library(officer) sig_pth_res <- read_rds("clust_gsva_path_res.rds") %>% filter(adj.P.Val < .001) coef_nms <- c("PLI vs. N&S", "PF vs. N&S", "PF vs. PLI" ) %>% setNames(.,unique(sig_pth_res$coefficient)) sig_pth_res$coefficient <- factor(coef_nms[sig_pth_res$coefficient], levels = coef_nms) cat_pths <- read_delim("NAFLD_progression2path_03-09-2021.txt", "\t", col_names = FALSE) %>% set_names(., c("nafld_category", "id")) pth_map <- read_delim("pth_map.txt", ";", col_names = FALSE) %>% setNames(., c("nafld_category", "disease_category")) cat_pths <- left_join(cat_pths, pth_map) cat_pths$nafld_category <- paste0("C", cat_pths$nafld_category) # sig_pth_res <- left_join(sig_pth_res, cat_pths) nafld_category <- split(cat_pths, cat_pths$id) %>% map(function(x){ tmp_cats <- x %>% pull(nafld_category) %>% paste0(., collapse = '|') %>% enframe }) %>% rbind_named_df_list(., "id") %>% select(-name) names(nafld_category)[2] <- "nafld_categories" sig_pth_res <- left_join(sig_pth_res, nafld_category) pth_refs <- read_excel("pth_refs.xlsx", sheet = 1) pth_refs <- pth_refs[,c(2,3)] pth_refs <- split(pth_refs, pth_refs$pathway_id) %>% map(function(x){ tmp_cats <- x %>% pull(pmids) %>% paste0(., collapse = '|') %>% enframe }) %>% rbind_named_df_list(., "id") %>% select(-name) names(pth_refs)[2] <- "PMID" tbl_data <- left_join(sig_pth_res, pth_refs) tbl_data <- left_join(tbl_data, nafld_category) fixed_pathway_name <- paste0("(", tbl_data$id, ")") %>% paste(tbl_data$description, .) tbl_data$fixed_pathway_name <- fixed_pathway_name cols_to_keep <- c("coefficient", "fixed_pathway_name", "category_level_1","category_level_2","nafld_categories", "logFC", "adj.P.Val", "PMID") tbl_data <- tbl_data[,cols_to_keep] fixed_nms <- c("coefficient","Pathway name", "KEGG pathway group", "KEGG pathway subgroup", "NAFLD category", "log2 Fold change", "FDR corrected p-value","PMIDs") split_tbls <- split(tbl_data, tbl_data$coefficient) %>% map(function(x){ tmp_tbl <- x %>% arrange(logFC) names(tmp_tbl) <- fixed_nms return(tmp_tbl) })
top_n_drugs <- 25 best_score <- read_rds("cluster_sigs_cmap_lincs17_pert_sums.rds")[["best_score"]] tgts <- read_rds("drugbank_db_metadata_v5.1.4.rds") %>% select(drugbank_id, targets) best_score <- left_join(best_score, tgts) %>% mutate(out_name=paste0(pert_iname, " (", drugbank_id, ")")) pth_cats <- c("Insulin Resistance and Oxidative Stress", "Cell Stress, Apoptosis and Lipotoxicity", "Inflammation", "Fibrosis") sig_map <- tibble(sig_idx=1:12, path_cats=rep(pth_cats, 3)) sig_map$gs_name <- paste0("s", sig_map$sig_idx, ": ", sig_map$path_cats) sig_map$path_cats <- NULL best_score <- right_join(sig_map, best_score) pert_score <- right_join(sig_map, pert_score) tmp1 <- best_score %>% filter(drug_idx <= 20) %>% group_by(pert_iname, out_name, targets) %>% # group_by(pert_iname) %>% arrange(drug_idx, .by_group = TRUE) %>% summarise(ordered_gene_sig=paste(gs_name, collapse = "\n")) %>% ungroup best_score_table <- best_score_top20(best_score)[1:top_n_drugs,] %>% left_join(., tmp1) %>% select(-pert_iname) %>% relocate(out_name) %>% relocate(targets, .after=ordered_gene_sig) best_score_table <- best_score_table %>% arrange(desc(freq), desc(unique_inst)) names(best_score_table) <- c("Drug name (DrugBank ID)", "Gene signature-query frequency", "Unique instances", "Gene signature indices (see Table S4) and their disease categorization", "Canonical targets") tb1 <- make_flextable(best_score_table) %>% border_inner(fp_border(color="gray")) %>% fix_border_issues %>% fontsize(., size = 9, part = "all") %>% set_caption(., "Best score") %>% width(., 1:5, 1) docx <- read_docx() %>% body_add_flextable(., tb1) %>% body_add_flextable(., tb2) # print(docx, target = "cluster_signatures_lincs2017_w_sigs_02-05-22_.docx")
top_n_drugs <- 25 fn <- "cluster_sigs_cmap_lincs20_pert_sums.rds" pert_score <- read_rds(fn)[["prct_67th_score"]] pth_cats <- c("Insulin Resistance and Oxidative Stress", "Cell Stress, Apoptosis and Lipotoxicity", "Inflammation", "Fibrosis") sig_map <- tibble(sig_idx=1:12, path_cats=rep(pth_cats, 3)) sig_map$gs_name <- paste0("s", sig_map$sig_idx, ": ", sig_map$path_cats) sig_map$path_cats <- NULL pert_score <- right_join(sig_map, pert_score) pert_score <- left_join(pert_score, tgts) %>% mutate(out_name=paste0(pert_iname, " (", drugbank_id, ")")) pth_cats <- c("Insulin Resistance and Oxidative Stress", "Cell Stress, Apoptosis and Lipotoxicity", "Inflammation", "Fibrosis") sig_map <- tibble(sig_idx=1:12, path_cats=rep(pth_cats, 3)) sig_map$gs_name <- paste0("s", sig_map$sig_idx, ": ", sig_map$path_cats) sig_map$path_cats <- NULL pert_score <- right_join(sig_map, pert_score) lincs_top20_table <- lincs2017_score_top20(pert_score)[1:top_n_drugs,] %>% left_join(., tmp2) %>% select(-pert_iname) %>% relocate(out_name) %>% relocate(targets, .after=ordered_gene_sig) names(lincs_top20_table) <- c("Drug name (DrugBank ID)", "Gene signature-query frequency", "Gene signature indices (see Table S4) and their disease categorization", "Canonical targets") tb2 <- make_flextable(lincs_top20_table) %>% border_inner(fp_border(color="gray")) %>% fix_border_issues %>% fontsize(., size = 9, part = "all") %>% set_caption(., "LINCS percentile Score") %>% width(., 1:4, 1) docx <- read_docx() %>% body_add_flextable(., tb2) # print(docx, target = "clust_signatures_lincs2020_w_sigs_02-22-22_.docx")
hier_kegg <- read_rds('kegg_pathway_hierarchy.rds') cat_pths <- read_delim("NAFLD_progression2path_03-09-2021.txt", "\t", col_names = FALSE) %>% set_names(., c("nafld_category", "id")) pth_map <- read_delim("pth_map.txt", ";", col_names = FALSE) %>% setNames(., c("nafld_category", "disease_category")) cat_pths <- left_join(cat_pths, pth_map) cat_pths$nafld_category <- NULL kegg_df <- read_rds("c2_kegg_entrez_msigdb_v7.0_database.rds") %>% right_join(., cat_pths) %>% separate_rows(., entrezgene, sep = '\\|') per_gene_kegg <- split(kegg_df, kegg_df$entrezgene) %>% map(function(x){ kegg_pathway_names <- paste0(unique(x$description), collapse = '|') kegg_pathway_ids <- paste0(unique(x$id), collapse = '|') entrezgene <- unique(x$entrezgene) out_df <- tibble(entrezgene, kegg_pathway_names, kegg_pathway_ids) }) %>% do.call(rbind, .) per_gene_kegg <- distinct(per_gene_kegg) adj_p_val_thresh <- .05 cols_to_remove <- c("gene_biotype","description") gene_res <- read_rds('clust_gene_res.rds') %>% select(-cols_to_remove) gene_res$coefficient <- factor(gene_res$coefficient) levels(gene_res$coefficient) <- c("PLI vs. PN&S", "PF vs. PN&S", "PF vs. PLI" ) gene_res2 <- read_rds("clin_gene_res.rds") gene_res2$coefficient <- factor(gene_res2$coefficient, levels = unique(gene_res2$coefficient)) levels(gene_res2$coefficient) <- c("Lob vs N&S", "Fib vs N&S", "Fib vs Lob" ) cols_to_keep <- intersect(colnames(gene_res), colnames(gene_res2)) gene_res <- rbind(gene_res[,cols_to_keep], gene_res2[,cols_to_keep]) gene_res$entrezgene <- as.character(gene_res$entrezgene) sig_gene_res <- gene_res %>% filter(adj.P.Val < adj_p_val_thresh) sig_gene_res <- left_join(sig_gene_res, per_gene_kegg) names(sig_gene_res)[1:4] <- c("comparison", "ensembl_id", "gene_symbol", "entrez_gene_id") # clst_map <- # c("PLI vs. N&S", "PF vs. N&S", "PF vs. PLI") %>% # setNames(., unique(sig_gene_res$comparison)) # sig_gene_res$comparison <- unlist(clst_map[sig_gene_res$comparison]) sig_gene_res <- sig_gene_res %>% relocate(gene_symbol, .before=ensembl_id) %>% relocate(entrez_gene_id, .before=ensembl_id) # write_csv(sig_gene_res, file = "data_files/Data_file_S2_DEG_details.csv")
Updated 03/30/22 to include the clinical stuff
cols_to_remove <- c("name", "category_id") sig_pth_res <- read_rds("clust_gsva_path_res.rds") %>% filter(adj.P.Val < .001) %>% select(-cols_to_remove) sig_pth_res$coefficient <- factor(sig_pth_res$coefficient) levels(sig_pth_res$coefficient) <- c("PLI vs. PN&S", "PF vs. PN&S", "PF vs. PLI") clin_tmp <- read_rds("clin_gsva_path_res.rds") %>% filter(adj.P.Val < .001) clin_tmp <- clin_tmp[,colnames(sig_pth_res)] levels(clin_tmp$coefficient) <- c("Lob vs N&S", "Fib vs N&S", "Fib vs Lob" ) sig_pth_res <- rbind(sig_pth_res, clin_tmp) sig_pth_res <- sig_pth_res %>% relocate(description, .before=id) names(sig_pth_res)[1:2] <- c("comparison", "pathway_name") # sig_pth_res$comparison <- unlist(clst_map[sig_pth_res$comparison]) names(sig_pth_res)[4:5] <- c("kegg_pathway_group", "kegg_pathway_subgroup") nafld_category <- read_delim("NAFLD_progression2path_03-09-2021.txt", "\t", col_names = FALSE) %>% set_names(., c("nafld_categories", "id")) nafld_category$nafld_categories <- paste0("C", nafld_category$nafld_categories) nafld_category <- split(nafld_category, nafld_category$id) %>% map(function(x){ ncats <- paste(unique(x$nafld_categories), collapse = "|") id <- unique(x$id) tibble(nafld_categories=ncats, id) }) %>% bind_rows sig_pth_res <- left_join(sig_pth_res, nafld_category) # write_csv(sig_pth_res, file = "data_files/Data_file_S1_Differentially_enriched_pathways.csv")
gene_sig <- read_rds("cluster_gene_signatures.rds") gene_sig$sig_idx <- 1:12 levels(gene_sig$coefficient) <- c("PLI vs. PN&S", "PF vs. PN&S", "PF vs. PLI") gene_sig$sig_idx <- paste0("s", gene_sig$sig_idx) names(gene_sig)[1:3] <- c("gene_sig_idx","comparison","nafld_pathway_category") gene_sig2 <- read_rds("clin_gene_signatures.rds") levels(gene_sig2$coefficient) <- c("Lob vs N&S", "Fib vs N&S", "Fib vs Lob" ) gene_sig2$sig_idx <- paste0("s*", gene_sig2$sig_idx) names(gene_sig2)[1:3] <- c("gene_sig_idx","comparison","nafld_pathway_category") gene_sig <- rbind(gene_sig, gene_sig2) gene_sig$nafld_pathway_category <- paste0("C", gene_sig$nafld_pathway_category) xx <- as.data.frame(org.Hs.eg.db::org.Hs.egSYMBOL) %>% setNames(., c("entrezgene", "gene_name")) upreg_sig <- gene_sig %>% select(-down_genes_entrez, -n_downreg) %>% separate_rows(up_genes_entrez, sep=";") names(upreg_sig)[4] <- "entrezgene" upreg_sig <- left_join(upreg_sig, xx) %>% group_by(gene_sig_idx, comparison, nafld_pathway_category) %>% summarise(`up-regulated_gene_names`=paste(unique(gene_name), collapse = ";"), `up-regulated_entrez_ids`=paste(unique(entrezgene), collapse = ";")) downreg_sig <- gene_sig %>% select(-up_genes_entrez, -n_upreg) %>% separate_rows(down_genes_entrez, sep=";") names(downreg_sig)[4] <- "entrezgene" downreg_sig <- left_join(downreg_sig, xx) %>% group_by(gene_sig_idx, comparison, nafld_pathway_category) %>% summarise(`down-regulated_gene_names`=paste(unique(gene_name), collapse = ";"), `down-regulated_entrez_ids`=paste(unique(entrezgene), collapse = ";")) out_df <- left_join(upreg_sig, downreg_sig) out_df <- out_df %>% arrange(comparison, nafld_pathway_category) # write_csv(out_df, "Data_file_S3_Gene_signatures.csv")
Updated 03/30/22 Updated 04/11/22 to include results from all signatures
gene_sig_map <- read_csv("data_files_03-30-22/Data_file_S3_Gene_signatures.csv") %>% select(1:3) clust_sig_map <- gene_sig_map[1:12,] %>% mutate(sig_idx=1:12) clin_sig_map <- gene_sig_map[13:24,] %>% mutate(sig_idx=1:12) clust_res_l2017 <- read_rds("cluster_sigs_cmap_lincs17_res.rds") %>% right_join(clust_sig_map, .) %>% select(-sig_idx) clin_res_l2017 <- read_rds("clinical_sigs_cmap_lincs17_res.rds") %>% right_join(clin_sig_map, .) %>% select(-sig_idx) res_2017 <- rbind(clust_res_l2017, clin_res_l2017) clust_res_l2020 <- read_rds("cluster_sigs_cmap_lincs20_res.rds") %>% select(-up_genes, -down_genes) %>% right_join(clust_sig_map, .) %>% select(-sig_idx) clin_res_l2020 <- read_rds("clinical_sigs_cmap_lincs20_res.rds") %>% right_join(clin_sig_map, .) %>% select(-sig_idx) res_2020 <- rbind(clust_res_l2020, clin_res_l2020) res_2020 <- res_2020 %>% filter(!sig_id %in% res_2017$sig_id) rm(clin_res_l2017, clin_res_l2020, clust_res_l2017, clust_res_l2020) trt_cp <- read_rds('GSE92742_inst_metadata.rds') %>% filter(pert_type == "trt_cp") %>% select(sig_id, pert_id) %>% distinct %>% mutate(lincs_db=2017) drg_tgts <- read_rds("drugbank_db_metadata_v5.1.4.rds") %>% select(drugbank_id, targets) %>% distinct %>% na.omit annots <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") trt_cp <- inner_join(trt_cp, annots) %>% select(-name) drb_sigs <- read_rds("cmap2020_drubank_overlap_sig_info_01-24-22.rds") %>% mutate(lincs_db=2020) %>% select(one_of(colnames(trt_cp))) %>% distinct() trt_cp <- rbind(drb_sigs, trt_cp) %>% distinct %>% group_by(sig_id) %>% mutate(lincs_db=paste(sort(unique(lincs_db)), collapse = "|")) %>% left_join(trt_cp, drg_tgts) out_res <- rbind(res_2017, res_2020) %>% inner_join(., trt_cp) %>% relocate(lincs_db, pert_id, pert_iname, drugbank_id, targets, .after = sig_id) %>% select(-es_up, -es_down,) # write_csv(out_res, "data_files_03-30-22/Data_file_S4_All_drugs.csv") # write_csv(out_res, "data_files_04-11-22/Data_file_S4_All_drugs.csv")
Updated 04/11/22 complete re-write
tmp_files <- list.files(".", "_pert_sums") nms <- str_split(tmp_files, '_p') %>% map(1) %>% unlist %>% gsub('sig_', "", .) %>% gsub("lincs", "", .) gene_sig_map$signature_set <- NA gene_sig_map$signature_set[1:12] <- "clust" gene_sig_map$signature_set[13:24] <- "clin" gene_sig_map$sig_idx <- rep(1:12, 2) cols_to_keep <- c("signature_set", "lincs_db", "sig_idx", "pert_id", "pert_sum_score", "pert_iname", "drugbank_id", "drug_idx") lincs_score_out <- map(seq_along(tmp_files), function(idx){ x <- read_rds(paste0('./', tmp_files[[idx]]))[[2]] %>% group_by(sig_idx) %>% arrange(pert_sum_score, .by_group = TRUE) %>% slice(1:20) cur_nm <- nms[idx] %>% str_split(., '_') %>% unlist x <- cbind(signature_set=cur_nm[1], lincs_db=cur_nm[2], x) x <- x[,cols_to_keep] return(x) }) %>% bind_rows %>% mutate(summary_stat="prct_67th_score") best_score_out <- map(seq_along(tmp_files), function(idx){ x <- read_rds(paste0('./', tmp_files[[idx]]))[[1]] %>% group_by(sig_idx) %>% arrange(cmap_score, .by_group = TRUE) %>% slice(1:20) %>% ungroup %>% filter(fdr_p_value < .05) %>% rename(pert_sum_score = cmap_score) cur_nm <- nms[idx] %>% str_split(., '_') %>% unlist x <- cbind(signature_set=cur_nm[1], lincs_db=cur_nm[2], x) x <- x[,cols_to_keep] return(x) }) %>% bind_rows %>% mutate(summary_stat="best_score") out_file <- rbind(best_score_out, lincs_score_out) %>% right_join(gene_sig_map, .) %>% select(-sig_idx, -signature_set) out_file <- out_file %>% relocate(summary_stat, .after = drugbank_id) %>% relocate(pert_sum_score, .after=drug_idx) # write_csv(out_file, file = "data_files_03-30-22/Data_file_S5_CMap_frequency_prioritized_compounds.csv")
03/31/21
drug_list <- read_rds("cmap2020_pert_drugbank_overlap_01-18-22.rds") drug_list2 <- read_rds("GSE92742_perts_in_DrugBank_03-30-21.rds") %>% filter(!drugbank_id %in% drug_list$drugbank_id) %>% # filter(pert_id %in% drug_list$pert_id) %>% select(-name) %>% distinct names(drug_list2)[2] <- "drug_name" drug_list <- rbind(drug_list, drug_list2) %>% distinct network_res <- read_csv("network_proximity_results.csv") %>% arrange(z) names(network_res)[1] <- "drug_name" network_res <- network_res %>% relocate(z, .after=drug_name) tmp_drg_ids <- drug_list %>% select(drugbank_id, drug_name) %>% distinct network_res <- left_join(network_res, tmp_drg_ids) network_res <- network_res %>% relocate(drugbank_id, .after=drug_name) # write_csv(network_res, file="Data_file_S7_network_proximity_results.csv")
See the LAMPS analysis section
See the LAMPS analysis section
library(broom) library(glmnet) library(readxl) mlenet <- read_rds("NAFLD_model/mlenet_model.rds") feats <- tidy(mlenet$glmnet.fit) %>% filter(lambda == mlenet$lambda.1se) %>% filter(term != "") names(feats)[2] <- "ensembl_gene_id" load("gene_annots.RData") gene_annots <- gene_annots %>% rownames_to_column("ensembl_gene_id") %>% select(1,2,5) names(gene_annots)[2] <- "gene_name" feats <- left_join(feats, gene_annots) pmids <- read_excel("NAFLD_model/mlenet_feature_pmids.xlsx") feats <- left_join(feats, pmids) # write_csv(feats, "Data_file_S10_MLENet_features_n71.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.