knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval=FALSE )
This document shows how the basicOmics
can be used to run an RNA-seq analysis. Currently, this is done using the limma-voom
method with batch and other nuisance variables accounted for using sva
. The functions included in this package are wrappers and other functions which help with automating gene expression analysis. The packages commonly used for expression analysis rely on base
R
functions and settings for things like model.matrix
es, factor
s. Many of these base
functions are product of ~20 years of technical debt which are often non-intuitive^["The more you learn about the R language, the worse it will feel."]. Even with R
proficiency, I find myself having to spend a considerable amount of effort verifying that my scripts are doing what I think they should.
The goal of this project is create a set of functions which will help in expression analysis. These are all written to emphasize modularity, simplicity, and the inclusion of explicit errors. I've focused on creating functions which automate the tedious and/or error prone analysis steps.
These are steps used to create a subset of the data for testing purposes. At the moment, everything is hardcoded and taken from real data. It might be worthwhile to create synthetic datasets for automated testing.
library(tximport) library(tibble) library(dplyr) load("../../distefanoRNAseq/data-raw/pdata.RData") load("../../distefanoRNAseq/data-raw/tximport_limma_voom.RData") load("../../distefanoRNAseq/data-raw/tx2gene.RData") load("../../distefanoRNAseq/data-raw/gene_annots.RData") p_sub <- p[colnames(txi_sub$abundance),] ex_p <- p_sub[,c('file_name', 'sex','age', 'bmi_surg', 'diagnosis', 'group')] set.seed(42) ex_p <- ex_p %>% #ex_p[ex_p$group != 'Lob',] %>% rownames_to_column(., var='sample_name') %>% group_by(., group) %>% sample_n(., 10) %>% as.data.frame(.) row.names(ex_p) <- ex_p$sample_name files <- paste0('/Users/daniellefever/Desktop/R/thesisAnalysis/data-raw/DiStefano_raw_data/kallisto/', ex_p$file_name, '.gz') names(files) <- row.names(ex_p) ex_lengthscaledtpm_txi <- tximport(files, type = 'kallisto', tx2gene = tx2gene, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM') #save(ex_p, ex_lengthscaledtpm_txi, gene_annots, file='../data-raw/example_raw_data.RData')
load('../data-raw/example_raw_data.RData') library(dplyr) library(sva) library(basicOmics) library(limma) temp_dge <- basicOmics::dge_wrapper(ex_lengthscaledtpm_txi, ex_p, gene_annots ) mod <- basicOmics::better_model_matrix(~group, data=ex_p) mod0 <- model.matrix(~1, data=ex_p) v_sv <- basicOmics::voom_sva_wrapper(temp_dge, mod, mod0) fit <- lmFit(v_sv) %>% eBayes(.) res <- basicOmics::get_limma_results(fit)
mod <- basicOmics::better_model_matrix(~group, data=ex_p) v <- voom(temp_dge, mod) fit <- lmFit(v) %>% eBayes(.) res <- basicOmics::get_limma_results(fit) res$coefficient <- gsub('_vs_', '-', res$coefficient) mod_cont <- basicOmics::better_model_matrix(~0 + group, data=ex_p) v_cont <- voom(temp_dge, mod_cont) cont_mat <- makeContrasts(contrasts = paste0(levels(ex_p$group)[2:length(levels(ex_p$group))], '-NORMAL'), levels=mod_cont) fit_cont <- lmFit(v_cont) %>% contrasts.fit(., cont_mat) %>% eBayes(.) cont_res <- basicOmics::get_limma_results(fit_cont, coefs=colnames(cont_mat)) cont_res_sigsub <- cont_res[cont_res$adj.P.Val < .05,] cont_list <- list( dplyr::filter(cont_res_sigsub, logFC >0) %>% split(., .$coefficient) %>% lapply(., function(x){ x$ensembl_gene_id }), dplyr::filter(cont_res_sigsub, logFC <0) %>% split(., .$coefficient) %>% lapply(., function(x){ x$ensembl_gene_id }) ) %>% setNames(., c( 'upreg', 'downreg')) res_sigsub <- res[res$adj.P.Val < .05,] res_list <- list( dplyr::filter(res_sigsub, logFC >0) %>% split(., .$coefficient) %>% lapply(., function(x){ x$ensembl_gene_id }), dplyr::filter(res_sigsub, logFC <0) %>% split(., .$coefficient) %>% lapply(., function(x){ x$ensembl_gene_id }) ) %>% setNames(., c( 'upreg', 'downreg')) lapply(names(res_list), function(x){ get_overlap_matix(res_list[[x]], cont_list[[x]]) })
svobj <- sva(v$E, mod, mod0) modSV <- cbind(mod, svobj$sv) fit <- lmFit(v, modSV) %>% eBayes(.) res <- basicOmics::get_limma_results(fit) sigsub_res <- res[res$adj.P.Val < .05,] svobj_cont <- sva(v_cont$E, mod_cont, mod0) modSV_cont <- data.frame(cbind(mod_cont, svobj_cont$sv)) cont_matSV <- makeContrasts(contrasts = paste0(levels(ex_p$group)[2:length(levels(ex_p$group))], '-NORMAL'), levels=modSV_cont) fit_cont <- lmFit(v_cont, modSV_cont) %>% contrasts.fit(., cont_matSV) %>% eBayes(.) res_cont <- basicOmics::get_limma_results(fit_cont, coefs = colnames(cont_matSV)) res_cont$coefficient <- gsub('-', '_vs_', res_cont$coefficient) sigsub_res_cont <- res_cont[res_cont$adj.P.Val < .05,] lapply(unique(sigsub_res_cont$coefficient), function(x){ tmp <- sigsub_res[sigsub_res$coefficient == x,] tmp_cont <- sigsub_res_cont[sigsub_res_cont$coefficient == x,] length(intersect(tmp$ensembl_gene_id, tmp_cont$ensembl_gene_id))/min(length(tmp$ensembl_gene_id), length(tmp_cont$ensembl_gene_id)) }) fib_res <- sigsub_res[sigsub_res$coefficient == 'Fibrosis_vs_NORMAL',] fib_res_cont <- sigsub_res_cont[sigsub_res_cont$coefficient == 'Fibrosis_vs_NORMAL',] genes_to_look_at <- c(setdiff( fib_res_cont$ensembl_gene_id, fib_res$ensembl_gene_id),setdiff(fib_res$ensembl_gene_id, fib_res_cont$ensembl_gene_id) ) tf1 <- res_cont[res_cont$ensembl_gene_id %in% genes_to_look_at & res_cont$coefficient =='Fibrosis_vs_NORMAL',] tf2 <- res[res$ensembl_gene_id %in% genes_to_look_at & res$coefficient =='Fibrosis_vs_NORMAL',]
It looks the contrast and normal (leave one out approach w/ intercept) were virtually indentical. There were a total of 5 genes different between the 2 methods. It looks like this is stochaisticity around the p-values.
fp <- '/Users/daniellefever/Desktop/R/reference_files/msigdb_v6.2_files_to_download_locally/msigdb_v6.2_GMTs/' gmt_files <- list.files(paste0(fp ), pattern = glob2rx('*symbols.gmt')) %>% .[grepl('c2|c3|c5', .)] %>% .[!grepl('all', .)] %>% c('h.all.v6.2.symbols.gmt', .) database_descriptions <- c('hallmark', 'chemical_and_genetic_perturbations', "biocarta", "kegg", "reactome", "canonical_pathways", "micro_rna_targets", "transcription_factor_targets", "go_biological_process", "go_cellular_component", "go_molecular_function") %>% paste(limma::strsplit2(gmt_files, '\\.')[,1], ., sep='-') %>% setNames(., gmt_files) parsed_gmts <- lapply(gmt_files, function(x){ temp_fp <- paste0(fp, x) print(x) cur_gmt <- clusterProfiler::read.gmt(temp_fp) }) %>% setNames(., database_descriptions) gmts_as_vec_lists <- lapply(parsed_gmts, function(x){ split(x, x$ont) %>% lapply(., function(y){ y$gene }) }) e_uniquefied <- uniquefy_by_variance(v_sv$E, gene_annots, 'external_gene_name') v_sv_uniquefied <- v_sv[row.names(e_uniquefied),] row.names(v_sv_uniquefied) <- gene_annots$external_gene_name[match(row.names(v_sv_uniquefied), row.names(gene_annots))] idx <- ids2indices(parsed_gmts$h.all.v6.2.symbols.gmt,id=rownames(v_sv_uniquefied))
get_combo_gmt_list <- function(parsed_gmt_list){ out_list <- lapply(parsed_gmt_list, function(x){ lapply(x, function(y){ str_c(y, collapse = '/') }) %>% do.call(rbind, .) %>% data.frame(.) %>% setNames(., 'genes') %>% rownames_to_column(., 'pathway') }) %>% rbind_named_df_list(., 'gene_set_db') return(out_list) } get_camera_res <- function(v_uniquefied, gmt ){ d <- v_uniquefied$design coefs_to_use <- colnames(d)[grepl('_vs_', colnames(d))] idx <- ids2indices(gmt,id=rownames(v_uniquefied)) cres <- lapply(seq_along(coefs_to_use), function(i){ cur_coef <- coefs_to_use[i] camera(v_uniquefied, idx, contrast = cur_coef, allow.neg.cor = TRUE) %>% rownames_to_column(., 'pathway') %>% dplyr::filter(., FDR < .05) }) %>% setNames(., coefs_to_use) return(cres) } sel_msig_res <- lapply(seq_along(gmts_as_vec_lists), function(gmt_idx){ tmp_gmt <- gmts_as_vec_lists[[gmt_idx]] print(names(gmts_as_vec_lists)[gmt_idx]) get_camera_res(v_sv_uniquefied, tmp_gmt) %>% rbind_named_df_list(., 'coefficient') }) %>% setNames(., names(gmts_as_vec_lists))
basicOmics::expression_summaries(res)
sigsub <- res[res$adj.P.Val < .05,]
get_thresh_list <- function(res_df){ thresh_list <- res_df %>% dplyr::filter(., adj.P.Val < .05) %>% pull(logFC) %>% abs(.) %>% quantile(., probs=c(0, .75, .975)) nms <- names(thresh_list) plt_titles <- sapply(nms,simplify = TRUE, function(x){ qnt <- gsub('%', '', x) %>% as.numeric(.) if(qnt == 0){ cur_tit <- 'All_DEGs' } else { cur_prop <- paste0((100 - qnt), '%') cur_tit <- paste0('Top_', cur_prop, '_DEGs') } }) names(thresh_list) <- plt_titles return(thresh_list) } enrich_res <- split(sigsub, sigsub$coefficient) %>% lapply(., function(x){ cur_thresh_list <- get_thresh_list(x) lapply(seq_along(cur_thresh_list), function(i){ enrich_wrapper(x,cur_thresh_list[i]) }) #%>% setNames(., names(cur_thresh_list)) })
enrich_res <- lapply(enrich_res, function(x){ rbind_named_df_list(x, 'threshold') }) %>% rbind_named_df_list(., 'coefficient') enrich_res$threshold <- limma::strsplit2(enrich_res$threshold, '_')[,2] enrich_res$threshold[enrich_res$threshold == 'DEGs'] <- 'ALL' enrich_res$coefficient <- limma::strsplit2(enrich_res$coefficient, '_')[,1] split_by_reg <- split(enrich_res, enrich_res$regulation) pth_overlaps <- lapply(split_by_reg, function(x){ tmp_df <- data.frame(name=paste0(x$coefficient, '-', x$threshold), x$ID) %>% setNames(., c('name', 'ID')) split(tmp_df, tmp_df$name) %>% get_overlap_matix(.) }) temp_plot <- ggcorrplot::ggcorrplot(pth_overlaps$combined, method = 'square', outline.color = 'black', lab = TRUE, lab_col = 'black') temp_plot$labels$fill <- 'Proportion of\noverlapping genes' temp_plot + theme(axis.text.x.top = element_text(angle = 90),panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + scale_x_discrete(position = "bottom") + scale_fill_gradient(low='ivory', high = 'royalblue') ggsave(temp_plot,filename = 'overlapping_plot.pdf', height = 6, width = 6) getwd()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.