In lefeverde/basicOmics: Package for basic statistical tests on Omics data

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval=FALSE
)

Introduction

This document shows how the basicOmics can be used to run an RNA-seq analysis. Currently, this is done using the limma-voom method with batch and other nuisance variables accounted for using sva. The functions included in this package are wrappers and other functions which help with automating gene expression analysis. The packages commonly used for expression analysis rely on base R functions and settings for things like model.matrixes, factors. Many of these base functions are product of ~20 years of technical debt which are often non-intuitive^["The more you learn about the R language, the worse it will feel."]. Even with R proficiency, I find myself having to spend a considerable amount of effort verifying that my scripts are doing what I think they should.

The goal of this project is create a set of functions which will help in expression analysis. These are all written to emphasize modularity, simplicity, and the inclusion of explicit errors. I've focused on creating functions which automate the tedious and/or error prone analysis steps.

Creating test data subset

These are steps used to create a subset of the data for testing purposes. At the moment, everything is hardcoded and taken from real data. It might be worthwhile to create synthetic datasets for automated testing.

library(tximport)
library(tibble)
library(dplyr)
load("../../distefanoRNAseq/data-raw/pdata.RData")
load("../../distefanoRNAseq/data-raw/tximport_limma_voom.RData")
load("../../distefanoRNAseq/data-raw/tx2gene.RData")
load("../../distefanoRNAseq/data-raw/gene_annots.RData")
p_sub <- p[colnames(txi_sub$abundance),]
ex_p <- p_sub[,c('file_name', 'sex','age', 'bmi_surg', 'diagnosis', 'group')]
set.seed(42)
ex_p <- ex_p %>% 
  #ex_p[ex_p$group != 'Lob',] %>%
  rownames_to_column(., var='sample_name') %>%
  group_by(., group) %>%
  sample_n(., 10) %>%
  as.data.frame(.)
row.names(ex_p) <- ex_p$sample_name
files  <- paste0('/Users/daniellefever/Desktop/R/thesisAnalysis/data-raw/DiStefano_raw_data/kallisto/', ex_p$file_name, '.gz')
names(files) <- row.names(ex_p)
ex_lengthscaledtpm_txi <- tximport(files, type = 'kallisto', tx2gene = tx2gene, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM')
#save(ex_p, ex_lengthscaledtpm_txi, gene_annots, file='../data-raw/example_raw_data.RData')

Running the pipeline

load('../data-raw/example_raw_data.RData')

library(dplyr)
library(sva)
library(basicOmics)
library(limma)

temp_dge <- basicOmics::dge_wrapper(ex_lengthscaledtpm_txi, ex_p, gene_annots  )
mod <- basicOmics::better_model_matrix(~group, data=ex_p)
mod0 <- model.matrix(~1, data=ex_p)
v_sv <- basicOmics::voom_sva_wrapper(temp_dge, mod, mod0) 
fit <- lmFit(v_sv) %>% 
  eBayes(.)
res <- basicOmics::get_limma_results(fit)

Comparing the Contrast vs leave 1 out

mod <- basicOmics::better_model_matrix(~group, data=ex_p)
v <- voom(temp_dge, mod)
fit <- lmFit(v) %>% 
  eBayes(.) 
res <- basicOmics::get_limma_results(fit)
res$coefficient <-  gsub('_vs_', '-', res$coefficient) 

mod_cont <- basicOmics::better_model_matrix(~0 + group, data=ex_p)
v_cont <- voom(temp_dge, mod_cont)
cont_mat <-  makeContrasts(contrasts = paste0(levels(ex_p$group)[2:length(levels(ex_p$group))], '-NORMAL'), levels=mod_cont)
fit_cont <- lmFit(v_cont) %>% 
  contrasts.fit(., cont_mat) %>% 
  eBayes(.)
cont_res <- basicOmics::get_limma_results(fit_cont, coefs=colnames(cont_mat))

cont_res_sigsub <- cont_res[cont_res$adj.P.Val < .05,]
cont_list <- list(
    dplyr::filter(cont_res_sigsub, logFC >0) %>% 
        split(., .$coefficient) %>% 
        lapply(., function(x){
            x$ensembl_gene_id
        }),
    dplyr::filter(cont_res_sigsub, logFC <0) %>% 
        split(., .$coefficient) %>% 
        lapply(., function(x){
            x$ensembl_gene_id
        })
) %>% setNames(., c( 'upreg', 'downreg'))

res_sigsub <- res[res$adj.P.Val < .05,]
res_list <- list(
    dplyr::filter(res_sigsub, logFC >0) %>% 
        split(., .$coefficient) %>% 
        lapply(., function(x){
            x$ensembl_gene_id
        }),
    dplyr::filter(res_sigsub, logFC <0) %>% 
        split(., .$coefficient) %>% 
        lapply(., function(x){
            x$ensembl_gene_id
        })
) %>% setNames(., c( 'upreg', 'downreg'))



lapply(names(res_list), function(x){
    get_overlap_matix(res_list[[x]], cont_list[[x]])
})

Using SVA

svobj <- sva(v$E, mod, mod0)
modSV <- cbind(mod, svobj$sv)
fit <- lmFit(v, modSV) %>% 
  eBayes(.)
res <- basicOmics::get_limma_results(fit)
sigsub_res <- res[res$adj.P.Val < .05,]


svobj_cont <- sva(v_cont$E, mod_cont, mod0)
modSV_cont <- data.frame(cbind(mod_cont, svobj_cont$sv))
cont_matSV <- makeContrasts(contrasts = paste0(levels(ex_p$group)[2:length(levels(ex_p$group))], '-NORMAL'), levels=modSV_cont)
fit_cont <- lmFit(v_cont, modSV_cont) %>% 
  contrasts.fit(., cont_matSV) %>% 
  eBayes(.)
res_cont <- basicOmics::get_limma_results(fit_cont, coefs = colnames(cont_matSV))
res_cont$coefficient <- gsub('-', '_vs_', res_cont$coefficient)
sigsub_res_cont <- res_cont[res_cont$adj.P.Val < .05,]


lapply(unique(sigsub_res_cont$coefficient), function(x){
  tmp <- sigsub_res[sigsub_res$coefficient == x,]
  tmp_cont <- sigsub_res_cont[sigsub_res_cont$coefficient == x,]
  length(intersect(tmp$ensembl_gene_id, tmp_cont$ensembl_gene_id))/min(length(tmp$ensembl_gene_id), length(tmp_cont$ensembl_gene_id))
})
fib_res <- sigsub_res[sigsub_res$coefficient == 'Fibrosis_vs_NORMAL',]
fib_res_cont <- sigsub_res_cont[sigsub_res_cont$coefficient == 'Fibrosis_vs_NORMAL',]


genes_to_look_at <- c(setdiff( fib_res_cont$ensembl_gene_id, fib_res$ensembl_gene_id),setdiff(fib_res$ensembl_gene_id, fib_res_cont$ensembl_gene_id)
)
tf1 <-  res_cont[res_cont$ensembl_gene_id %in% genes_to_look_at & res_cont$coefficient =='Fibrosis_vs_NORMAL',]
tf2 <-  res[res$ensembl_gene_id %in% genes_to_look_at & res$coefficient =='Fibrosis_vs_NORMAL',]

It looks the contrast and normal (leave one out approach w/ intercept) were virtually indentical. There were a total of 5 genes different between the 2 methods. It looks like this is stochaisticity around the p-values.

Running CAMERA

fp <- '/Users/daniellefever/Desktop/R/reference_files/msigdb_v6.2_files_to_download_locally/msigdb_v6.2_GMTs/'
gmt_files <-  list.files(paste0(fp ), pattern = glob2rx('*symbols.gmt')) %>% 
    .[grepl('c2|c3|c5', .)] %>% 
    .[!grepl('all', .)] %>% 
    c('h.all.v6.2.symbols.gmt', .) 

database_descriptions <- c('hallmark', 'chemical_and_genetic_perturbations', "biocarta", "kegg", "reactome", "canonical_pathways", "micro_rna_targets", "transcription_factor_targets", "go_biological_process", "go_cellular_component", "go_molecular_function") %>% 
  paste(limma::strsplit2(gmt_files, '\\.')[,1], ., sep='-') %>% 
  setNames(., gmt_files)

parsed_gmts <- lapply(gmt_files, function(x){
    temp_fp <- paste0(fp, x)
    print(x)
    cur_gmt <- clusterProfiler::read.gmt(temp_fp)
  }) %>% setNames(., database_descriptions)
gmts_as_vec_lists <- lapply(parsed_gmts, function(x){
  split(x, x$ont) %>% 
    lapply(., function(y){
      y$gene
    })
})
e_uniquefied <- uniquefy_by_variance(v_sv$E, gene_annots, 'external_gene_name')
v_sv_uniquefied <- v_sv[row.names(e_uniquefied),]
row.names(v_sv_uniquefied) <- gene_annots$external_gene_name[match(row.names(v_sv_uniquefied), row.names(gene_annots))]

idx <- ids2indices(parsed_gmts$h.all.v6.2.symbols.gmt,id=rownames(v_sv_uniquefied))

Using CAMERA selected MSigDB

get_combo_gmt_list <- function(parsed_gmt_list){
  out_list <- lapply(parsed_gmt_list, function(x){
    lapply(x, function(y){
        str_c(y, collapse = '/')
    }) %>% 
        do.call(rbind, .) %>% 
        data.frame(.) %>% 
        setNames(., 'genes') %>% 
        rownames_to_column(., 'pathway')
    })  %>% 
    rbind_named_df_list(., 'gene_set_db')
  return(out_list)
}

get_camera_res <- function(v_uniquefied, gmt ){
  d <- v_uniquefied$design
  coefs_to_use <- colnames(d)[grepl('_vs_', colnames(d))]
  idx <- ids2indices(gmt,id=rownames(v_uniquefied))
  cres <- lapply(seq_along(coefs_to_use), function(i){
    cur_coef <- coefs_to_use[i]
    camera(v_uniquefied, idx, contrast = cur_coef, allow.neg.cor = TRUE) %>% 
      rownames_to_column(., 'pathway') %>% 
      dplyr::filter(., FDR < .05)
  }) %>% setNames(., coefs_to_use)
  return(cres)
}

sel_msig_res <- lapply(seq_along(gmts_as_vec_lists), function(gmt_idx){
  tmp_gmt <- gmts_as_vec_lists[[gmt_idx]]
  print(names(gmts_as_vec_lists)[gmt_idx])
  get_camera_res(v_sv_uniquefied, tmp_gmt) %>% 
    rbind_named_df_list(., 'coefficient')
}) %>% setNames(., names(gmts_as_vec_lists))

Checking results

basicOmics::expression_summaries(res)

sigsub <- res[res$adj.P.Val < .05,]

Comparing thresholded data accross groups

get_thresh_list <- function(res_df){
  thresh_list <- res_df %>%
    dplyr::filter(., adj.P.Val < .05) %>%
    pull(logFC) %>%
    abs(.) %>%
    quantile(., probs=c(0,  .75, .975)) 
  nms <- names(thresh_list)
  plt_titles <- sapply(nms,simplify = TRUE, function(x){
    qnt <- gsub('%', '', x) %>% 
      as.numeric(.)
    if(qnt == 0){
      cur_tit <- 'All_DEGs'
    } else {
      cur_prop <- paste0((100 - qnt), '%')
      cur_tit <- paste0('Top_', cur_prop, '_DEGs')
    }
  }) 
  names(thresh_list) <- plt_titles
  return(thresh_list)
}

enrich_res <- 
  split(sigsub, sigsub$coefficient) %>% 
  lapply(., function(x){
    cur_thresh_list <- get_thresh_list(x)
    lapply(seq_along(cur_thresh_list), function(i){
      enrich_wrapper(x,cur_thresh_list[i])
    }) #%>% setNames(., names(cur_thresh_list))
  })

Making figures

enrich_res <- lapply(enrich_res, function(x){
    rbind_named_df_list(x, 'threshold')
}) %>% rbind_named_df_list(., 'coefficient')
enrich_res$threshold <-  limma::strsplit2(enrich_res$threshold, '_')[,2] 
enrich_res$threshold[enrich_res$threshold == 'DEGs'] <- 'ALL'
enrich_res$coefficient <-  limma::strsplit2(enrich_res$coefficient, '_')[,1]
split_by_reg <- split(enrich_res, enrich_res$regulation) 
pth_overlaps <- lapply(split_by_reg, function(x){
  tmp_df <-  data.frame(name=paste0(x$coefficient, '-', x$threshold),
             x$ID) %>% setNames(., c('name', 'ID')) 
  split(tmp_df, tmp_df$name) %>% 
    get_overlap_matix(.)
})
temp_plot <- ggcorrplot::ggcorrplot(pth_overlaps$combined, method = 'square', outline.color = 'black', lab = TRUE, lab_col = 'black') 
temp_plot$labels$fill <- 'Proportion of\noverlapping genes'

temp_plot  + theme(axis.text.x.top = element_text(angle = 90),panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) + scale_x_discrete(position = "bottom") + scale_fill_gradient(low='ivory', high = 'royalblue') 
ggsave(temp_plot,filename = 'overlapping_plot.pdf', height = 6, width = 6)

getwd()