The goal of this document is to provide examples of how to do differential expression analysis using the RNASeq datq processed by the Salmon pipeline and analyzed via DESeq2. We experimented with various forms of normalization and this seems to be ideal given the samples we have.

knitr::opts_chunk$set(echo = TRUE)

require(remotes)
if(!require(mpnstXenoModeling)){
  remotes::install_github('sgosline/mpnstXenoModeling')
  library(mpnstXenoModeling)
}

if(!require('dplyr')){
  install.pacakges('dplyr')
  library(dplyr)
}
if(!require('tidyr')){
  install.pacakges('tidyr')
  library(tidyr)
}
if(!require('EnsDb.Hsapiens.v86')){
  BiocManager::install('EnsDb.Hsapiens.v86')
  library(EnsDb.Hsapiens.v86)
}

Load data from synapse query

The RNASEq data processing was updated on 9/3/2021 so these functions have also been udpated. This pulls the data in from the tximport command of the DESeq2 package and then uses the gene length and sample normalization to account for sample differences. The only clinical variable considered is Sex.

#These three commands load RNAseq data
syn<-reticulate::import('synapseclient')$login()
data.tab<-syn$tableQuery('select * from syn24215021')$asDataFrame()
rnaSeq<<-mpnstXenoModeling::dataFromSynTable(data.tab,syn,'RNASeq')%>%
  mutate(`Clinical Status`=gsub("NED","Alive",gsub('Alive with metastatic disease','Alive',Clinical.Status)))%>%
  tidyr::separate(GENEID,into=c('GENE','VERSION'),remove=FALSE)



master.table<-rnaSeq

##select clinical variables
var.ID<-rnaSeq%>%
  dplyr::select(Sample,synid,Age,Sex,MicroTissueQuality,Location,Size,`Clinical Status`)%>%
  distinct()%>%
  tibble::column_to_rownames('Sample')

##select gene mapping
  if(!require('EnsDb.Hsapiens.v86')){
    BiocManager::install('EnsDb.Hsapiens.v86')
    library(EnsDb.Hsapiens.v86)
  }else{
    library('EnsDb.Hsapiens.v86')
  }
    library(ensembldb)

  database <- EnsDb.Hsapiens.v86
  pmap <- ensembldb::select(database, keys=list(GeneIdFilter(rnaSeq$GENE), TxBiotypeFilter("protein_coding")),
      columns = c("GENENAME"))%>%
    dplyr::rename(GENE='GENEID')%>%
    right_join(rnaSeq)%>%
    subset(TXBIOTYPE=='protein_coding')%>%
    dplyr::select(GENE,GENENAME,GENEID)%>%distinct()%>%
    tibble::column_to_rownames('GENEID')

  dds<-rnaSeq%>%subset(GENE%in%pmap$GENE)%>%
    mpnstXenoModeling::deseq2NormFilter()

Differential expression

The differential expression analysis requires three basic steps: 1. Group the samples into two groups, then compute the log2fold change and significance via the linear model 2. Plot the significant gene in a heatmap 3. Evaluate the functional enrichment of the significant genes only 4. Evaluate the functional enrichment of the ranking of all genes using GSEA. This is particularly useful when we have no significantly different genes.

The following four steps are depicted here, using biological sex as the first variable.

Do DE analysis on sex variable groups

Here we do the four steps described above.

First, compute the differential expresison

sex_res <- mpnstXenoModeling::ds2FactorDE(dds, ids1=rownames(subset(var.ID,Sex=='Male')),
                                    ids2=rownames(subset(var.ID,Sex=='Female')),name='sex',doShrinkage=FALSE)

Then we can plot the top genes in a heatmap. Ideally this will cluster by clinical variable of interest.

mpnstXenoModeling::plotTopGenesHeatmap(sex_res, dds, pmap,'sex',adjpval=0.05, upload=FALSE, parentID='syn25323411')

We can then look for over-represented GO terms and plot those.

genes=pmap[rownames(subset(sex_res,padj<0.05)),'GENENAME']
res<-doRegularGo(genes,prefix='sex')

Lastly we can rank the genes and do Gene Set Enrichment.

genes.with.values<-cbind(sex_res,pmap[rownames(sex_res),])%>%
  dplyr::select(Gene='GENENAME',value='log2FoldChange')

res2=doGSEA(genes.with.values,prot.univ=NULL,prefix='sexDifferences',useEns=FALSE)

Now that we show we can do it for one variable, we can execute this same series of commands on other variables.

Do DE analysis on clinical outcome

First we compare patients who have survived vs. those that have not.

alive<-rownames(subset(var.ID,`Clinical Status`%in%c('Alive','NED','Alive with metastatic disease')))
dead<-rownames(subset(var.ID,`Clinical Status`=='Deceased'))

exist_res <- mpnstXenoModeling::ds2FactorDE(dds, alive,dead,'Clinical')

mpnstXenoModeling::plotTopGenesHeatmap(exist_res, dds, pmap,'clin',patients=c(alive,dead),adjpval=0.05, upload=FALSE, parentID='syn25323411')

pmap[rownames(subset(exist_res,padj<0.05)),'GENENAME']%>%doRegularGo(prefix='prognosisDiff')
genes.with.values<-cbind(exist_res,pmap[rownames(exist_res),])%>%
  dplyr::select(Gene='GENENAME',value='log2FoldChange')

doGSEA(genes.with.values,prot.univ=NULL,prefix='prognosisDiff',useEns=FALSE)

Do DE analysis on age variable groups

Then we can examine if there are pathway level changes in older vs. younger patients.

old<-rownames(subset(var.ID,Age>18))
young<-rownames(subset(var.ID,Age<19))
age_res <- mpnstXenoModeling::ds2FactorDE(dds,old,young,'Age')

mpnstXenoModeling::plotTopGenesHeatmap(age_res, dds, pmap,'age', patients=c(old,young), adjpval=0.05, upload=FALSE, parentID='syn25323411')


res<-pmap[rownames(subset(age_res,padj<0.05)),'GENENAME']%>%doRegularGo(prefix='age')
genes.with.values<-cbind(age_res,pmap[rownames(age_res),])%>%
  dplyr::select(Gene='GENENAME',value='log2FoldChange')


res2<-doGSEA(genes.with.values,NULL,'ageDifferences',useEns=FALSE)

Do DE analysis on sample quality variable groups

Is there a noticeable difference in mt quality? There were a lot of genes so we used only those that fell below 0.01 in significance to plot

good <- rownames(subset(var.ID,MicroTissueQuality%in%c('Good','Robust')))
bad <- rownames(subset(var.ID,MicroTissueQuality=='Unusable'))
qual_res <- mpnstXenoModeling::ds2FactorDE(dds, good,bad,'MTQuality')

mpnstXenoModeling::plotTopGenesHeatmap(qual_res, dds, pmap,'MTqual',patients=c(good,bad),adjpval=0.000001, upload=FALSE, parentID='syn25323411')


res=pmap[rownames(subset(qual_res,padj<0.000001)),'GENENAME']%>%doRegularGo(prefix='MTqual')
plot(res)
genes.with.values<-cbind(qual_res,pmap[rownames(qual_res),])%>%
  dplyr::select(Gene='GENENAME',value='log2FoldChange')

res2<-doGSEA(genes.with.values,NULL,'MTquality',useEns=FALSE)
#plot(res2)

Now that we've done these basic analyses we can integrate other data-derived variables such as drug response into our differential expression pipeline.



sgosline/mpnstXenoModeling documentation built on Dec. 10, 2022, 8:33 a.m.