In sgosline/mpnstXenoModeling: MPNST Xenograft Modeling

knitr::opts_chunk$set(echo = TRUE)

require(remotes)
if (!require(mpnstXenoModeling)) {
  remotes::install_github("sgosline/mpnstXenoModeling")
  library(mpnstXenoModeling)
}

if (!require("leapR")) {
  remotes::install_github("biodataganache/leapR")
  library(leapR)
}

if (!require("DT")) {
  install.packages("DT")
  library(DT)
}


if(!require('EnsDb.Hsapiens.v86')) {
  BiocManager::install("EnsDb.Hsapiens.v86")
  library('EnsDb.Hsapiens.v86')
}
if(!require('ensembldb')){
  BiocManager::install("ensembldb", repos='http://cran.us.r-project.org')
  library(ensembldb)
}

if(!require('apeglm')){
  BiocManager::install("apeglm", repos='http://cran.us.r-project.org')
  library(apeglm)
}
library(dplyr)

Here we bring in the RNASeq data and the leapR package to measure and identify correlated pathways across the samples.

Load data from synapse query

We have all the data in count.sf files so can read them in using the tximport package. Here is the data for which we have gene expression.

#this function simply loads all the data into memory



pdx_data <-
  mpnstXenoModeling::loadPDXData()#reticulate::import('synapseclient')$login()
#data.tab<-syn$tableQuery('select * from syn24215021')$asDataFrame()
rnaSeq<<-mpnstXenoModeling::loadRNASeqData()


#select clinical variables
var.ID <- rnaSeq %>%
  dplyr::select(Sample,
                synid,
                Age,
                Sex,
                MicroTissueQuality,
                Location,
                Size,
                `Clinical Status`) %>%
  distinct() %>%
  tibble::column_to_rownames('Sample')

DT::datatable(var.ID)

We want to filter for only those patients that we have MT data for.

#var.ID <-
#  subset(var.ID,
#         unlist(MicroTissueQuality) %in% c('Unusable', 'Usable', 'Robust', 'Good'))

#DT::datatable(var.ID)


database <- EnsDb.Hsapiens.v86
pmap <-
  ensembldb::select(
    database,
    keys = list(
      GeneIdFilter(rnaSeq$GENE),
      TxBiotypeFilter("protein_coding")
    ),
    columns = c("GENENAME")
  ) %>%
  dplyr::rename(GENE = 'GENEID') %>%
  right_join(rnaSeq) %>%
  subset(TXBIOTYPE == 'protein_coding') %>%
  dplyr::select(GENE, GENENAME, GENEID) %>% distinct()#%>%
# tibble::column_to_rownames('GENEID')

dds <- rnaSeq %>%
  subset(Sample %in% rownames(var.ID)) %>%
  subset(GENE %in% pmap$GENE) %>% ## this is important as we filter for protein coding genes
  mpnstXenoModeling::deseq2NormFilter()

RNASeq Data

We have now implemented DESeq2 column normalization to enable the visualization of RNASeq Data for the 10 samples. Let's plot them, after normalization, via PCA to see how they cluster and group independently. Then we can interrogate the genes themselves.

library(DESeq2)

vsd <- DESeq2::vst(dds)

How do samples cluster by clinical outcome?

We want to plot the variable genes in a heatmap and also the PCA. First we carry out the variance stabilizing transformation and then do the PCA.

library(ggplot2)
pcaData <- plotPCA(vsd, intgroup=c("Sex", "Clinical Status"), returnData=TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, shape=Sex, color=`Clinical.Status`)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed()+
  ggtitle('Samples by normalized expression') +
  scale_color_viridis_d()

How do samples cluster by MT status?

pcaData <- plotPCA(vsd, intgroup=c("Sex", "MicroTissueQuality"), returnData=TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))

ggplot(pcaData, aes(PC1, PC2, shape = Sex, color = MicroTissueQuality)) +
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed() +
  ggtitle('Samples by normalized expression') +
  scale_color_viridis_d()

Most variable genes

Then we can get the annotations and counts using the variance stabilizing transformation and minimum raw counts value of at least 10.

library(pheatmap)

pat.annote <- rnaSeq %>%
  dplyr::select(Sample, MicroTissueQuality, `Clinical Status`, Sex) %>%
  distinct() %>%
  mutate(MicroTissueQuality = unlist(MicroTissueQuality)) %>%
  tibble::column_to_rownames('Sample')

count.mat <- assay(vsd)#counts(dds,normalized=TRUE)#[symbs,]

keep <- which(rowMin(counts(dds, normalized = TRUE)) > 9)
pat.var <-
  apply(count.mat[keep, ], 1, var) %>% sort()#apply(counts(dds[keep,],normalized=TRUE),1,var)%>%sort()

symbs <- names(pat.var)[1:100]


pheatmap(
  geneIdToSymbolMatrix(count.mat[symbs, ], pmap),
  cellwidth = 10,
  cellheight = 10,
  annotation_col = pat.annote,
  filename = 'mostVarTrans.pdf'
)
pheatmap(
  geneIdToSymbolMatrix(count.mat[symbs, ], pmap),
  cellwidth = 10,
  cellheight = 10,
  annotation_col = pat.annote
)

Correlate all genes across Samples

What pathways are enriched in correlation across these? Need to actually do this!!!

##now let's look at the pathways that are correlated across all
library(leapR)
#data(ncipid)
data("krbpaths")
idx.kegg <- grepl("^KEGG_", krbpaths$names)
names.kegg <- krbpaths$names[idx.kegg]
names.kegg <- sub("KEGG_", "", names.kegg)
names.kegg <- gsub("_", " ", names.kegg)

names.kegg <- sapply(names.kegg,
                     function(y)
                       paste(strwrap(y, 45), collapse = "\n"),
                     USE.NAMES = FALSE)

desc.kegg <- krbpaths$desc[idx.kegg]
sizes.kegg <- krbpaths$sizes[idx.kegg]
Max <- max(sizes.kegg)
matrix.kegg <- krbpaths$matrix[idx.kegg, 1:Max]
keggpaths <- list(
  names = names.kegg,
  desc = desc.kegg,
  sizes = sizes.kegg,
  matrix = matrix.kegg
)

data.mat <- geneIdToSymbolMatrix(count.mat, pmap)
term.2.gene <- as.data.frame(keggpaths$matrix) %>%
  mutate(term = keggpaths$names) %>%
  tidyr::pivot_longer(!term, names_to = "Column", values_to = "gene") %>%
  dplyr::filter(!(gene == "null")) %>%
  dplyr::select(term, gene)
term.2.name <-
  data.frame(term = keggpaths$names, name = keggpaths$names)

res = mpnstXenoModeling::plotCorrelationEnrichment(
  as.matrix(data.mat),
  keggpaths,
  fdr.cutoff = 0.05,
  corr.cutoff = 0.25,
  prefix = 'allSamps',
  width = 11,
  height = 8.5,
  order.by = "Ingroup mean",
  clean.names = FALSE,
  pathway.plot.size = 3
)

MT Differential expression

There seem to be many genes deferentially expressed between MT samples

good <-
  rownames(subset(var.ID, MicroTissueQuality %in% c('Good', 'Robust'))) %>%
  unique()
bad <-
  rownames(subset(var.ID, MicroTissueQuality %in%c('Unusable'))) %>% unique()

qual_res <-
  mpnstXenoModeling::ds2FactorDE(dds, good,  bad, 'MTQuality')


plotTopGenesHeatmap(
  qual_res,
  dds,
  pmap,
  'MTqual',
  patients = colnames(dds),
  newVar = '',
  adjpval = 0.01,
  upload = FALSE,
  parentID = 'syn25323411'
)

res <- pmap %>%
  subset(GENEID %in% rownames(subset(qual_res, padj < 0.01)))

res <- doRegularGo(res$GENENAME, prefix = 'MTqualGO')
#plot(res)

genes.with.values <- qual_res %>%
  rownames_to_column('GENEID') %>% left_join(pmap) %>%
  dplyr::select(Gene = 'GENENAME', value = 'log2FoldChange')

res2 <- doGSEA(genes.with.values, NULL, 'MTquality', useEns = FALSE)
#plot(res2)plot()

Compare with shrinkage

qual_res <- mpnstXenoModeling::ds2FactorDE(dds,  good,
                                           bad, 'MTQuality', doShrinkage = TRUE)

plotTopGenesHeatmap(
  qual_res,
  dds,
  pmap,
  'MTqualShrink',
  patients = c(good, bad),
  adjpval = 0.01,
  upload = TRUE,
  parentID = 'syn25323411'
)



res <- qual_res %>%
  tibble::rownames_to_column('GENEID') %>%
  subset(padj < 0.01) %>%
  left_join(pmap)
#pmap[rownames(subset(qual_res,padj<0.0001)),'GENENAME']%>%
p <- doRegularGo(res$GENENAME, prefix = 'MTqualGOShrink')
#plot(res)

genes.with.values <-
  tibble::rownames_to_column(qual_res, 'GENEID') %>% left_join(pmap) %>%
  dplyr::select(Gene = 'GENENAME', value = 'log2FoldChange')

res2 <- doGSEA(genes.with.values, NULL, 'MTqualityShrink', useEns = FALSE)