In lujunyan1118/mofaCLL: Multi-omics reveals clinically relevant proliferative drive associated with mTOR-MYC-OXPHOS activity in chronic lymphocytic leukemia

knitr::opts_chunk$set(message = FALSE, warning = FALSE)
plotDir = ifelse(exists(".standalone"), "", "part3/")
if(plotDir!="") if(!file.exists(plotDir)) dir.create(plotDir)
knitr::opts_chunk$set(fig.path=plotDir, dev=c("png", "pdf"))

#Libraries
library(mofaCLL)
library(glmnet)
library(survival)
library(maxstat)
library(lubridate)
library(survminer)
library(gridExtra)
library(limma)
library(GEOquery)
library(cowplot)
library(DESeq2)
library(tidyverse)

Validate CLL-PD in external, independent cohorts

Test whether CLL-PD can be computed using single-omic dataset

Load datasets

data("mofaOut","gene")
setMap <- read_tsv(system.file("externalData/setToPathway.txt", package = "mofaCLL"), col_types = "cc")

Download methylation dataset from server to a temporary folder

methPath <- file.path(tempdir(),"meth.RData")
if(!file.exists(methPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/meth.RData",
                methPath)
  load(methPath)
} else {
  load(methPath)
}

Get values for LF4

library(MOFA)
facTab <- getFactors(
  MOFAobject, 
  factors = "LF4",
  as.data.frame = TRUE
) %>% as_tibble()
trainData <- getTrainData(MOFAobject)
#unload MOFA package
detach("package:MOFA", unload = TRUE)

Using gene expression dataset for predicting LF4

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["mRNA"]]
X <- X[,complete.cases(t(X))]
sampleOverlap <- intersect(names(y), colnames(X))
y <- y[sampleOverlap]
X <- X[,sampleOverlap]

#remove highly correlated features
X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced

Run regularised linear regression models

set.seed(5862)
rnaRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

Export select gene list from the best model

coefMat <- rnaRes$coefMat
coefTab <- tibble(id = rownames(coefMat),
  coef = coefMat[,which.max(rnaRes$r2Test)]) %>%
  filter(coef != 0) %>%
  mutate(Symbol = sprintf("\textit{%s}",rowData(rna[id,])$symbol)) %>%
  arrange(desc(abs(coef))) %>%
  select(id, Symbol, coef)

geneCoef <- coefTab %>% 
  mutate(coef  = formatC(coef, digits=2)) %>%
  dplyr::rename(`Ensembl gene ID` = id, 
         Coefficient = coef)

geneCoef

Using methylation dataset for predicting LF4

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["Methylation"]]
X <- X[,complete.cases(t(X))]
sampleOverlap <- intersect(names(y), colnames(X))
y <- y[sampleOverlap]
X <- X[,sampleOverlap]

#remove highly correlated features
X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced

Run regularised linear regression models

methRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

Export the select gene list from best model

coefMat <- methRes$coefMat
coefTab <- tibble(id = rownames(coefMat),
  coef = coefMat[,which.max(methRes$r2Test)]) %>%
  filter(coef != 0) %>%
  arrange(desc(abs(coef))) %>%
  select(id, coef)

methCoef <- coefTab %>% 
  mutate(coef  = formatC(coef, digits=2)) %>%
  dplyr::rename(`Probe ID` = id, 
         Coefficient = coef)

methCoef

Using drug responses

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["Drugs"]]
X <- X[,complete.cases(t(X))]
sampleOverlap <- intersect(names(y), colnames(X))
y <- y[sampleOverlap]
X <- X[,sampleOverlap]

#remove highly correlated features
X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced

Run glm model

set.seed(5862)
drugRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

Using gene mutations

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["Mutations"]]

#fill the missing value with majority
X <- apply(X, 1, function(x) {
  xVec <- x
  avgVal <- mean(x,na.rm= TRUE)
  if (avgVal >= 0.5) {
    xVec[is.na(xVec)] <- 1
  } else xVec[is.na(xVec)] <- 0
  xVec
})

sampleOverlap <- intersect(names(y), rownames(X))
y <- y[sampleOverlap]
X <- X[sampleOverlap,]

Run glm model

set.seed(5862)
geneRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

Compare the prediction accuracy (variance explained for CLL-PD)

Summarized variance explained

plotTab <- tibble(r2 = rnaRes$r2Test, set = "mRNA") %>%
  bind_rows(tibble(r2 = methRes$r2Test, set = "Methylation")) %>%
  bind_rows(tibble(r2 = drugRes$r2Test, set = "Drugs")) %>%
  bind_rows(tibble(r2 = geneRes$r2Test, set = "Mutations")) %>%
  group_by(set) %>% summarise(meanR2 = mean(r2, na.rm = TRUE), sdR2 = sd(r2, na.rm = TRUE)) %>%
  arrange(desc(meanR2)) %>% mutate(set = factor(set, levels = set))

plotTabAll <- tibble(r2 = rnaRes$r2Test, set = "mRNA") %>%
  bind_rows(tibble(r2 = methRes$r2Test, set = "Methylation")) %>%
  bind_rows(tibble(r2 = drugRes$r2Test, set = "Drugs")) %>%
  bind_rows(tibble(r2 = geneRes$r2Test, set = "Mutations")) %>%
  mutate(set = factor(set, levels = levels(plotTab$set)))

Summarize number of selected features

featureNum <-  tibble(n = nFeature(rnaRes), set = "mRNA") %>%
  bind_rows(tibble(n = nFeature(methRes), set = "Methylation")) %>%
  bind_rows(tibble(n = nFeature(drugRes), set = "Drugs")) %>%
  bind_rows(tibble(n = nFeature(geneRes), set = "Mutations")) %>%
  group_by(set) %>% summarise(meanN = mean(n, na.rm = TRUE), sdN = sd(n, na.rm = TRUE)) %>%
  mutate(set = factor(set, levels = levels(plotTab$set)))

featureNumAll <-  tibble(n = nFeature(rnaRes), set = "mRNA") %>%
  bind_rows(tibble(n = nFeature(methRes), set = "Methylation")) %>%
  bind_rows(tibble(n = nFeature(drugRes), set = "Drugs")) %>%
  bind_rows(tibble(n = nFeature(geneRes), set = "Mutations")) %>%
  mutate(set = factor(set, levels = levels(plotTab$set)))

varExpBar <- ggplot(plotTab, aes(x=set, y = meanR2, fill = set)) + 
  geom_col(width = 0.4) +
  ggbeeswarm::geom_quasirandom(data = plotTabAll, aes(x=set, y=r2), alpha =0.3, width = 0.2) +
  geom_errorbar(aes(ymin = meanR2 - sdR2, ymax = meanR2 + sdR2), width =0.2) +
  ylim(0,1) + xlab("data set") + ylab(bquote('Variance explained ('*R^2*')')) +
  scale_fill_manual(values = structure(c(colList[1:3],colList[5]),names = as.character(plotTab$set))) +
  theme_full +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust =1, vjust =0.5))

numBar <- ggplot(featureNum, aes(x=set, y = meanN)) + 
  ggbeeswarm::geom_quasirandom(data = featureNumAll, aes(x=set, y =n), width = 0.2, alpha=0.3) +
  geom_point(size=3) +
  geom_errorbar(aes(ymin = meanN - sdN, ymax = meanN + sdN), width =0.2) +
  xlab("") + ylab("Number of features") +
  theme_half + 
  theme(legend.position = "none", panel.border = element_blank(), 
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(), 
        axis.title.x = element_blank(), 
        axis.text.x = element_blank())

varPlot <- plot_grid(varExpBar, numBar, align = "v", nrow =2, rel_heights = c(0.7,0.3))
varPlot

Estimate and test F4 (CLL-PD) in external cohorts using gene expression

ICGC-CLL cohort with RNAseq data (EGAS00001000374)

Pre-processing data

Download ICGC dataset from EMBL server (The raw sequencing, genomic and outcome data can be queried from ICGC portatl: https://icgc.org/icgc/cgp/64/530/826)

icgcPath <- file.path(tempdir(),"rnaICGC.RData")
if(!file.exists(icgcPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/rnaICGC.RData",
                icgcPath)
  load(icgcPath)
} else {
  load(icgcPath)
}

Load and process ICGC expression dataset

rnaExt.vst <- varianceStabilizingTransformation(rnaExternal)

#Adjusted for potential batch effect
exprMat <- assay(rnaExt.vst)
batch <- as.factor(rnaExt.vst$raw_data_accession)
exprMat.combat <- sva::ComBat(exprMat, batch = batch)
assay(rnaExt.vst) <- exprMat.combat

Load and process internal RNAseq dataset

rna$factor <- facTab[match(colnames(rna), facTab$sample),]$value
rna <- rna[,!is.na(rna$factor)] #use samples with LF4 values
rna.vst <- varianceStabilizingTransformation(rna)

Filter out low variant genes in both datasets

exprMat <- assay(rna.vst)
exprMat.ext <- assay(rnaExt.vst)

#icgc dataset
sds <- genefilter::rowSds(exprMat.ext)
exprMat.ext <- exprMat.ext[order(sds,decreasing = TRUE)[1:10000],]

#internal dataset
sds <- genefilter::rowSds(exprMat)
exprMat <- exprMat[order(sds,decreasing = TRUE)[1:10000],]

#subset to keep common genes
overGene <- intersect(rownames(exprMat), rownames(exprMat.ext))
exprMat <- exprMat[overGene,]
exprMat.ext <- exprMat.ext[overGene,]

Remove highly correlated gene in training data

X <- mscale(exprMat)
#remove highly correlated genes
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

Predict factor value in ICGC cohort

Build linear model using in-house CLL dataset

set.seed(5862)

y <- rna.vst$factor
modelTrain <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- modelTrain$modelList[[which.max(modelTrain$r2Train)]] #choose the best model

Predict this factor in the new dataset

newX <- exprMat.ext[colnames(X),]
newX <- t(mscale(newX))
y.pred <- glmnet:::predict.cv.glmnet(useModel,  newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))
dim(newX)

#save predicted values
icgcTab <- tibble(id = names(y.pred), value = y.pred)

Test assocation with clincial outcome

Prepare outcome table based on patient annotations

survT <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>% 
  rownames_to_column("sampleID") %>%
  mutate(predLF = y.pred[sampleID]) %>% 
  dplyr::select(sampleID, treatedAfter, TTT, predLF, OS, died) %>%
  dplyr::rename(patientID = sampleID) %>%
  filter(!(is.na(TTT) & is.na(OS))) #remove samples with none outcome avaialble

#how many samples
nrow(survT)

Prepare risk factor table

riskTab <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>% 
  rownames_to_column("sampleID") %>%
  dplyr::select(sampleID, IGHV, TP53, del17p,  age, sex, SF3B1, NOTCH1) %>%
  mutate(`TP53.del17p` = TP53 | del17p, age = age/10) %>%
  mutate_if(is.logical, as.numeric) %>%
  select(-TP53, -del17p) %>%
  dplyr::rename(patientID = sampleID) %>%
  mutate(LF4 = y.pred[patientID], IGHV =factor(IGHV, levels = c("U","M")))

Univariate test

Overall survival

#calculate p value for cox regression
pOS <- comSurv(survT$predLF, survT$OS, survT$died)
kmOS <- km(survT$predLF, survT$OS, survT$died, "Overall survival (ICGC-CLL)",
   stat = "maxstat",pval = pOS$p, showTable = TRUE)
kmOS

Time to treatment

pTTT <- comSurv(survT$predLF, survT$TTT, survT$treatedAfter)
kmTTT <- km(survT$predLF, survT$TTT, survT$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "maxstat", pval = pTTT$p, showTable = TRUE)
kmTTT

Prepare a summary table

sumOutcome <- bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")) %>% mutate(cohort = "ICGC-CLL", n=nrow(survT))

Combine IGHV and CLL-PD

KM plot for subgroup defined by IGHV status and median latent factor values

groupTab <- survT %>% mutate(IGHV = rnaExternal[,patientID]$IGHV) %>%
  filter(!is.na(IGHV)) %>% 
  mutate(group = ifelse(predLF > median(predLF),"highPD","lowPD")) %>%
  mutate(subgroup = paste0(IGHV,"_",group))

groupList <- list()
# TTT
groupList[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)

# OS
groupList[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)

grid.arrange(grobs = groupList, ncol = 2)

Multivariate test

surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "OS", "died")

summary(surv1)
haOS <- plotHazard(surv1, title = "OS (ICGC-CLL)") + 
  scale_y_log10(limits = c(0.1,5))

haOS

Time to treatment

surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "TTT", "treatedAfter")

summary(surv1)
haTTT <- plotHazard(surv1, title = "TTT (ICGC-CLL)") + 
  scale_y_log10(limits = c(0.1,5))
haTTT

Correlation between predicted CLL-PD and demographics

Age

plotTab <- riskTab %>%
  mutate(C1C2 = paste0("C",rnaExternal[,match(patientID, colnames(rnaExternal))]$`C1C2`),
         sex = ifelse(is.na(sex),NA, ifelse(sex =="f" , "Female","Male")))

corRes <- cor.test(plotTab$age, plotTab$LF4)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoN <- sprintf("N = %s", nrow(filter(plotTab,!is.na(age))))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)

plotAge <- ggplot(plotTab, aes(x = age, y = LF4)) + 
  geom_point(color =colList[3]) + 
  geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
  annotate("text", x = max(plotTab$age), y = Inf, label = annoN,
           hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(plotTab$age), y = Inf, label = annoP,
           hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(plotTab$age), y = Inf, label = annoCoef,
           hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
  ylab("predicted CLL-PD") + xlab("Age (years)") +
  theme_full
plotAge

Sex

corRes <- t.test(LF4 ~ sex, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))

plotTab <- group_by(plotTab, sex) %>% mutate(n=n()) %>% ungroup() %>%
  mutate(sex = sprintf("%s\n(N=%s)",sex,n))

plotSex <- ggplot(plotTab, aes(x = sex, y = LF4)) + 
  geom_violin(aes(fill = sex)) +
  geom_point() + 
  annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1.2, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
  scale_fill_manual(values = colList) +
  ylab("predicted CLL-PD") + xlab("Sex") +
  theme_full + theme(legend.position = "none")
plotSex

Plot C1/C2

plotTab <- filter(plotTab, C1C2 != "CNA")
corRes <- t.test(LF4 ~ C1C2, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))

plotTab <- filter(plotTab, !is.na(C1C2)) %>%
  group_by(C1C2) %>% mutate(n=n()) %>% ungroup() %>%
  mutate(C1C2 = sprintf("%s\n(N=%s)",C1C2,n))

plotC1C2 <- ggplot(filter(plotTab,!is.na(C1C2)), aes(x = C1C2, y = LF4)) + 
  geom_violin(aes(fill = C1C2)) +
  geom_point() + 
  annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1.1, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
  scale_fill_manual(values = colList[3:length(colList)]) +
  ylab("predicted CLL-PD") + xlab("C1/C2 group") +
  theme_full + theme(legend.position = "none")
plotC1C2

Association with genomics

#t-test
y <- y.pred
geneICGC <- geneICGC[names(y),]
dim(geneICGC)
tRes <- apply(geneICGC, 2, function(x) {
  res <- t.test(y ~ as.factor(x), var.equal = TRUE)
  data.frame(p = res$p.value, 
             df = res$estimate[[2]] - res$estimate[[1]])
}) %>% bind_rows() %>% mutate(gene = colnames(geneICGC),
                              p.adj = p.adjust(p, method = "BH")) %>%
  arrange(p)
filter(tRes, p.adj <0.05) %>% mutate_if(is.numeric, formatNum, digits =3, format = "e") %>% DT::datatable()

plotGeneVolcano <- plotVolcano(tRes, posCol = colList[1], negCol = colList[2],
            x_lab = "Difference of mean", ifLabel = TRUE) + 
  theme(legend.position = "none",
        plot.margin = margin(8,8,8,8))
plotGeneVolcano

Assocation with mutation load

Function to plot and annotate assocations

plotCor <- function(plotTab, x, y, x_label, y_label, title, color = "black") {
  corRes <- cor.test(plotTab[[x]], plotTab[[y]], use = "pairwise.complete.obs")
  annoCoef <- paste("'coefficient ='~",format(corRes$estimate,digits = 2))
  annoP <- paste("italic(P)~'='~",formatNum(corRes$p.value, digits = 1, format = "e"))

  ggplot(plotTab, aes_string(x = x, y = y)) + 
    geom_point(shape = 21, fill =color, size=3) + 
    geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
    annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoCoef,
           hjust=1, vjust =1.5, size = 5, parse = TRUE, col= colList[1]) +
    annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoP,
           hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) +
    ylab(y_label) + xlab(x_label) + ggtitle(title) +
    theme_full + theme(plot.margin = margin(8,8,8,8))
}

plotTab <- tibble(value = y.pred,load=rnaExternal[,names(y.pred)]$mutationLoad) %>%
  filter(!is.na(load))

plotMut <- plotCor(plotTab, "value", "load", "CLL-PD", "Total number of mutations",
        sprintf("WGS dataset (ICGC, n=%s)",nrow(plotTab)), colList[[6]])
plotMut

Enrichment analysis

highSet <- c("MYC targets v1", "MYC targets v2", "mTORC1 signaling","Oxidative phosphorylation")
gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(assay(rnaExt.vst[,names(y.pred)]), designMat, gmts$H, 
                       id = rowData(rnaExt.vst)$symbol, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "ICGC-CLL cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichICGC <- enrichRes$enrichPlot
enrichICGC

Three external cohorts with microarray data

Query data from GEO and format datasets

A list object to store external data

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072*1000)
gseList <- list()

GSE22762 (Munic cohort)

An eight-gene expression signature for the prediction of survival and time to treatment in chronic lymphocytic leukemia

gse.all <- getGEO("GSE22762", GSEMatrix = TRUE)
gse <- gse.all[[1]]
gse$patID <- sapply(str_split(gse$title,"[_ ]"), function(x) x[4])
colnames(gse) <- gse$patID

exprs(gse) <- log2(exprs((gse)))

#only keep genes with entrenz ID presented in internal dataset
gse <- gse[!fData(gse)$ENTREZ_GENE_ID %in% c("",NA)]
gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene]


#reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top
sdsNew <- genefilter::rowSds(exprs(gse))
gse <- gse[order(sdsNew, decreasing = TRUE),]


gseList[["GSE22762"]] <- gse

GSE39671 (UCSD cohort)

Expression data from untreated CLL patients

gse <- getGEO("GSE39671", GSEMatrix = TRUE)
gse <- gse[[1]]
exprs(gse) <- log2(exprs((gse)))

#only keep genes with entrenz ID presented in internal dataset
gse <- gse[!fData(gse)$ENTREZ_GENE_ID %in% c("",NA)]
gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene]

#reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top
sdsNew <- genefilter::rowSds(exprs(gse))
gse <- gse[order(sdsNew, decreasing = TRUE),]

gseList[["GSE39671"]] <- gse

GSE10138 (Duke cohort)

A Genomic Approach to Improve Prognosis and Predict Therapeutic Response in Chronic Lymphocytic Leukemia (Duke_VA)

gse <- getGEO("GSE10138", GSEMatrix = TRUE)
gse <- gse[[1]]

patID <- sapply(str_split(gse$title," "),function(x) x[3])
patID <- sapply(str_split(patID,"-"),function(x) x[1])
gse$patID <- patID
gse <- gse[,!duplicated(gse$patID)]
exprs(gse) <- log2(limma::normalizeVSN(gse)) #the raw microarray was stored. Normalization needs to be performed first.

#only keep genes with entrenz ID presented in internal dataset
gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene]


#reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top
sdsNew <- genefilter::rowSds(exprs(gse))
gse <- gse[order(sdsNew, decreasing = TRUE),]


patAnno <- tibble(sampleID = colnames(gse),
                  type = gse$source_name_ch1)

#the object dose not have patient annotation, the clinical outcome data is obtained from the publication: PMID: 19861443
annoPath <- system.file("externalData/GSE10138_patAnno.csv", package = "mofaCLL")

csvAnno <- read.csv2(annoPath, stringsAsFactors = FALSE) %>%
  dplyr::select(Group, TTT..years., Rx) %>%
  rename(TTT = TTT..years., treatedAfter = Rx) %>%
  filter(!is.na(TTT)) %>% 
  mutate(treatedAfter = ifelse(treatedAfter == "Y ",TRUE,FALSE),
         sampleID = colnames(gse)) %>%
  as_tibble() 

patAnno <- left_join(patAnno, csvAnno)

gseList[["GSE10138"]] <- list(gse = gse, patAnno = patAnno)

Test associations with outcomes

Munic cohort (GSE22762)

Preprocessing

Filter out low variant genes in both datasets

gse <- gseList$GSE22762

#internal dataset
rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),]
exprMat <- assay(rnaSub)
sds <- genefilter::rowSds(exprMat)
rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],]

#external
gseMat <- exprs(gse)
sds <- genefilter::rowSds(gseMat)
gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes

#subset for common genes
commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene)
gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),]
rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),])

#get expression matrix
gseMat <- exprs(gseSub)
exprMat <- assay(rnaSub)[rownames(gseMat),]

Model trainning

Remove highly correlated features in the training set

X <- mscale(exprMat)
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

set.seed(5862)
y <- rna.vst$factor
lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]

Predict CLL-PD

Input matrix from new dataset

newX <- gseMat[colnames(X),]
newX <- t(mscale(newX)) 
#dimension of test set
dim(newX)

Predict factor using the best model

y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))

Association with outcomes

pTab <- pData(gse)
patAnno <- tibble(sampleID = rownames(pTab),
                  TTT = as.numeric(pTab$`time to treatment (days):ch1`)/365,
                  treatedAfter = pTab$`treatment status (censoring day):ch1`,
                  OS = as.numeric(pTab$`overall survival (days):ch1`)/365,
                  died = pTab$`life status (censoring day):ch1`) %>%
  mutate(treatedAfter = as.logical(as.numeric(treatedAfter)),
         died = as.logical(as.numeric(died)))


plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>%
  left_join(patAnno, by = "sampleID")

Factor vs TTT (cox regression)

pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)

Factor vs OS (cox regression)

pOS <- comSurv(plotTab$factor, plotTab$OS, plotTab$died)

Add to summary table

sumOutcome <- mutate(bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")),cohort = "Munich",n = nrow(plotTab)) %>% bind_rows(sumOutcome)

KM plots

kmTTT_m1 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval = pTTT$p,  
   titlePlot = "Time to treatment (Munich)", showTable = TRUE)

kmOS_m1 <- km(plotTab$factor, plotTab$OS, plotTab$died, stat = "maxstat", pval =pOS$p,
   titlePlot = "Overall survival (Munich)", showTable = TRUE)
kmOS_m1
kmTTT_m1

Enrichment analysis

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(exprs(gse), designMat, gmts$H, 
                       id =fData(gse)$`Gene Symbol`, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Munich cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichMunich <- enrichRes$enrichPlot
enrichMunich

UCSD cohort (GSE39671)

Preprocessing

gse <- gseList[["GSE39671"]]

#internal dataset
rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),]
exprMat <- assay(rnaSub)
sds <- genefilter::rowSds(exprMat)
rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],]

#external
gseMat <- exprs(gse)
sds <- genefilter::rowSds(gseMat)
gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes

#subset for common genes
commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene)
gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),]
rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),])

#get expression matrix
gseMat <- exprs(gseSub)
exprMat <- assay(rnaSub)[rownames(gseMat),]

Model training

Remove highly correlated features

X <- mscale(exprMat)
#remove highly correlated genes
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

set.seed(5862)
y <- rna.vst$factor
lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]

Predict CLL-PD

Input matrix from new dataset

newX <- gseMat[colnames(X),]
newX <- t(mscale(newX)) 
#dimension of test set
dim(newX)

Predict factor using the best model

y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))

Association with outcome

patAnno <- tibble(sampleID = colnames(gse),
                  TTT = as.numeric(gse$`sampling time to first treatment (years):ch1`),
                  treatedAfter = gse$`treatment event (1:ch1`) %>%
  separate(treatedAfter, c("a","b","treatedAfter"),":") %>%
  mutate(treatedAfter = as.logical(as.numeric(treatedAfter))) %>%
  dplyr::select(sampleID, TTT, treatedAfter)


plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>%
  left_join(patAnno, by = "sampleID")

Factor vs TTT (cox regression)

pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)

Add to summary table

sumOutcome <- mutate(pTTT, outcome = "TTT", cohort = "UCSD",n=nrow(plotTab)) %>% bind_rows(sumOutcome)

KM plots

kmTTT_m2 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval = pTTT$p,  
   titlePlot = "Time to treatment (UCSD)", showTable = TRUE)

kmTTT_m2

Enrichment

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(exprs(gse), designMat, gmts$H, 
                       id =fData(gse)$`Gene Symbol`, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "UCSD cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichUCSD <- enrichRes$enrichPlot
enrichUCSD

Duke cohort (GSE10138)

Preprocessing

gse <- gseList$GSE10138$gse
patAnno <- gseList$GSE10138$patAnno

#internal dataset
rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),]
exprMat <- assay(rnaSub)
sds <- genefilter::rowSds(exprMat)
rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],]

#external
gseMat <- exprs(gse)
sds <- genefilter::rowSds(gseMat)
gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes

#subset for common genes
commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene)
gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),]
rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),])

#get expression matrix
gseMat <- exprs(gseSub)
exprMat <- assay(rnaSub)[rownames(gseMat),]

Model training

Remove highly correlated features

X <- t(mscale(exprMat))
reRes <- removeCorrelated(X, cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

set.seed(5862)
y <- rna.vst$factor

lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]

Predict CLL-PD

newX <- t(mscale(gseMat[colnames(X),]))
#dimension of test set
dim(newX)

Predict factor using the best model

y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))

Association with outcome

plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>%
  left_join(patAnno, by = "sampleID")

Factor vs TTT (cox regression)

pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)

Add information to summary table

sumOutcome <- mutate(pTTT, outcome = "TTT", cohort = "Duke",n=nrow(plotTab)) %>% 
  bind_rows(sumOutcome)

KM plots

kmTTT_m3 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval =pTTT$p,  
   titlePlot = "Time to treatment (Duke)", showTable = TRUE)

kmTTT_m3

Enrichment

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(exprs(gse), designMat, gmts$H, 
                       id =fData(gse)$`Gene Symbol`, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Duke cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichDuke <- enrichRes$enrichPlot
enrichDuke

Summarization plot of p-values and hazzard ratios in external cohorts

plotTab <- sumOutcome %>%
  mutate(cohort = sprintf("%s\n(n=%s)",cohort,n)) %>%
  mutate(cohort = factor(cohort, levels = unique(cohort)))


haSumPlot <- ggplot(plotTab, aes(x=cohort, y = HR, col = outcome, dodge = outcome)) + 
  geom_hline(yintercept = 1, linetype = "dotted") +
  geom_point(position = position_dodge(width=0.8)) +
  geom_errorbar(position = position_dodge(width =0.8), 
                aes(ymin = lower, ymax = higher), width = 0.3, size=1) + 
  geom_text(position = position_dodge2(width = 0.8),
            aes(x=as.numeric(as.factor(cohort))+0.15, 
                label = sprintf("italic(P)~'=%s'",formatNum(p, digits = 1, format = "e"))),
            color = "black",size =5, parse = TRUE) +
  xlab("Cohorts") + ylab("Hazard ratio") +
  scale_y_log10(limits = c(0.3,11)) +
  coord_flip() + theme_full + theme(legend.title = element_blank(),
                                    legend.position = c(0.2,0.1),
                                    legend.background = element_blank(),
                                    legend.key.size = unit(0.5,"cm"),
                                    legend.key.width = unit(0.6,"cm"),
                                    legend.text = element_text(size=rel(1.2))) +
  scale_color_manual(values = c(OS = colList[3], TTT = colList[5])) 

haSumPlot

Arrange plots for Section 2 in the manuscript

Main figure 2

title = ggdraw() + draw_figure_label("Figure 2", fontface = "bold", position = "top.left",size=22)

pout <- ggdraw() +
  draw_plot(varPlot, 0.02, 0.5, 0.22, 0.5) +
  draw_plot(groupList[["TTT"]], 0.25, 0.5 , 0.36, 0.5) +
  draw_plot(groupList[["OS"]], 0.64, 0.5, 0.36, 0.5) +
  draw_plot(haSumPlot, 0, 0, 0.32, 0.465) +
  draw_plot(haTTT, 0.33, 0, 0.30, 0.48) +
  draw_plot(haOS, 0.66, 0, 0.30, 0.48) +
  draw_plot_label(c("a", "b", "c", "d","e","f"), 
                  c(0, 0, 0.33, 0.66, 0.33, 0.65), c(1, 0.5, 1, 1, 0.5,0.5), size = 20)
plot_grid(title, pout, rel_heights = c(0.05,0.95), ncol = 1)

Supplementary figure for KM-plots

plot_grid(kmTTT, kmOS, kmTTT_m1, kmOS_m1, kmTTT_m2, kmTTT_m3, nrow =3)

Supplementary figure for the assocations between predicted CLL-PD and demographics in ICGC cohort

plot_grid(plotAge,plotSex,nrow=1)

Supplementary figure for the associations between predicted CLL-PD and genomics in ICGC cohort

plot_grid(plotGeneVolcano, plotMut,rel_widths = c(1,1), nrow =1, align = "h", axis="t")

Supplementary figure for the enrichment analysis in ICGC cohort

enrichICGC

Supplementary figure for the enrichment analysis in four external cohorts

plot_grid(enrichICGC, enrichMunich, enrichUCSD, enrichDuke, ncol=2,
          align = "hv", axis = "l")

lujunyan1118/mofaCLL documentation built on Dec. 21, 2021, 12:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

lujunyan1118/mofaCLL Multi-omics reveals clinically relevant proliferative drive associated with mTOR-MYC-OXPHOS activity in chronic lymphocytic leukemia

In lujunyan1118/mofaCLL: Multi-omics reveals clinically relevant proliferative drive associated with mTOR-MYC-OXPHOS activity in chronic lymphocytic leukemia

Validate CLL-PD in external, independent cohorts

Test whether CLL-PD can be computed using single-omic dataset

Using gene expression dataset for predicting LF4

Using methylation dataset for predicting LF4

Using drug responses

Using gene mutations

Compare the prediction accuracy (variance explained for CLL-PD)

Estimate and test F4 (CLL-PD) in external cohorts using gene expression

ICGC-CLL cohort with RNAseq data (EGAS00001000374)

Pre-processing data

Predict factor value in ICGC cohort

Test assocation with clincial outcome

Univariate test

Combine IGHV and CLL-PD

Multivariate test

Correlation between predicted CLL-PD and demographics

Association with genomics

Assocation with mutation load

Enrichment analysis

Three external cohorts with microarray data

Query data from GEO and format datasets

GSE22762 (Munic cohort)

GSE39671 (UCSD cohort)

GSE10138 (Duke cohort)

Test associations with outcomes

Munic cohort (GSE22762)

Preprocessing

Model trainning

Predict CLL-PD

Association with outcomes

Enrichment analysis

UCSD cohort (GSE39671)

Preprocessing

Model training

Predict CLL-PD

Association with outcome

Enrichment

Duke cohort (GSE10138)

Preprocessing

Model training

Predict CLL-PD

Association with outcome

Enrichment

Summarization plot of p-values and hazzard ratios in external cohorts

Arrange plots for Section 2 in the manuscript

Main figure 2

Supplementary figure for KM-plots

Supplementary figure for the assocations between predicted CLL-PD and demographics in ICGC cohort

Supplementary figure for the associations between predicted CLL-PD and genomics in ICGC cohort

Supplementary figure for the enrichment analysis in ICGC cohort

Supplementary figure for the enrichment analysis in four external cohorts

R Package Documentation

Browse R Packages

We want your feedback!

lujunyan1118/mofaCLL
Multi-omics reveals clinically relevant proliferative drive associated with mTOR-MYC-OXPHOS activity in chronic lymphocytic leukemia