knitr::opts_chunk$set(message = FALSE, warning = FALSE) plotDir = ifelse(exists(".standalone"), "", "part3/") if(plotDir!="") if(!file.exists(plotDir)) dir.create(plotDir) knitr::opts_chunk$set(fig.path=plotDir, dev=c("png", "pdf")) #Libraries library(mofaCLL) library(glmnet) library(survival) library(maxstat) library(lubridate) library(survminer) library(gridExtra) library(limma) library(GEOquery) library(cowplot) library(DESeq2) library(tidyverse)
Load datasets
data("mofaOut","gene") setMap <- read_tsv(system.file("externalData/setToPathway.txt", package = "mofaCLL"), col_types = "cc")
Download methylation dataset from server to a temporary folder
methPath <- file.path(tempdir(),"meth.RData") if(!file.exists(methPath)) { download.file("https://www.huber.embl.de/users/jlu/data/meth.RData", methPath) load(methPath) } else { load(methPath) }
Get values for LF4
library(MOFA) facTab <- getFactors( MOFAobject, factors = "LF4", as.data.frame = TRUE ) %>% as_tibble() trainData <- getTrainData(MOFAobject) #unload MOFA package detach("package:MOFA", unload = TRUE)
Prepare model
y <- structure(facTab$value, names = facTab$sample) X <- trainData[["mRNA"]] X <- X[,complete.cases(t(X))] sampleOverlap <- intersect(names(y), colnames(X)) y <- y[sampleOverlap] X <- X[,sampleOverlap] #remove highly correlated features X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced
Run regularised linear regression models
set.seed(5862) rnaRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)
Export select gene list from the best model
coefMat <- rnaRes$coefMat coefTab <- tibble(id = rownames(coefMat), coef = coefMat[,which.max(rnaRes$r2Test)]) %>% filter(coef != 0) %>% mutate(Symbol = sprintf("\textit{%s}",rowData(rna[id,])$symbol)) %>% arrange(desc(abs(coef))) %>% select(id, Symbol, coef) geneCoef <- coefTab %>% mutate(coef = formatC(coef, digits=2)) %>% dplyr::rename(`Ensembl gene ID` = id, Coefficient = coef) geneCoef
Prepare model
y <- structure(facTab$value, names = facTab$sample) X <- trainData[["Methylation"]] X <- X[,complete.cases(t(X))] sampleOverlap <- intersect(names(y), colnames(X)) y <- y[sampleOverlap] X <- X[,sampleOverlap] #remove highly correlated features X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced
Run regularised linear regression models
methRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)
Export the select gene list from best model
coefMat <- methRes$coefMat coefTab <- tibble(id = rownames(coefMat), coef = coefMat[,which.max(methRes$r2Test)]) %>% filter(coef != 0) %>% arrange(desc(abs(coef))) %>% select(id, coef) methCoef <- coefTab %>% mutate(coef = formatC(coef, digits=2)) %>% dplyr::rename(`Probe ID` = id, Coefficient = coef) methCoef
Prepare model
y <- structure(facTab$value, names = facTab$sample) X <- trainData[["Drugs"]] X <- X[,complete.cases(t(X))] sampleOverlap <- intersect(names(y), colnames(X)) y <- y[sampleOverlap] X <- X[,sampleOverlap] #remove highly correlated features X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced
Run glm model
set.seed(5862) drugRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)
Prepare model
y <- structure(facTab$value, names = facTab$sample) X <- trainData[["Mutations"]] #fill the missing value with majority X <- apply(X, 1, function(x) { xVec <- x avgVal <- mean(x,na.rm= TRUE) if (avgVal >= 0.5) { xVec[is.na(xVec)] <- 1 } else xVec[is.na(xVec)] <- 0 xVec }) sampleOverlap <- intersect(names(y), rownames(X)) y <- y[sampleOverlap] X <- X[sampleOverlap,]
Run glm model
set.seed(5862) geneRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)
Summarized variance explained
plotTab <- tibble(r2 = rnaRes$r2Test, set = "mRNA") %>% bind_rows(tibble(r2 = methRes$r2Test, set = "Methylation")) %>% bind_rows(tibble(r2 = drugRes$r2Test, set = "Drugs")) %>% bind_rows(tibble(r2 = geneRes$r2Test, set = "Mutations")) %>% group_by(set) %>% summarise(meanR2 = mean(r2, na.rm = TRUE), sdR2 = sd(r2, na.rm = TRUE)) %>% arrange(desc(meanR2)) %>% mutate(set = factor(set, levels = set)) plotTabAll <- tibble(r2 = rnaRes$r2Test, set = "mRNA") %>% bind_rows(tibble(r2 = methRes$r2Test, set = "Methylation")) %>% bind_rows(tibble(r2 = drugRes$r2Test, set = "Drugs")) %>% bind_rows(tibble(r2 = geneRes$r2Test, set = "Mutations")) %>% mutate(set = factor(set, levels = levels(plotTab$set)))
Summarize number of selected features
featureNum <- tibble(n = nFeature(rnaRes), set = "mRNA") %>% bind_rows(tibble(n = nFeature(methRes), set = "Methylation")) %>% bind_rows(tibble(n = nFeature(drugRes), set = "Drugs")) %>% bind_rows(tibble(n = nFeature(geneRes), set = "Mutations")) %>% group_by(set) %>% summarise(meanN = mean(n, na.rm = TRUE), sdN = sd(n, na.rm = TRUE)) %>% mutate(set = factor(set, levels = levels(plotTab$set))) featureNumAll <- tibble(n = nFeature(rnaRes), set = "mRNA") %>% bind_rows(tibble(n = nFeature(methRes), set = "Methylation")) %>% bind_rows(tibble(n = nFeature(drugRes), set = "Drugs")) %>% bind_rows(tibble(n = nFeature(geneRes), set = "Mutations")) %>% mutate(set = factor(set, levels = levels(plotTab$set)))
varExpBar <- ggplot(plotTab, aes(x=set, y = meanR2, fill = set)) + geom_col(width = 0.4) + ggbeeswarm::geom_quasirandom(data = plotTabAll, aes(x=set, y=r2), alpha =0.3, width = 0.2) + geom_errorbar(aes(ymin = meanR2 - sdR2, ymax = meanR2 + sdR2), width =0.2) + ylim(0,1) + xlab("data set") + ylab(bquote('Variance explained ('*R^2*')')) + scale_fill_manual(values = structure(c(colList[1:3],colList[5]),names = as.character(plotTab$set))) + theme_full + theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust =1, vjust =0.5)) numBar <- ggplot(featureNum, aes(x=set, y = meanN)) + ggbeeswarm::geom_quasirandom(data = featureNumAll, aes(x=set, y =n), width = 0.2, alpha=0.3) + geom_point(size=3) + geom_errorbar(aes(ymin = meanN - sdN, ymax = meanN + sdN), width =0.2) + xlab("") + ylab("Number of features") + theme_half + theme(legend.position = "none", panel.border = element_blank(), axis.line.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank(), axis.text.x = element_blank()) varPlot <- plot_grid(varExpBar, numBar, align = "v", nrow =2, rel_heights = c(0.7,0.3)) varPlot
Download ICGC dataset from EMBL server (The raw sequencing, genomic and outcome data can be queried from ICGC portatl: https://icgc.org/icgc/cgp/64/530/826)
icgcPath <- file.path(tempdir(),"rnaICGC.RData") if(!file.exists(icgcPath)) { download.file("https://www.huber.embl.de/users/jlu/data/rnaICGC.RData", icgcPath) load(icgcPath) } else { load(icgcPath) }
Load and process ICGC expression dataset
rnaExt.vst <- varianceStabilizingTransformation(rnaExternal) #Adjusted for potential batch effect exprMat <- assay(rnaExt.vst) batch <- as.factor(rnaExt.vst$raw_data_accession) exprMat.combat <- sva::ComBat(exprMat, batch = batch) assay(rnaExt.vst) <- exprMat.combat
Load and process internal RNAseq dataset
rna$factor <- facTab[match(colnames(rna), facTab$sample),]$value rna <- rna[,!is.na(rna$factor)] #use samples with LF4 values rna.vst <- varianceStabilizingTransformation(rna)
Filter out low variant genes in both datasets
exprMat <- assay(rna.vst) exprMat.ext <- assay(rnaExt.vst) #icgc dataset sds <- genefilter::rowSds(exprMat.ext) exprMat.ext <- exprMat.ext[order(sds,decreasing = TRUE)[1:10000],] #internal dataset sds <- genefilter::rowSds(exprMat) exprMat <- exprMat[order(sds,decreasing = TRUE)[1:10000],] #subset to keep common genes overGene <- intersect(rownames(exprMat), rownames(exprMat.ext)) exprMat <- exprMat[overGene,] exprMat.ext <- exprMat.ext[overGene,]
Remove highly correlated gene in training data
X <- mscale(exprMat) #remove highly correlated genes reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE) X <- reRes$reduced
Build linear model using in-house CLL dataset
set.seed(5862) y <- rna.vst$factor modelTrain <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se") useModel <- modelTrain$modelList[[which.max(modelTrain$r2Train)]] #choose the best model
Predict this factor in the new dataset
newX <- exprMat.ext[colnames(X),] newX <- t(mscale(newX)) y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1] y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred)) dim(newX)
#save predicted values icgcTab <- tibble(id = names(y.pred), value = y.pred)
Prepare outcome table based on patient annotations
survT <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>% rownames_to_column("sampleID") %>% mutate(predLF = y.pred[sampleID]) %>% dplyr::select(sampleID, treatedAfter, TTT, predLF, OS, died) %>% dplyr::rename(patientID = sampleID) %>% filter(!(is.na(TTT) & is.na(OS))) #remove samples with none outcome avaialble #how many samples nrow(survT)
Prepare risk factor table
riskTab <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>% rownames_to_column("sampleID") %>% dplyr::select(sampleID, IGHV, TP53, del17p, age, sex, SF3B1, NOTCH1) %>% mutate(`TP53.del17p` = TP53 | del17p, age = age/10) %>% mutate_if(is.logical, as.numeric) %>% select(-TP53, -del17p) %>% dplyr::rename(patientID = sampleID) %>% mutate(LF4 = y.pred[patientID], IGHV =factor(IGHV, levels = c("U","M")))
Overall survival
#calculate p value for cox regression pOS <- comSurv(survT$predLF, survT$OS, survT$died) kmOS <- km(survT$predLF, survT$OS, survT$died, "Overall survival (ICGC-CLL)", stat = "maxstat",pval = pOS$p, showTable = TRUE) kmOS
Time to treatment
pTTT <- comSurv(survT$predLF, survT$TTT, survT$treatedAfter) kmTTT <- km(survT$predLF, survT$TTT, survT$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "maxstat", pval = pTTT$p, showTable = TRUE) kmTTT
Prepare a summary table
sumOutcome <- bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")) %>% mutate(cohort = "ICGC-CLL", n=nrow(survT))
KM plot for subgroup defined by IGHV status and median latent factor values
groupTab <- survT %>% mutate(IGHV = rnaExternal[,patientID]$IGHV) %>% filter(!is.na(IGHV)) %>% mutate(group = ifelse(predLF > median(predLF),"highPD","lowPD")) %>% mutate(subgroup = paste0(IGHV,"_",group)) groupList <- list() # TTT groupList[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5) # OS groupList[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5) grid.arrange(grobs = groupList, ncol = 2)
OS
surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "OS", "died") summary(surv1) haOS <- plotHazard(surv1, title = "OS (ICGC-CLL)") + scale_y_log10(limits = c(0.1,5)) haOS
Time to treatment
surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "TTT", "treatedAfter") summary(surv1) haTTT <- plotHazard(surv1, title = "TTT (ICGC-CLL)") + scale_y_log10(limits = c(0.1,5)) haTTT
Age
plotTab <- riskTab %>% mutate(C1C2 = paste0("C",rnaExternal[,match(patientID, colnames(rnaExternal))]$`C1C2`), sex = ifelse(is.na(sex),NA, ifelse(sex =="f" , "Female","Male"))) corRes <- cor.test(plotTab$age, plotTab$LF4) pval <- formatNum(corRes$p.value, digits = 1, format = "e") annoN <- sprintf("N = %s", nrow(filter(plotTab,!is.na(age)))) annoP <- bquote(italic("P")~"="~.(pval)) annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate) plotAge <- ggplot(plotTab, aes(x = age, y = LF4)) + geom_point(color =colList[3]) + geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + annotate("text", x = max(plotTab$age), y = Inf, label = annoN, hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) + annotate("text", x = max(plotTab$age), y = Inf, label = annoP, hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) + annotate("text", x = max(plotTab$age), y = Inf, label = annoCoef, hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) + ylab("predicted CLL-PD") + xlab("Age (years)") + theme_full plotAge
Sex
corRes <- t.test(LF4 ~ sex, plotTab) pval <- formatNum(corRes$p.value, digits = 1, format = "e") annoP <- bquote(italic("P")~"="~.(pval)) plotTab <- group_by(plotTab, sex) %>% mutate(n=n()) %>% ungroup() %>% mutate(sex = sprintf("%s\n(N=%s)",sex,n)) plotSex <- ggplot(plotTab, aes(x = sex, y = LF4)) + geom_violin(aes(fill = sex)) + geom_point() + annotate("text", x = Inf, y = Inf, label = annoP, hjust=1.2, vjust =2, size = 5, parse = FALSE, col= colList[1]) + scale_fill_manual(values = colList) + ylab("predicted CLL-PD") + xlab("Sex") + theme_full + theme(legend.position = "none") plotSex
Plot C1/C2
plotTab <- filter(plotTab, C1C2 != "CNA") corRes <- t.test(LF4 ~ C1C2, plotTab) pval <- formatNum(corRes$p.value, digits = 1, format = "e") annoP <- bquote(italic("P")~"="~.(pval)) plotTab <- filter(plotTab, !is.na(C1C2)) %>% group_by(C1C2) %>% mutate(n=n()) %>% ungroup() %>% mutate(C1C2 = sprintf("%s\n(N=%s)",C1C2,n)) plotC1C2 <- ggplot(filter(plotTab,!is.na(C1C2)), aes(x = C1C2, y = LF4)) + geom_violin(aes(fill = C1C2)) + geom_point() + annotate("text", x = Inf, y = Inf, label = annoP, hjust=1.1, vjust =2, size = 5, parse = FALSE, col= colList[1]) + scale_fill_manual(values = colList[3:length(colList)]) + ylab("predicted CLL-PD") + xlab("C1/C2 group") + theme_full + theme(legend.position = "none") plotC1C2
#t-test y <- y.pred geneICGC <- geneICGC[names(y),] dim(geneICGC) tRes <- apply(geneICGC, 2, function(x) { res <- t.test(y ~ as.factor(x), var.equal = TRUE) data.frame(p = res$p.value, df = res$estimate[[2]] - res$estimate[[1]]) }) %>% bind_rows() %>% mutate(gene = colnames(geneICGC), p.adj = p.adjust(p, method = "BH")) %>% arrange(p) filter(tRes, p.adj <0.05) %>% mutate_if(is.numeric, formatNum, digits =3, format = "e") %>% DT::datatable()
plotGeneVolcano <- plotVolcano(tRes, posCol = colList[1], negCol = colList[2], x_lab = "Difference of mean", ifLabel = TRUE) + theme(legend.position = "none", plot.margin = margin(8,8,8,8)) plotGeneVolcano
Function to plot and annotate assocations
plotCor <- function(plotTab, x, y, x_label, y_label, title, color = "black") { corRes <- cor.test(plotTab[[x]], plotTab[[y]], use = "pairwise.complete.obs") annoCoef <- paste("'coefficient ='~",format(corRes$estimate,digits = 2)) annoP <- paste("italic(P)~'='~",formatNum(corRes$p.value, digits = 1, format = "e")) ggplot(plotTab, aes_string(x = x, y = y)) + geom_point(shape = 21, fill =color, size=3) + geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoCoef, hjust=1, vjust =1.5, size = 5, parse = TRUE, col= colList[1]) + annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoP, hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) + ylab(y_label) + xlab(x_label) + ggtitle(title) + theme_full + theme(plot.margin = margin(8,8,8,8)) }
plotTab <- tibble(value = y.pred,load=rnaExternal[,names(y.pred)]$mutationLoad) %>% filter(!is.na(load)) plotMut <- plotCor(plotTab, "value", "load", "CLL-PD", "Total number of mutations", sprintf("WGS dataset (ICGC, n=%s)",nrow(plotTab)), colList[[6]]) plotMut
highSet <- c("MYC targets v1", "MYC targets v2", "mTORC1 signaling","Oxidative phosphorylation") gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL")) designMat <- model.matrix(~ 1 + y.pred) enrichRes <- runCamera(assay(rnaExt.vst[,names(y.pred)]), designMat, gmts$H, id = rowData(rnaExt.vst)$symbol, method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_", plotTitle = "ICGC-CLL cohort (Hallmarks, 5% FDR)", setToHighlight = highSet, setMap = setMap) enrichICGC <- enrichRes$enrichPlot enrichICGC
A list object to store external data
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072*1000) gseList <- list()
An eight-gene expression signature for the prediction of survival and time to treatment in chronic lymphocytic leukemia
gse.all <- getGEO("GSE22762", GSEMatrix = TRUE) gse <- gse.all[[1]] gse$patID <- sapply(str_split(gse$title,"[_ ]"), function(x) x[4]) colnames(gse) <- gse$patID exprs(gse) <- log2(exprs((gse))) #only keep genes with entrenz ID presented in internal dataset gse <- gse[!fData(gse)$ENTREZ_GENE_ID %in% c("",NA)] gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene] #reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top sdsNew <- genefilter::rowSds(exprs(gse)) gse <- gse[order(sdsNew, decreasing = TRUE),] gseList[["GSE22762"]] <- gse
Expression data from untreated CLL patients
gse <- getGEO("GSE39671", GSEMatrix = TRUE) gse <- gse[[1]] exprs(gse) <- log2(exprs((gse))) #only keep genes with entrenz ID presented in internal dataset gse <- gse[!fData(gse)$ENTREZ_GENE_ID %in% c("",NA)] gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene] #reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top sdsNew <- genefilter::rowSds(exprs(gse)) gse <- gse[order(sdsNew, decreasing = TRUE),] gseList[["GSE39671"]] <- gse
A Genomic Approach to Improve Prognosis and Predict Therapeutic Response in Chronic Lymphocytic Leukemia (Duke_VA)
gse <- getGEO("GSE10138", GSEMatrix = TRUE) gse <- gse[[1]] patID <- sapply(str_split(gse$title," "),function(x) x[3]) patID <- sapply(str_split(patID,"-"),function(x) x[1]) gse$patID <- patID gse <- gse[,!duplicated(gse$patID)] exprs(gse) <- log2(limma::normalizeVSN(gse)) #the raw microarray was stored. Normalization needs to be performed first. #only keep genes with entrenz ID presented in internal dataset gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene] #reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top sdsNew <- genefilter::rowSds(exprs(gse)) gse <- gse[order(sdsNew, decreasing = TRUE),] patAnno <- tibble(sampleID = colnames(gse), type = gse$source_name_ch1) #the object dose not have patient annotation, the clinical outcome data is obtained from the publication: PMID: 19861443 annoPath <- system.file("externalData/GSE10138_patAnno.csv", package = "mofaCLL") csvAnno <- read.csv2(annoPath, stringsAsFactors = FALSE) %>% dplyr::select(Group, TTT..years., Rx) %>% rename(TTT = TTT..years., treatedAfter = Rx) %>% filter(!is.na(TTT)) %>% mutate(treatedAfter = ifelse(treatedAfter == "Y ",TRUE,FALSE), sampleID = colnames(gse)) %>% as_tibble() patAnno <- left_join(patAnno, csvAnno) gseList[["GSE10138"]] <- list(gse = gse, patAnno = patAnno)
Filter out low variant genes in both datasets
gse <- gseList$GSE22762 #internal dataset rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),] exprMat <- assay(rnaSub) sds <- genefilter::rowSds(exprMat) rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],] #external gseMat <- exprs(gse) sds <- genefilter::rowSds(gseMat) gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes #subset for common genes commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene) gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),] rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),]) #get expression matrix gseMat <- exprs(gseSub) exprMat <- assay(rnaSub)[rownames(gseMat),]
Remove highly correlated features in the training set
X <- mscale(exprMat) reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE) X <- reRes$reduced
set.seed(5862) y <- rna.vst$factor lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se") useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]
Input matrix from new dataset
newX <- gseMat[colnames(X),] newX <- t(mscale(newX)) #dimension of test set dim(newX)
Predict factor using the best model
y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1] y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))
pTab <- pData(gse) patAnno <- tibble(sampleID = rownames(pTab), TTT = as.numeric(pTab$`time to treatment (days):ch1`)/365, treatedAfter = pTab$`treatment status (censoring day):ch1`, OS = as.numeric(pTab$`overall survival (days):ch1`)/365, died = pTab$`life status (censoring day):ch1`) %>% mutate(treatedAfter = as.logical(as.numeric(treatedAfter)), died = as.logical(as.numeric(died))) plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>% left_join(patAnno, by = "sampleID")
Factor vs TTT (cox regression)
pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)
Factor vs OS (cox regression)
pOS <- comSurv(plotTab$factor, plotTab$OS, plotTab$died)
Add to summary table
sumOutcome <- mutate(bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")),cohort = "Munich",n = nrow(plotTab)) %>% bind_rows(sumOutcome)
KM plots
kmTTT_m1 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval = pTTT$p, titlePlot = "Time to treatment (Munich)", showTable = TRUE) kmOS_m1 <- km(plotTab$factor, plotTab$OS, plotTab$died, stat = "maxstat", pval =pOS$p, titlePlot = "Overall survival (Munich)", showTable = TRUE) kmOS_m1 kmTTT_m1
gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL")) designMat <- model.matrix(~ 1 + y.pred) enrichRes <- runCamera(exprs(gse), designMat, gmts$H, id =fData(gse)$`Gene Symbol`, method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_", plotTitle = "Munich cohort (Hallmarks, 5% FDR)", setToHighlight = highSet, setMap = setMap) enrichMunich <- enrichRes$enrichPlot enrichMunich
gse <- gseList[["GSE39671"]] #internal dataset rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),] exprMat <- assay(rnaSub) sds <- genefilter::rowSds(exprMat) rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],] #external gseMat <- exprs(gse) sds <- genefilter::rowSds(gseMat) gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes #subset for common genes commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene) gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),] rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),]) #get expression matrix gseMat <- exprs(gseSub) exprMat <- assay(rnaSub)[rownames(gseMat),]
Remove highly correlated features
X <- mscale(exprMat) #remove highly correlated genes reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE) X <- reRes$reduced
set.seed(5862) y <- rna.vst$factor lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se") useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]
Input matrix from new dataset
newX <- gseMat[colnames(X),] newX <- t(mscale(newX)) #dimension of test set dim(newX)
Predict factor using the best model
y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1] y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))
patAnno <- tibble(sampleID = colnames(gse), TTT = as.numeric(gse$`sampling time to first treatment (years):ch1`), treatedAfter = gse$`treatment event (1:ch1`) %>% separate(treatedAfter, c("a","b","treatedAfter"),":") %>% mutate(treatedAfter = as.logical(as.numeric(treatedAfter))) %>% dplyr::select(sampleID, TTT, treatedAfter) plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>% left_join(patAnno, by = "sampleID")
Factor vs TTT (cox regression)
pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)
Add to summary table
sumOutcome <- mutate(pTTT, outcome = "TTT", cohort = "UCSD",n=nrow(plotTab)) %>% bind_rows(sumOutcome)
KM plots
kmTTT_m2 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval = pTTT$p, titlePlot = "Time to treatment (UCSD)", showTable = TRUE) kmTTT_m2
gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL")) designMat <- model.matrix(~ 1 + y.pred) enrichRes <- runCamera(exprs(gse), designMat, gmts$H, id =fData(gse)$`Gene Symbol`, method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_", plotTitle = "UCSD cohort (Hallmarks, 5% FDR)", setToHighlight = highSet, setMap = setMap) enrichUCSD <- enrichRes$enrichPlot enrichUCSD
gse <- gseList$GSE10138$gse patAnno <- gseList$GSE10138$patAnno #internal dataset rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),] exprMat <- assay(rnaSub) sds <- genefilter::rowSds(exprMat) rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],] #external gseMat <- exprs(gse) sds <- genefilter::rowSds(gseMat) gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes #subset for common genes commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene) gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),] rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),]) #get expression matrix gseMat <- exprs(gseSub) exprMat <- assay(rnaSub)[rownames(gseMat),]
Remove highly correlated features
X <- t(mscale(exprMat)) reRes <- removeCorrelated(X, cutoff = 0.9, method = "pearson", record = FALSE) X <- reRes$reduced
set.seed(5862) y <- rna.vst$factor lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se") useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]
newX <- t(mscale(gseMat[colnames(X),])) #dimension of test set dim(newX)
Predict factor using the best model
y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1] y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))
plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>% left_join(patAnno, by = "sampleID")
Factor vs TTT (cox regression)
pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)
Add information to summary table
sumOutcome <- mutate(pTTT, outcome = "TTT", cohort = "Duke",n=nrow(plotTab)) %>% bind_rows(sumOutcome)
KM plots
kmTTT_m3 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval =pTTT$p, titlePlot = "Time to treatment (Duke)", showTable = TRUE) kmTTT_m3
gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL")) designMat <- model.matrix(~ 1 + y.pred) enrichRes <- runCamera(exprs(gse), designMat, gmts$H, id =fData(gse)$`Gene Symbol`, method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_", plotTitle = "Duke cohort (Hallmarks, 5% FDR)", setToHighlight = highSet, setMap = setMap) enrichDuke <- enrichRes$enrichPlot enrichDuke
plotTab <- sumOutcome %>% mutate(cohort = sprintf("%s\n(n=%s)",cohort,n)) %>% mutate(cohort = factor(cohort, levels = unique(cohort))) haSumPlot <- ggplot(plotTab, aes(x=cohort, y = HR, col = outcome, dodge = outcome)) + geom_hline(yintercept = 1, linetype = "dotted") + geom_point(position = position_dodge(width=0.8)) + geom_errorbar(position = position_dodge(width =0.8), aes(ymin = lower, ymax = higher), width = 0.3, size=1) + geom_text(position = position_dodge2(width = 0.8), aes(x=as.numeric(as.factor(cohort))+0.15, label = sprintf("italic(P)~'=%s'",formatNum(p, digits = 1, format = "e"))), color = "black",size =5, parse = TRUE) + xlab("Cohorts") + ylab("Hazard ratio") + scale_y_log10(limits = c(0.3,11)) + coord_flip() + theme_full + theme(legend.title = element_blank(), legend.position = c(0.2,0.1), legend.background = element_blank(), legend.key.size = unit(0.5,"cm"), legend.key.width = unit(0.6,"cm"), legend.text = element_text(size=rel(1.2))) + scale_color_manual(values = c(OS = colList[3], TTT = colList[5])) haSumPlot
title = ggdraw() + draw_figure_label("Figure 2", fontface = "bold", position = "top.left",size=22) pout <- ggdraw() + draw_plot(varPlot, 0.02, 0.5, 0.22, 0.5) + draw_plot(groupList[["TTT"]], 0.25, 0.5 , 0.36, 0.5) + draw_plot(groupList[["OS"]], 0.64, 0.5, 0.36, 0.5) + draw_plot(haSumPlot, 0, 0, 0.32, 0.465) + draw_plot(haTTT, 0.33, 0, 0.30, 0.48) + draw_plot(haOS, 0.66, 0, 0.30, 0.48) + draw_plot_label(c("a", "b", "c", "d","e","f"), c(0, 0, 0.33, 0.66, 0.33, 0.65), c(1, 0.5, 1, 1, 0.5,0.5), size = 20) plot_grid(title, pout, rel_heights = c(0.05,0.95), ncol = 1)
plot_grid(kmTTT, kmOS, kmTTT_m1, kmOS_m1, kmTTT_m2, kmTTT_m3, nrow =3)
plot_grid(plotAge,plotSex,nrow=1)
plot_grid(plotGeneVolcano, plotMut,rel_widths = c(1,1), nrow =1, align = "h", axis="t")
enrichICGC
plot_grid(enrichICGC, enrichMunich, enrichUCSD, enrichDuke, ncol=2, align = "hv", axis = "l")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.