library(seahorseCLL) library(BloodCancerMultiOmics2017) library(DESeq2) library(limma) library(genefilter) library(ggbeeswarm) library(SummarizedExperiment) library(RColorBrewer) library(piano) library(GEOquery) library(pheatmap) library(grid) library(gridExtra) library(cowplot) library(tidyverse)
plotDir = ifelse(exists(".standalone"), "", "section04/") if(plotDir!="") if(!file.exists(plotDir)) dir.create(plotDir) options(stringsAsFactors=FALSE)
Load data set
data("dds", "patmeta","mutCOM","seaOri")
Only use CLL samples with IGHV annotations and have been used in seahorse experiments
#annotate IGHV status dds$IGHV <- patmeta[match(dds$PatID, rownames(patmeta)),]$IGHV dds$diag <- patmeta[match(dds$PatID, rownames(patmeta)),]$Diagnosis #estimate size factor dds <- estimateSizeFactors(dds) #only choose CLL samples with IGHV annotations and have been used in seahorse experiments ddsCLL <- dds[,dds$diag == "CLL" & !is.na(dds$IGHV) & dds$PatID %in% colnames(seaOri)]
Filter genes
#remove genes without gene symbol annotations ddsCLL <- ddsCLL[! rowData(ddsCLL)$symbol %in% c(NA,""),] ddsCLL <- ddsCLL[rowData(ddsCLL)$chromosome != "Y",] #only keep genes that have counts higher than 10 in any sample keep <- apply(counts(ddsCLL), 1, function(x) any(x >= 10)) ddsCLL <- ddsCLL[keep,] # Remove transcripts which do not show variance across samples. sds <- rowSds(counts(ddsCLL, normalized = TRUE)) sh <- shorth(sds) ddsCLL <- ddsCLL[sds >= sh,] #how many genes do we have nrow(ddsCLL)
Variance stabilizing tranformation
ddsCLL.norm <- varianceStabilizingTransformation(ddsCLL)
Differential expression using DESeq2
ddsCLL$IGHV <- factor(ddsCLL$IGHV, levels = c("U", "M")) design(ddsCLL) <- ~ IGHV ddsCLL <- DESeq(ddsCLL, betaPrior = TRUE)
Get differential expression result
DEres <- as.tibble(results(ddsCLL, tidy = TRUE)) %>% mutate(symbol = rowData(ddsCLL)$symbol)
Function for converting DEseq results to enrichment analysis input
createInput <- function(DEres, pCut = 0.05, ifFDR = FALSE, rankBy = "stat") { if (ifFDR) { inputTab <- filter(DEres, padj <= pCut) } else { inputTab <- filter(DEres, pvalue <= pCut) } inputTab <- arrange(inputTab, pvalue) %>% filter(!duplicated(symbol)) %>% select_("symbol", rankBy) %>% data.frame(stringsAsFactors = FALSE) rownames(inputTab) <- inputTab$symbol inputTab$symbol <- NULL colnames(inputTab) <- "stat" return(inputTab) }
load genesets
gmts = list(H=system.file("extdata","h.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"), C6=system.file("extdata","c6.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"), KEGG=system.file("extdata","c2.cp.kegg.v5.1.symbols.gmt", package = "BloodCancerMultiOmics2017"))
Enrichment analysis using Hallmarks gene set
enRes <- list() inputTab <- createInput(DEres, pCut = 0.1, ifFDR = TRUE) enRes[["Gene enrichment analysis"]] <- runGSEA(inputTab = inputTab, gmtFile = gmts$H, GSAmethod = "page") #remove the HALLMARK_ enRes$`Gene enrichment analysis`$Name <- gsub("HALLMARK_","", enRes$`Gene enrichment analysis`$Name)
Plot hallmark result
enBar <- plotEnrichmentBar(enRes, pCut = 0.1, ifFDR = TRUE, setName = "Hallmark gene sets") plot(enBar)
Prepare the data for heatmap
# load genes in the gene set gsc <- loadGSC(gmts$H) geneList <- gsc$gsc$HALLMARK_GLYCOLYSIS #select differentially expressed genes fdrCut <- 0.10 sigDE <- filter(DEres, padj <= fdrCut, log2FoldChange < 0) %>% filter(symbol %in% geneList) %>% arrange(log2FoldChange) #get the expression matrix plotMat <- assay(ddsCLL.norm[sigDE$row,]) colnames(plotMat) <- ddsCLL.norm$PatID rownames(plotMat) <- sigDE$symbol #sort columns of plot matrix based on trisomy12 status plotMat <- plotMat[,order(ddsCLL$IGHV)] #calculate z-score and sensor plotMat <- t(scale(t(plotMat))) plotMat[plotMat >= 4] <- 4 plotMat[plotMat <= -4] <- -4 annoCol <- data.frame(row.names = ddsCLL.norm$PatID, `IGHV` = ddsCLL.norm$IGHV)
Plot the heatmap
#color for colum annotation annoColor <- list(IGHV = c(M = "red", U = "grey80")) hallHeatmap <- pheatmap(plotMat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), cluster_cols = FALSE, cluster_rows = FALSE, annotation_col = annoCol, annotation_colors = annoColor, show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0, border_color = NA, main = "HALLMARK_GLYCOLYSIS",silent = TRUE)$gtable grid.draw(hallHeatmap)
plotGenes <- c("PFKP","PGAM1","PGK1") plotTab <- data.frame(assay(ddsCLL.norm[rowData(ddsCLL.norm)$symbol %in% plotGenes,])) colnames(plotTab) <- ddsCLL.norm$PatID plotTab$symbol <- rowData(ddsCLL.norm[rowData(ddsCLL.norm)$symbol %in% plotGenes,])$symbol plotTab <- gather(plotTab, key = "PatID", value = "value", -symbol) %>% mutate(IGHV = colData(ddsCLL.norm)[match(PatID, ddsCLL.norm$PatID),]$IGHV) %>% mutate(p = sigDE[match(symbol, sigDE$symbol),]$pvalue) pTab <- distinct(plotTab, symbol, p) %>% arrange(symbol) IGHVcount <- distinct(plotTab, PatID, IGHV) %>% group_by(IGHV) %>% summarise(n = length(IGHV)) %>% mutate(label = sprintf("%s-CLL\n(n=%s)",IGHV, n)) scaleFun <- function(x) sprintf("%.1f",x) beePlot <- ggplot(plotTab, aes(x=IGHV, y = value)) + stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(outlier.shape = NA, col="black", width=0.4) + geom_beeswarm(cex=2, size =0.5, aes(col = IGHV)) + theme_classic() + xlab("") + ylab("normalized expression") + scale_color_manual(values = c("M" = "red","U" = "grey30")) + scale_x_discrete(labels = structure(IGHVcount$label, names = IGHVcount$IGHV)) + scale_y_continuous(expand = expand_scale(add = c(0.1,0.6)), labels = scaleFun) + theme(axis.line.x = element_blank(), axis.ticks.x = element_blank(), axis.title = element_text(size=12, face="bold"), axis.text.y = element_text(size=12), axis.text.x = element_text(size =10), plot.title = element_text(face="bold", hjust=0.5), legend.position = "none", axis.title.x = element_text(face="bold"), strip.background = element_blank(), strip.text = element_text(size=13, face = "bold")) + facet_wrap(~ symbol, scales = "free") #add p value annotations pTab$pt <- sapply(pTab$p, sciPretty) beePlot <- beePlot + geom_text(pTab, mapping = aes(x = 1.5, y = Inf, label = paste("p==~",bquote(.(pt))), hjust=0.5, vjust =1), size =3, parse = T)
title = ggdraw() + draw_figure_label("Figure 3", fontface = "bold", position = "top.left",size=20) p <- ggdraw() + draw_plot(enBar, 0, 0.5, 0.5, 0.45) + draw_plot(hallHeatmap, 0.5, 0, 0.5, 0.95) + draw_plot(beePlot, 0, 0, 0.5, 0.4) + draw_plot_label(c("A","B","C"), c(0, 0.5, 0), c(1, 1, 0.45), size=20) plot_grid(title, p, rel_heights = c(0.05,0.95), ncol = 1)
Get public dataset from Gene Expression Omnibus (GEO)
gse <- getGEO("GSE52774", GSEMatrix = TRUE) gse <- gse[[1]]
Define treatment status
table(gse$`treatment:ch1`) gse$treatment <- factor(ifelse(gse$`treatment:ch1` == "untreated", "untreated","IgM"), levels = c("untreated","IgM"))
Distribution of raw expression values
boxplot(exprs(gse))
Variance stablizing transformation
gse.vst <- gse exprs(gse.vst) <- normalizeVSN(gse)
Remove genes without gene symbol
gse.vst <- gse.vst[fData(gse.vst)$`GENE_SYMBOL`!="",]
Remove invariant genes
sds <- rowSds(exprs(gse.vst)) gse.vst <- gse.vst[sds > shorth(sds),]
PCA
exprMat <- exprs(gse.vst) #using top 5000 variant genes sds <- rowSds(exprMat) exprMat <- exprMat[order(sds, decreasing = TRUE) <= 5000,] pcRes <- prcomp(t(exprMat), center = TRUE, scale. = FALSE) plotTab <- pcRes$x[,c(1,2)] %>% data.frame() %>% rownames_to_column("sampleID") %>% mutate(treatment = gse[,sampleID]$treatment) ggplot(plotTab, aes(x=PC1, y = PC2, color = treatment)) + geom_point() + theme_bw()
Most treated and untreated samples can be clearly separated.
Design matrix
mm <- model.matrix( ~ treatment , pData(gse.vst) )
Run Limma
fit <- lmFit(gse.vst, mm) fit <- eBayes(fit) resTab <- topTable(fit, coef= "treatmentIgM", number = "all")
P value histogram
hist(resTab$P.Value, breaks = 50, main = "p value histogram", xlab = "pvalues")
Run enrichment analysis using PAGE
inputTab <- dplyr::filter(resTab, adj.P.Val < 0.1) %>% dplyr::select(t, GENE_SYMBOL) %>% arrange(desc(abs(t))) %>% distinct(GENE_SYMBOL, .keep_all = TRUE) %>% data.frame() %>% column_to_rownames("GENE_SYMBOL") enRes <- list() gmts = list(H=system.file("extdata","h.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"), C6=system.file("extdata","c6.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"), KEGG=system.file("extdata","c2.cp.kegg.v5.1.symbols.gmt", package = "BloodCancerMultiOmics2017")) enRes[["BCR stimulated by IgM (GSE49695)"]] <- runGSEA(inputTab, gmtFile = gmts$H, GSAmethod = "page")
Plot enrichment p values (only pathways passing 1% FDR are shown)
enBar1 <- seahorseCLL::plotEnrichmentBar(enRes,pCut = 0.01, ifFDR = TRUE, setName = "Hallmark gene sets") grid.draw(enBar1)
Heatmap of genes up-regulated in glycolysis pathway
# load genes in the gene set gsc <- loadGSC(gmts$H) geneList <- gsc$gsc$HALLMARK_GLYCOLYSIS #select differentially expressed genes fdrCut <- 0.10 sigDE <- filter(resTab, adj.P.Val < fdrCut, logFC > 0) %>% filter(GENE_SYMBOL %in% geneList) %>% arrange(desc(logFC)) %>% distinct(GENE_SYMBOL, .keep_all = TRUE) #get the expression matrix plotMat <- exprs(gse.vst[sigDE$ID,]) rownames(plotMat) <- sigDE$GENE_SYMBOL #sort columns of plot matrix based on treatment status plotMat <- plotMat[,order(gse.vst$treatment)] annoCol <- data.frame(row.names = colnames(gse.vst), `treatment` = gse.vst$treatment)
Plot the heatmap
#color for colum annotation annoColor <- list(treatment = c(IgM = "red", untreated = "grey80")) hallHeatmap1 <- pheatmap(plotMat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), scale = "row", cluster_cols = FALSE, cluster_rows = FALSE, annotation_col = annoCol, annotation_colors = annoColor, show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0, border_color = NA, main = "HALLMARK_GLYCOLYSIS (GSE49695)",silent = TRUE)$gtable grid.draw(hallHeatmap1)
Get public dataset from Gene Expression Omnibus (GEO)
gse <- getGEO("GSE30105", GSEMatrix = TRUE) gse <- gse[[1]]
Define treatment status
table(gse$title) gse$treatment <- factor(ifelse(grepl("Unstimulated",gse$title), "untreated","CPG"), levels = c("untreated","CPG"))
Distribution of raw expression values
boxplot(exprs(gse))
Variance stablizing transformation
gse.vst <- gse exprs(gse.vst) <- normalizeVSN(gse)
Remove genes without gene symbol
gse.vst <- gse.vst[fData(gse.vst)$`GENE_SYMBOL`!="",]
Remove invariant genes
sds <- rowSds(exprs(gse.vst)) gse.vst <- gse.vst[sds > shorth(sds),]
PCA
exprMat <- exprs(gse.vst) #using top 5000 variant genes sds <- rowSds(exprMat) exprMat <- exprMat[order(sds, decreasing = TRUE) <= 5000,] pcRes <- prcomp(t(exprMat), center = TRUE, scale. = FALSE) plotTab <- pcRes$x[,c(1,2)] %>% data.frame() %>% rownames_to_column("sampleID") %>% mutate(treatment = gse[,sampleID]$treatment) ggplot(plotTab, aes(x=PC1, y = PC2, color = treatment)) + geom_point() + theme_bw()
Most treated and untreated samples can be clearly separated.
Design matrix
mm <- model.matrix( ~ treatment , pData(gse.vst) )
Run Limma
fit <- lmFit(gse.vst, mm) fit <- eBayes(fit) resTab <- topTable(fit, coef= "treatmentCPG", number = "all")
P value histogram
hist(resTab$P.Value, breaks = 50, main = "p value histogram", xlab = "pvalues")
Run enrichment analysis using PAGE
inputTab <- dplyr::filter(resTab, adj.P.Val < 0.1) %>% dplyr::select(t, GENE_SYMBOL) %>% arrange(desc(abs(t))) %>% distinct(GENE_SYMBOL, .keep_all = TRUE) %>% data.frame() %>% column_to_rownames("GENE_SYMBOL") enRes <- list() gmts = list(H=system.file("extdata","h.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"), C6=system.file("extdata","c6.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"), KEGG=system.file("extdata","c2.cp.kegg.v5.1.symbols.gmt", package = "BloodCancerMultiOmics2017")) enRes[["BCR stimulated by CPG (GSE30105)"]] <- runGSEA(inputTab, gmtFile = gmts$H, GSAmethod = "page")
Plot enrichment p values (only pathways passing 1% FDR are shown)
enBar2 <- seahorseCLL::plotEnrichmentBar(enRes,pCut = 0.01, ifFDR = TRUE,setName = "Hallmark gene sets") grid.draw(enBar2)
Heatmap of genes up-regulated in glycolysis pathway
# load genes in the gene set gsc <- loadGSC(gmts$H) geneList <- gsc$gsc$HALLMARK_GLYCOLYSIS #select differentially expressed genes fdrCut <- 0.10 sigDE <- filter(resTab, adj.P.Val < fdrCut, logFC > 0) %>% filter(GENE_SYMBOL %in% geneList) %>% arrange(desc(logFC)) %>% distinct(GENE_SYMBOL, .keep_all = TRUE) #get the expression matrix plotMat <- exprs(gse.vst[match(sigDE$NAME,fData(gse.vst)$NAME),]) rownames(plotMat) <- sigDE$GENE_SYMBOL #sort columns of plot matrix based on treatment status plotMat <- plotMat[,order(gse.vst$treatment)] annoCol <- data.frame(row.names = colnames(gse.vst), `treatment` = gse.vst$treatment)
Plot the heatmap
#color for colum annotation annoColor <- list(treatment = c(CPG= "red", untreated = "grey80")) hallHeatmap2 <- pheatmap(plotMat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), scale = "row", cluster_cols = FALSE, cluster_rows = FALSE, annotation_col = annoCol, annotation_colors = annoColor, show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0, border_color = NA, main = "HALLMARK_GLYCOLYSIS (GSE30105)",silent = TRUE)$gtable grid.draw(hallHeatmap2)
p <- ggdraw() + draw_plot(enBar1, 0, 0.5, 0.6, 0.48) + draw_plot(hallHeatmap1, 0.6, 0.5, 0.4, 0.48) + draw_plot(enBar2, 0, 0.1, 0.6, 0.38) + draw_plot(hallHeatmap2, 0.6,0, 0.4,0.48) + draw_plot_label(c("A","B","C","D"), c(0, 0.55, 0, 0.55), c(1, 1, 0.49, 0.49), fontface = "plain", size=20) plot_grid(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.