Transcriptomic analysis

library(seahorseCLL)
library(BloodCancerMultiOmics2017)
library(DESeq2)
library(limma)
library(genefilter)
library(ggbeeswarm)
library(SummarizedExperiment)
library(RColorBrewer)
library(piano)
library(GEOquery)
library(pheatmap)
library(grid)
library(gridExtra)
library(cowplot)
library(tidyverse)
plotDir = ifelse(exists(".standalone"), "", "section04/")
if(plotDir!="") if(!file.exists(plotDir)) dir.create(plotDir)
options(stringsAsFactors=FALSE)

Differential expression between M-CLL and U-CLL samples

Data pre-processing

Load data set

data("dds", "patmeta","mutCOM","seaOri")

Only use CLL samples with IGHV annotations and have been used in seahorse experiments

#annotate IGHV status
dds$IGHV <- patmeta[match(dds$PatID, rownames(patmeta)),]$IGHV
dds$diag <- patmeta[match(dds$PatID, rownames(patmeta)),]$Diagnosis

#estimate size factor
dds <- estimateSizeFactors(dds)

#only choose CLL samples with IGHV annotations and have been used in seahorse experiments
ddsCLL <- dds[,dds$diag == "CLL" & !is.na(dds$IGHV) & dds$PatID %in% colnames(seaOri)]

Filter genes

#remove genes without gene symbol annotations
ddsCLL <- ddsCLL[! rowData(ddsCLL)$symbol %in% c(NA,""),]
ddsCLL <- ddsCLL[rowData(ddsCLL)$chromosome != "Y",]

#only keep genes that have counts higher than 10 in any sample
keep <- apply(counts(ddsCLL), 1, function(x) any(x >= 10)) 
ddsCLL <- ddsCLL[keep,]

# Remove transcripts which do not show variance across samples.
sds <- rowSds(counts(ddsCLL, normalized = TRUE))
sh <- shorth(sds)
ddsCLL <- ddsCLL[sds >= sh,]

#how many genes do we have
nrow(ddsCLL)

Variance stabilizing tranformation

ddsCLL.norm <- varianceStabilizingTransformation(ddsCLL)

Identify differentially expressed gene between M-CLL and U-CLL using DESeq2

Differential expression using DESeq2

ddsCLL$IGHV <- factor(ddsCLL$IGHV, levels = c("U", "M"))
design(ddsCLL) <- ~ IGHV
ddsCLL <- DESeq(ddsCLL, betaPrior = TRUE)

Get differential expression result

DEres <- as.tibble(results(ddsCLL, tidy = TRUE)) %>% mutate(symbol = rowData(ddsCLL)$symbol)

Gene enrichment analysis

Function for converting DEseq results to enrichment analysis input

createInput <- function(DEres, pCut = 0.05, ifFDR = FALSE, rankBy = "stat") {
  if (ifFDR) {
    inputTab <- filter(DEres, padj <= pCut)
  } else {
    inputTab <- filter(DEres, pvalue <= pCut)
  }

  inputTab <- arrange(inputTab, pvalue) %>% filter(!duplicated(symbol)) %>% select_("symbol", rankBy) %>% data.frame(stringsAsFactors = FALSE)
  rownames(inputTab) <- inputTab$symbol
  inputTab$symbol <- NULL
  colnames(inputTab) <- "stat"
  return(inputTab)
}

load genesets

gmts = list(H=system.file("extdata","h.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"),
            C6=system.file("extdata","c6.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"),
            KEGG=system.file("extdata","c2.cp.kegg.v5.1.symbols.gmt", package = "BloodCancerMultiOmics2017"))

Enrichment analysis using Hallmarks gene set

enRes <- list()
inputTab <- createInput(DEres, pCut = 0.1, ifFDR = TRUE)
enRes[["Gene enrichment analysis"]] <- runGSEA(inputTab = inputTab, gmtFile = gmts$H, GSAmethod = "page")

#remove the HALLMARK_
enRes$`Gene enrichment analysis`$Name <- gsub("HALLMARK_","", enRes$`Gene enrichment analysis`$Name)

Plot hallmark result

enBar <- plotEnrichmentBar(enRes, pCut = 0.1, ifFDR = TRUE, setName = "Hallmark gene sets")
plot(enBar)

Heatmap plot of the expression values of genes in the glycolysis geneset

Prepare the data for heatmap

# load genes in the gene set
gsc <- loadGSC(gmts$H)
geneList <- gsc$gsc$HALLMARK_GLYCOLYSIS

#select differentially expressed genes
fdrCut <- 0.10
sigDE <- filter(DEres, padj <= fdrCut, log2FoldChange < 0) %>% filter(symbol %in% geneList) %>%
  arrange(log2FoldChange)

#get the expression matrix
plotMat <- assay(ddsCLL.norm[sigDE$row,])
colnames(plotMat) <- ddsCLL.norm$PatID
rownames(plotMat) <- sigDE$symbol

#sort columns of plot matrix based on trisomy12 status
plotMat <- plotMat[,order(ddsCLL$IGHV)]

#calculate z-score and sensor
plotMat <- t(scale(t(plotMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4

annoCol <- data.frame(row.names = ddsCLL.norm$PatID, `IGHV` = ddsCLL.norm$IGHV)

Plot the heatmap

#color for colum annotation
annoColor <- list(IGHV = c(M = "red", U = "grey80"))

hallHeatmap <- pheatmap(plotMat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
         cluster_cols = FALSE, cluster_rows = FALSE,
         annotation_col = annoCol, annotation_colors = annoColor,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, main = "HALLMARK_GLYCOLYSIS",silent = TRUE)$gtable

grid.draw(hallHeatmap)

Beeswarm plot of expression values of key glycolytic genes

plotGenes <- c("PFKP","PGAM1","PGK1")
plotTab <- data.frame(assay(ddsCLL.norm[rowData(ddsCLL.norm)$symbol %in% plotGenes,]))
colnames(plotTab) <- ddsCLL.norm$PatID
plotTab$symbol <- rowData(ddsCLL.norm[rowData(ddsCLL.norm)$symbol %in% plotGenes,])$symbol
plotTab <- gather(plotTab, key = "PatID", value = "value", -symbol) %>%
  mutate(IGHV = colData(ddsCLL.norm)[match(PatID, ddsCLL.norm$PatID),]$IGHV) %>%
  mutate(p = sigDE[match(symbol, sigDE$symbol),]$pvalue)

pTab <- distinct(plotTab, symbol, p) %>% arrange(symbol)
IGHVcount <- distinct(plotTab, PatID, IGHV) %>% group_by(IGHV) %>%
  summarise(n = length(IGHV)) %>% 
  mutate(label = sprintf("%s-CLL\n(n=%s)",IGHV, n))

scaleFun <- function(x) sprintf("%.1f",x)

beePlot <- ggplot(plotTab, aes(x=IGHV, y = value)) + 
    stat_boxplot(geom = "errorbar", width = 0.3) +
    geom_boxplot(outlier.shape = NA, col="black", width=0.4) + 
    geom_beeswarm(cex=2, size =0.5, aes(col = IGHV)) + theme_classic() +
    xlab("") + ylab("normalized expression") +
    scale_color_manual(values = c("M" = "red","U" = "grey30")) + 
    scale_x_discrete(labels = structure(IGHVcount$label, names = IGHVcount$IGHV)) +
    scale_y_continuous(expand = expand_scale(add = c(0.1,0.6)),
                       labels = scaleFun) +
    theme(axis.line.x = element_blank(), axis.ticks.x = element_blank(),
           axis.title = element_text(size=12, face="bold"),
           axis.text.y = element_text(size=12),
           axis.text.x = element_text(size =10),
           plot.title = element_text(face="bold", hjust=0.5),
           legend.position = "none",
           axis.title.x = element_text(face="bold"),
          strip.background = element_blank(),
          strip.text = element_text(size=13, face = "bold")) +
  facet_wrap(~ symbol, scales = "free")

#add p value annotations
pTab$pt <- sapply(pTab$p, sciPretty)
beePlot <- beePlot + geom_text(pTab, mapping = aes(x = 1.5, y = Inf, 
                                        label = paste("p==~",bquote(.(pt))), 
                                        hjust=0.5, vjust =1), size =3, 
                    parse = T)

Combined plots for manuscript (Figure 3)

title = ggdraw() + draw_figure_label("Figure 3", fontface = "bold", position = "top.left",size=20)
p <- ggdraw() + 
  draw_plot(enBar, 0, 0.5, 0.5, 0.45) + 
  draw_plot(hallHeatmap, 0.5, 0, 0.5, 0.95) +
  draw_plot(beePlot, 0, 0, 0.5, 0.4) +
  draw_plot_label(c("A","B","C"), c(0, 0.5, 0), c(1, 1, 0.45), size=20)
plot_grid(title, p, rel_heights = c(0.05,0.95), ncol = 1)

Query public datasets for gene expression signatures in CLL of B-cell receptor triggering

Expression signature of B-cell receptor triggered by IgM (GSE49695)

Data import

Get public dataset from Gene Expression Omnibus (GEO)

gse <- getGEO("GSE52774", GSEMatrix = TRUE)
gse <- gse[[1]]

Define treatment status

table(gse$`treatment:ch1`)
gse$treatment <- factor(ifelse(gse$`treatment:ch1` == "untreated", "untreated","IgM"),
                        levels = c("untreated","IgM"))

Quality assessment

Distribution of raw expression values

boxplot(exprs(gse))

Variance stablizing transformation

gse.vst <- gse
exprs(gse.vst) <- normalizeVSN(gse)

Remove genes without gene symbol

gse.vst <- gse.vst[fData(gse.vst)$`GENE_SYMBOL`!="",]

Remove invariant genes

sds <- rowSds(exprs(gse.vst))
gse.vst <- gse.vst[sds > shorth(sds),]

PCA

exprMat <- exprs(gse.vst)
#using top 5000 variant genes
sds <- rowSds(exprMat)
exprMat <- exprMat[order(sds, decreasing = TRUE) <= 5000,]

pcRes <- prcomp(t(exprMat), center = TRUE, scale. = FALSE)
plotTab <- pcRes$x[,c(1,2)] %>% data.frame() %>% rownames_to_column("sampleID") %>%
  mutate(treatment = gse[,sampleID]$treatment)

ggplot(plotTab, aes(x=PC1, y = PC2, color = treatment)) + geom_point() + 
  theme_bw()

Most treated and untreated samples can be clearly separated.

Differential expression

Design matrix

mm <- model.matrix( ~  treatment , pData(gse.vst) )

Run Limma

fit <- lmFit(gse.vst, mm)
fit <- eBayes(fit)
resTab <- topTable(fit, coef= "treatmentIgM", number = "all")

P value histogram

hist(resTab$P.Value, breaks = 50, main = "p value histogram", xlab = "pvalues")

Gene enrichment analysis

Run enrichment analysis using PAGE

inputTab <- dplyr::filter(resTab, adj.P.Val < 0.1) %>%
  dplyr::select(t, GENE_SYMBOL) %>%
  arrange(desc(abs(t))) %>% distinct(GENE_SYMBOL, .keep_all = TRUE) %>%
  data.frame() %>% column_to_rownames("GENE_SYMBOL")

enRes <- list()

gmts = list(H=system.file("extdata","h.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"),
            C6=system.file("extdata","c6.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"),
            KEGG=system.file("extdata","c2.cp.kegg.v5.1.symbols.gmt", package = "BloodCancerMultiOmics2017"))

enRes[["BCR stimulated by IgM (GSE49695)"]] <- runGSEA(inputTab, 
                               gmtFile = gmts$H,
                               GSAmethod = "page")

Plot enrichment p values (only pathways passing 1% FDR are shown)

enBar1 <- seahorseCLL::plotEnrichmentBar(enRes,pCut = 0.01, ifFDR = TRUE,
                                         setName = "Hallmark gene sets")
grid.draw(enBar1)

Heatmap of genes up-regulated in glycolysis pathway

# load genes in the gene set
gsc <- loadGSC(gmts$H)
geneList <- gsc$gsc$HALLMARK_GLYCOLYSIS

#select differentially expressed genes
fdrCut <- 0.10
sigDE <- filter(resTab, adj.P.Val < fdrCut, logFC > 0) %>% 
  filter(GENE_SYMBOL %in% geneList) %>%
  arrange(desc(logFC)) %>% distinct(GENE_SYMBOL, .keep_all = TRUE)

#get the expression matrix
plotMat <- exprs(gse.vst[sigDE$ID,])
rownames(plotMat) <- sigDE$GENE_SYMBOL

#sort columns of plot matrix based on treatment status
plotMat <- plotMat[,order(gse.vst$treatment)]

annoCol <- data.frame(row.names = colnames(gse.vst), `treatment` = gse.vst$treatment)

Plot the heatmap

#color for colum annotation
annoColor <- list(treatment = c(IgM = "red", untreated = "grey80"))

hallHeatmap1 <- pheatmap(plotMat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), scale = "row",
         cluster_cols = FALSE, cluster_rows = FALSE,
         annotation_col = annoCol, annotation_colors = annoColor,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, main = "HALLMARK_GLYCOLYSIS (GSE49695)",silent = TRUE)$gtable

grid.draw(hallHeatmap1)

Gene expression signature in CLL of B-cell receptor triggering by CPG (GSE30105)

Data import

Get public dataset from Gene Expression Omnibus (GEO)

gse <- getGEO("GSE30105", GSEMatrix = TRUE)
gse <- gse[[1]]

Define treatment status

table(gse$title)
gse$treatment <- factor(ifelse(grepl("Unstimulated",gse$title), "untreated","CPG"),
                        levels = c("untreated","CPG"))

Quality assessment

Distribution of raw expression values

boxplot(exprs(gse))

Variance stablizing transformation

gse.vst <- gse
exprs(gse.vst) <- normalizeVSN(gse)

Remove genes without gene symbol

gse.vst <- gse.vst[fData(gse.vst)$`GENE_SYMBOL`!="",]

Remove invariant genes

sds <- rowSds(exprs(gse.vst))
gse.vst <- gse.vst[sds > shorth(sds),]

PCA

exprMat <- exprs(gse.vst)
#using top 5000 variant genes
sds <- rowSds(exprMat)
exprMat <- exprMat[order(sds, decreasing = TRUE) <= 5000,]
pcRes <- prcomp(t(exprMat), center = TRUE, scale. = FALSE)
plotTab <- pcRes$x[,c(1,2)] %>% data.frame() %>% rownames_to_column("sampleID") %>%
  mutate(treatment = gse[,sampleID]$treatment)

ggplot(plotTab, aes(x=PC1, y = PC2, color = treatment)) + geom_point() + 
  theme_bw()

Most treated and untreated samples can be clearly separated.

Differential expression

Design matrix

mm <- model.matrix( ~  treatment , pData(gse.vst) )

Run Limma

fit <- lmFit(gse.vst, mm)
fit <- eBayes(fit)
resTab <- topTable(fit, coef= "treatmentCPG", number = "all")

P value histogram

hist(resTab$P.Value, breaks = 50, main = "p value histogram", xlab = "pvalues")

Gene enrichment analysis

Run enrichment analysis using PAGE

inputTab <- dplyr::filter(resTab, adj.P.Val < 0.1) %>%
  dplyr::select(t, GENE_SYMBOL) %>%
  arrange(desc(abs(t))) %>% distinct(GENE_SYMBOL, .keep_all = TRUE) %>%
  data.frame() %>% column_to_rownames("GENE_SYMBOL")

enRes <- list()

gmts = list(H=system.file("extdata","h.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"),
            C6=system.file("extdata","c6.all.v5.1.symbols.gmt", package="BloodCancerMultiOmics2017"),
            KEGG=system.file("extdata","c2.cp.kegg.v5.1.symbols.gmt", package = "BloodCancerMultiOmics2017"))

enRes[["BCR stimulated by CPG (GSE30105)"]] <- runGSEA(inputTab, 
                               gmtFile = gmts$H,
                               GSAmethod = "page")

Plot enrichment p values (only pathways passing 1% FDR are shown)

enBar2 <- seahorseCLL::plotEnrichmentBar(enRes,pCut = 0.01, ifFDR = TRUE,setName = "Hallmark gene sets")
grid.draw(enBar2)

Heatmap of genes up-regulated in glycolysis pathway

# load genes in the gene set
gsc <- loadGSC(gmts$H)
geneList <- gsc$gsc$HALLMARK_GLYCOLYSIS

#select differentially expressed genes
fdrCut <- 0.10
sigDE <- filter(resTab, adj.P.Val < fdrCut, logFC > 0) %>% 
  filter(GENE_SYMBOL %in% geneList) %>%
  arrange(desc(logFC)) %>% distinct(GENE_SYMBOL, .keep_all = TRUE)

#get the expression matrix
plotMat <- exprs(gse.vst[match(sigDE$NAME,fData(gse.vst)$NAME),])
rownames(plotMat) <- sigDE$GENE_SYMBOL

#sort columns of plot matrix based on treatment status
plotMat <- plotMat[,order(gse.vst$treatment)]

annoCol <- data.frame(row.names = colnames(gse.vst), `treatment` = gse.vst$treatment)

Plot the heatmap

#color for colum annotation
annoColor <- list(treatment = c(CPG= "red", untreated = "grey80"))

hallHeatmap2 <- pheatmap(plotMat, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100), scale = "row",
         cluster_cols = FALSE, cluster_rows = FALSE,
         annotation_col = annoCol, annotation_colors = annoColor,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, main = "HALLMARK_GLYCOLYSIS (GSE30105)",silent = TRUE)$gtable

grid.draw(hallHeatmap2)

Combine the figures for supplementary material

p <- ggdraw() + draw_plot(enBar1, 0, 0.5, 0.6, 0.48) + 
  draw_plot(hallHeatmap1, 0.6, 0.5, 0.4, 0.48) +
  draw_plot(enBar2, 0, 0.1, 0.6, 0.38) +
  draw_plot(hallHeatmap2, 0.6,0, 0.4,0.48) +
  draw_plot_label(c("A","B","C","D"), c(0, 0.55, 0, 0.55), c(1, 1, 0.49, 0.49), fontface = "plain", size=20)
plot_grid(p)


lujunyan1118/seahorseCLL documentation built on May 12, 2019, 6:16 p.m.