knitr::opts_chunk$set(echo = TRUE)
Claus Weinholdt, Henri Wichmann, Johanna Kotrba, David H. Ardell, Matthias Kappler, Alexander W. Eckert, Dirk Vordermark and Ivo Grosse Prediction of regulatory targets of alternative isoforms of the epidermal growth factor receptor in a glioblastoma cell line BMC Bioinformatics (2019)
Install and load packages containing the functions for the analyses
install.packages("devtools") devtools::install_github("GrosseLab/BGSC") library(BGSC)
data(ExpData)
suppressWarnings( library("BGSC",quietly = TRUE,warn.conflicts = FALSE) ) library(ggplot2) library(gridExtra) library(grid) library(gtable) suppressWarnings( library(purrr,quietly = TRUE,warn.conflicts = FALSE) )
We use the function neqc function from the limma package which was developed for normalizing Illumina BeadChips data. The neqc function performs background correction using negative control probes followed by quantile normalization using negative and positive control probes. The Illumina GenomeStudio calculates and reports a detection p-value, which represents the confidence that a given transcript is expressed above background defined by negative control probes. For further analysis, we used only those probes for which the detection p-values for all six probes was below 0.05.
normData <- normalizeExpData(set = 'SF767')
We define that:
addBorder_gtable <- function(g){ g <- gtable_add_grob(g, grobs = rectGrob(gp = gpar(fill = NA, lwd = 1)), t= 1 , b = nrow(g), l = 1, r = ncol(g)) # g <- gtable_add_grob(g, # grobs = rectGrob(gp = gpar(fill = NA, lwd = 1)), # t = 1, l = 1, r = ncol(g)) return(g) } Lsets <- get.Lset() IndicatorVar <- lapply(Lsets,function(x){ vec <- rep(0,6);names(vec) <- c(1:6) if (!is.null(x$s1)) { vec[x$s1] <- 1 } return(vec) }) IndicatorVar.gtable <- purrr::map2( IndicatorVar , names(IndicatorVar), function(vec,.y){ pmat <- t(matrix(vec,2,3)) dimnames(pmat) <- list( c('no RNAi','RNAi by siRNA ALL','RNAi by siRNA I'),c('no EGF','EGF')) pmat <- pmat[c(1,3,2),] ## to get same oder as in paper bk = seq(0,max(1),by = 1) #hmcols <- c(RColorBrewer::brewer.pal(9,"Reds")[6],RColorBrewer::brewer.pal(9,"Blues")[7])# colorRampPalette(c("blue", "red"))(length(bk)) hmcols <- RColorBrewer::brewer.pal(8,'Paired')[c(6,2)] if(.y == 'a') hmcols <- 'slategray' pheatmap::pheatmap(pmat,silent = T,color = hmcols,breaks = seq(-1,1,by = 1) ,display_numbers = T,fontsize_number = 16,number_format = '%.0f',cluster_rows = FALSE,cluster_cols = FALSE,main=paste0('Pattern of for group ', .y),fontsize = 12 ,gaps_row = c(1,2),gaps_col = 1,border_color = 'black',number_color = 'black',legend = FALSE) } ) grid.arrange( addBorder_gtable(IndicatorVar.gtable$a$gtable), addBorder_gtable(IndicatorVar.gtable$b$gtable), addBorder_gtable(IndicatorVar.gtable$c$gtable), addBorder_gtable(IndicatorVar.gtable$d$gtable), nrow = 2,ncol = 2, top = textGrob("Schematic expression pattern",gp = gpar(fontsize = 20,fontface = "bold")), bottom = textGrob("For group a all six expression values are assigned to the class 0 (black).\n For the groups b - d all six expression values are assigned to two classes - class 0 (red) and class 1 (blue).",gp = gpar(fontsize = 12,font = 1)) )
Experimental design where the rows present the RNAi treatment -- without RNAi, RNAi against EGFR splice variant I ($\text{siRNA}{I}$), and RNAi against all EGFR splice variants ($\text{siRNA}{ALL}$) -- and the columns present the EGF treatment. The six corresponding logarithmic expression values per gene are denoted by $x_1, \dots, x_6$.
| | no EGF | EGF | |---------------- |----------------|----------------| |no RNAi | $x_{1}$ | $x_{2}$ | |RNAi by $\text{siRNA}{I}$ | $x{3}$ | $x_{4}$ | |RNAi by $\text{siRNA}{ALL}$ | $x{5}$ | $x_{6}$ |
For group $a$ we assume that all six expression levels stem from the same normal distribution. In this case, the mean $\mu$ and standard deviation $\sigma$ of this normal distribution (black) is equal to $\mu$ and $\sigma$ of the six expression levels. For gene groups b-d, we assume that all six expression levels stems from a mixture of two normal distributions with independent means $\mu_0$ and $\mu_1$, and one pooled standard deviation $\sigma$. For gene groups $b-d$, we assume that the expression levels [$x_1$, $x_3$, and $x_5$], [$x_1$, $x_3$, $x_4$, and $x_5$], and [$x_1$, $x_3$, $x_4$, $x_5$, and $x_6$] stem from the normal distribution based on $\mu_0$ (red), respectively. For gene groups $b-d$, we assume that the expression levels [$x_2$, $x_4$, and $x_6$], [$x_2$ and $x_4$], and [$x_2$] stem from the normal distribution based on $\mu_1$ (blue), respectively.
Lsets <- get.Lset() #getting a list with the schematic expression pattern normDataLogLikData <- logLikelihoodOfnormData(normData$E) #calc loglik for each gene in each group normDataLogLik <- normDataLogLikData[['logL']] ALL.MUs <- normDataLogLikData[['ALL.MUs']] #table with mean for each gene in each group ALL.VARs <- normDataLogLikData[['ALL.VARs']] #table with var for each gene in each group
As an example, we show for each group a gene having the minimum log-likelihood. For the groups $a-d$, the examples are ABCB7, ACSL1, TPR, and ADAR, respectively. In each figure, we plot the probability density of the normal distribution for the group $a$ as a black curve and mark the six log2-expression values with black circles. For groups $b−d$, we plot with red and blue curves the probability densities of the normal distributions and mark the six log2-expression values with circles, which are colored according to classes for class 0 in red and for class 1 in blue.
GeneExample <- c('ILMN_1687840','ILMN_1684585','ILMN_1730999','ILMN_2320964') names(GeneExample) <- c('g1','g2','g3','g4') tmpPlot <- purrr::map2(GeneExample,c('a','b','c','d'),function(.x,.y) Density.NV.fit.plot(id = .x ,normData, ALL.MUs, ALL.VARs, useGroup = .y ,DOplot = FALSE) ) grid.arrange( tmpPlot$g1 + theme(legend.position = "none"), tmpPlot$g2 + theme(legend.position = "none"), tmpPlot$g3 + theme(legend.position = "none"), tmpPlot$g4 + theme(legend.position = "none") ,ncol = 2,nrow = 2, bottom = textGrob("The group having the minimum log-likelihood is highlighted with yellow.",gp = gpar(fontsize = 12,font = 1)) )
Performing classification through model selection based on minimum log-likelihood is problematic when the number of free model parameters is not identical among all models under comparison. Here, model $a$ has two free model parameters, while models $b$, $c$, and $d$ have three. Hence, a naive classification based on a minimum log-likelihood criterion would give a spurious advantage to models $b$, $c$, and $d$ with three free model parameters over model $a$ with only two free parameters. To eliminate that spurious advantage, we compute marginal likelihoods $p(x|z)$ using the approximation of Schwarz et al. commonly referred to as Bayesian Information Criterion.
npar <- sapply(Lsets, function(x) sum(!sapply(x,is.null ) )) + 1 ## number parameters for log-likelihood -> mean + var k <- sapply(Lsets, function(x) sum( sapply(x,length) )) ## number of samples print(rbind(npar,k)) normDataBIC <- get.IC(normDataLogLik , npar, k , IC = 'BIC')
message('Number of genes assigned to group with the minimal Bayesian Information Criterion') BICminInd <- apply( normDataBIC,MARGIN = 1, FUN = minIndex) tmp <- table(BICminInd); names(tmp) <- names(Lsets) tmpMat <- as.matrix(tmp);colnames(tmpMat) <- "#genes assigned to group" print(t(tmpMat))
We assume that $70\%$ of all genes are not regulated by EGF, so we define the prior probability for group a by $p(a) = 0.70$. Further, we assume that the remaining $30\%$ of the genes fall equally in groups with EGF-regulation, so we define the prior probabilities for groups $b$, $c$, and $d$ by $p(b) = p(c) = p(d) = 0.1$. We can compute for $z \in {a, b, c, d}$ the posterior $p(z|x) \approx p(x|z) \cdot p(z)$ and then perform Bayesian model selection by assigning each gene to that group $z$ with the maximum approximate posterior $p(z|x)$. Further, we define as putative target genes for each group the subset of genes with an approximate posterior probability exceeding $0.75$.
normDataPosterior <- get.Posterior( normDataBIC ,Pis = c(0.7,0.1,0.1,0.1)) POSTmaxInd <- apply(normDataPosterior, MARGIN = 1 ,FUN = maxIndex) PostClass <- get.gene.group(data = normDataPosterior,indexing = "maximal",filter = 0.75, DoPlot = TRUE)
Genes of the group $c$ are putative target genes regulated by EGFR isoforms II-IV and not by other receptors.
After calculating the log2-fold change for group $c$ by [$\mu_{c1}$ - $\mu_{c0}$], we validated three up-regulated genes, namely CKAP2L, ROCK1, and TPR and three down-regulated genes, namely ALDH4A1, CLCA2, and GALNS.
### calculating mean and log2-fold change MeanFoldChangeClass <- get.log2Mean.and.log2FC(normData = normData) ### load qPCR data data(qPCR_SF767, envir = environment()) qgenes <- c('CKAP2L','ROCK1','TPR','ALDH4A1','CLCA2','GALNS') comparative_qPCR <- list() qPCR_Mean <- qPCR_log2FC <- data.frame() for(n in qgenes){ Ref_GAPDH <- as.double(qPCR_SF767[,'GAPDH']) tmp <- as.double(qPCR_SF767[,n]) comparative_qPCR[[n]] <- comparativeMethod_qPCR.analysis(Gene = tmp,Ref = Ref_GAPDH) comparative_qPCR[[n]]$CellLine <- qPCR_SF767$CellLine comparative_qPCR[[n]]$Treatment <- qPCR_SF767$Treatment } SF.log2FC <- comparativeMethod_qPCR.RNAi.log2FC(comparative_qPCR) qCPRdataC <- SF.log2FC$dCT.C1C0 data.table::setkey(qCPRdataC,Gene) ### annotation of gene examples IDs.dt <- data.table::data.table(normData$genes,keep.rownames = T,key = 'rn') IDs.dt.c <- IDs.dt[rownames(PostClass$resFilter$c),] data.table::setkey(IDs.dt.c,'SYMBOL') qgenesIDs <- lapply(as.character(qCPRdataC$Gene), function(qg) as.character(IDs.dt.c[qg,][['rn']]) ) names(qgenesIDs) <- as.character(qCPRdataC$Gene)
We show the normalized expression for the six genes of group $c$. The normalized expression is shown in a similar way as the Schematic Expression Pattern
. For three up-regulated genes (CKAP2L, ROCK1, and TPR) the expression is higher for class $c1$ (dark red) and for three down-regulated genes (ALDH4A1, CLCA2, and GALNS) the expression is lower for class $c1$ (light red).
Gene.gtable <- list() for(tmoG in names(qgenesIDs)){ pmat <- t(matrix(normData$E[qgenesIDs[[tmoG]],],2,3)) dimnames(pmat) <- list( c('no RNAi','RNAi by siRNA ALL','RNAi by siRNA I'),c('no EGF','EGF')) pmat <- pmat[c(1,3,2),] ## to get same oder as in paper pmat <- 2^pmat tmpnScore <- ceiling(pmat/100)*100 # tmpnScore <- ceiling(pmat/10)*10 bk = seq(0,max(tmpnScore),by = 1) hmcols <- colorRampPalette(c("white", "darkred"))(length(bk)-1) #pheatmap::pheatmap(pmat,color = hmcols,breaks = seq(0,max(tmpnScore),by = 1) ,display_numbers = T,fontsize_number = 12,cluster_rows = FALSE,cluster_cols = FALSE,main=tmoG,fontsize = 12,gaps_row = c(1,2),gaps_col = 1,border_color = 'black',number_color = 'black', ) # filename = paste0(plotDir,'/Heatmap_Score_',tair,'.pdf') ,width = 10 ,height = 10) Gene.gtable[[tmoG]] <- pheatmap::pheatmap(pmat,silent = T,color = hmcols,breaks = seq(0,max(tmpnScore),by = 1) ,display_numbers = T,fontsize_number = 15,cluster_rows = FALSE,cluster_cols = FALSE,main=tmoG,fontsize = 12,gaps_row = c(1,2),gaps_col = 1,border_color = 'black',number_color = 'black') }
addBorder_gtable <- function(g){ g <- gtable_add_grob(g, grobs = rectGrob(gp = gpar(fill = NA, lwd = 1)), t= 1 , b = nrow(g), l = 1, r = ncol(g)) # g <- gtable_add_grob(g, # grobs = rectGrob(gp = gpar(fill = NA, lwd = 1)), # t = 1, l = 1, r = ncol(g)) return(g) } grid.arrange( addBorder_gtable(Gene.gtable$CKAP2L$gtable), addBorder_gtable(Gene.gtable$ROCK1$gtable), addBorder_gtable(Gene.gtable$TPR$gtable), addBorder_gtable(Gene.gtable$ALDH4A1$gtable), addBorder_gtable(Gene.gtable$CLCA2$gtable), addBorder_gtable(Gene.gtable$GALNS$gtable), nrow = 2,ncol = 3)
We show the probability density distributions of the log2-normalized expression for the six genes of group $c$.
GeneExample <- qgenesIDs names(GeneExample) <- names(qgenesIDs) tmpPlot <- purrr::map2(GeneExample,c('c','c','c','c','c','c'),function(.x,.y) Density.NV.fit.plot(id = .x ,normData, ALL.MUs, ALL.VARs, useGroup = .y ,DOplot = FALSE) ) grid.arrange( tmpPlot$CKAP2L + theme(legend.position = "none"), tmpPlot$ROCK1 + theme(legend.position = "none"), tmpPlot$TPR + theme(legend.position = "none"), tmpPlot$ALDH4A1 + theme(legend.position = "none"), tmpPlot$CLCA2 + theme(legend.position = "none"), tmpPlot$GALNS + theme(legend.position = "none") ,ncol = 3,nrow = 2, bottom = textGrob("The group having the minimum log-likelihood is highlighted with yellow.",gp = gpar(fontsize = 12,font = 1)) )
We show the log2-normalized expression of the group $c$ for the six genes.
tmp <- data.table(MeanFoldChangeClass$c[do.call(c,qgenesIDs),.(rn,s1M,s0M,s1s0FC)]) setnames(tmp,c('rn','s1M','s0M','s1s0FC'),c('IlluminaID',"meanC1","meanC0",'log2-fold change')) tmp$GeneName <- names(qgenesIDs) tmp <- tmp[,c('GeneName','IlluminaID',"meanC1","meanC0",'log2-fold change'),with=F] setkey(tmp,'GeneName') print(tmp[c('CKAP2L', 'ROCK1','TPR','ALDH4A1', 'CLCA2', 'GALNS'),])
### bar Illumina expression ---------------------------------------------------------------------- PlotDataEXP <- make.plot.data.exp.Ill.qPCR(qCPRdata = qCPRdataC, MeanFoldChangeClass = MeanFoldChangeClass,class="c") # print(PlotDataEXP) rns <- levels(PlotDataEXP$Gene)[c(2,5,6,1,3,4)] ; PlotDataEXP$Gene <- factor(PlotDataEXP$Gene, levels = rns) rns <- levels(PlotDataEXP$pid)[c(2,5,6,1,3,4)] ; PlotDataEXP$pid <- factor(PlotDataEXP$pid, levels = rns) cols <- RColorBrewer::brewer.pal(8,'Paired')[c(6,2)] limits <- aes(ymax = PlotDataEXP$Mean + PlotDataEXP$stderr , ymin=PlotDataEXP$Mean - PlotDataEXP$stderr) dodge <- position_dodge(width = 0.9) base_size <- 12 PlotExp <- ggplot(PlotDataEXP,aes(x=factor(Gene),y=Mean,fill=factor(Set))) + geom_bar(position="dodge",stat="identity") + geom_errorbar(limits, position=dodge, width=0.25) + ylim(c(0 ,10)) + scale_fill_manual(values = cols,name="") + labs(x = "", y = "Log2 mean expression",title="mean expression data of Illumina") + thememapBarplot(base_size = base_size,legend_key_size = 0.6) + # theme( aspect.ratio = 9 / 16) + theme(legend.position="bottom") print(PlotExp)
We have found that the six $\log_2$-fold changes of the Illumina microarray expression levels, and those of the qPCR expression levels show a Pearson correlation coefficient of $0.99$ ($p$-value = 0.00002). Therefore, we can suggest that the set of 1,140 genes might contain some further putative target genes of isoforms II-IV of the epidermal growth factor receptor in tumor cells.
### bar log2 FC qPCR Illumina ---------------------------------------------------------------------- PlotDataFC <- make.plot.data.FC.Ill.qPCR(qCPRdata = qCPRdataC, qgenesIDs = qgenesIDs, MeanFoldChangeClass, class="c" ) print(cor.test( PlotDataFC[PlotDataFC$Set == 'Microarray','FC'],PlotDataFC[PlotDataFC$Set == 'qPCR','FC'] )) # print(PlotDataFC) rns <- qgenes #levels(PlotDataFC$Gene)[c(2,5,6,1,3,4)] ; PlotDataFC$Gene <- factor(PlotDataFC$Gene, levels = rns) # rns <- levels(PlotDataFC$pid)[c(2,5,6,1,3,4)] ; PlotDataFC$pid <- factor(PlotDataFC$pid, levels = rns) cols <- RColorBrewer::brewer.pal(11,"PRGn")[c(2,10)] limits <- aes(ymax = PlotDataFC$FC + PlotDataFC$stderr , ymin=PlotDataFC$FC - PlotDataFC$stderr) dodge <- position_dodge(width=0.9) # g1 <- ggplot(PlotDataFC,aes(x=factor(pid),y=FC,fill=factor(Set))) + PlotFC <- ggplot(PlotDataFC,aes(x=factor(Gene),y=FC,fill=factor(Set))) + geom_bar(position="dodge",stat="identity") + geom_errorbar(limits, position=dodge, width=0.25) + ylim(c(-2 ,2)) + labs(x = "", y = "Log2-fold change",title="Comparison of Illumina data and RT-qPCR") + scale_fill_manual(values = cols,name="") + thememapBarplot(base_size = base_size,legend_key_size = 0.6) + # theme( aspect.ratio = 9 / 16) + theme(legend.position="bottom") print(PlotFC)
message('data of "Comparison of Illumina data and RT-qPCR"-plot ') PlotDataFC.dt <- data.table(PlotDataFC) setkeyv(PlotDataFC.dt,c('Gene','Set')) knitr::kable(PlotDataFC.dt[ c('CKAP2L', 'ROCK1','TPR','ALDH4A1', 'CLCA2', 'GALNS'),.(Gene,Set,FC,stderr)], caption = "Comparison of Illumina data and RT-qPCR data", row.names = F,digits = 2)
print( sessionInfo() )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.