knitr::opts_chunk$set(tidy = FALSE, cache = FALSE, dev = "png", message = FALSE, error = FALSE, warning = TRUE)
Loading all the data from the HumanDEU package.
library( DEXSeq ) library( dplyr ) library( HumanDEU ) data( "resultsDF" ) data( "crossCoefs1", "crossCoefs2", "crossCoefs3", "crossCoefsJR1", "crossCoefsJR2", "crossCoefsJR3" )
With the function below, we fit a linear model on the REUCs and the RSICs using the tissues and individuals as categorical predictors. Then, this function calculates the coefficients of partial determination of these fits on each of the GTEx datasets.
estimateR2 <- function( crossCoefs, statistic, subset, BPPARAM=SerialParam() ){ df <- data.frame( individual=rep( rownames(crossCoefs[1,,]), ncol( crossCoefs[1,,] ) ), tissue=rep( colnames(crossCoefs[1,,]), each=nrow( crossCoefs[1,,] ) ), stringsAsFactors=FALSE ) df$reuc <- NA r2 <- bplapply( seq_along(dimnames(crossCoefs)[["exon"]]), function(x, dfL){ df$reuc <- as.vector(crossCoefs[x,,]) if( any( is.na( df$reuc ) ) ){ return(rep(NA, 3)) }else{ anovaRes <- anova( lm( reuc ~ individual + tissue, df ) ) beta <- anovaRes[c("individual", "tissue", "Residuals"),"Sum Sq"] return( beta ) } }, dfL=df, BPPARAM=BPPARAM ) res <- do.call(rbind, r2) colnames(res) <- c("individual", "tissue", "residuals") rownames(res) <- dimnames(crossCoefs)[["exon"]] res <- as.data.frame( res ) res$statistic <- statistic res$subset <- subset res$exon <- rownames(res) res }
However, since this calculations would take long to complete, this package provides the precomputed results. These results were generated by running the code below.
#BPPARAM <- SnowParam(7) BPARAM <- SerialParam() varExplained1 <- estimateR2( crossCoefs1, "REUC", "subsetA", BPPARAM ) varExplained2 <- estimateR2( crossCoefs2, "REUC", "subsetB", BPPARAM ) varExplained3 <- estimateR2( crossCoefs3, "REUC", "subsetC", BPPARAM ) varExplainedJR1 <- estimateR2( crossCoefsJR1, "RSIC", "subsetA", BPPARAM ) varExplainedJR2 <- estimateR2( crossCoefsJR2, "RSIC", "subsetB", BPPARAM ) varExplainedJR3 <- estimateR2( crossCoefsJR3, "RSIC", "subsetC", BPPARAM ) varExplained <- rbind( varExplained1, varExplained2, varExplained3, varExplainedJR1, varExplainedJR2, varExplainedJR3 ) rownames( varExplained ) <- NULL
The code below makes sure that the output of the function is the same as the object saved in this package.
data("varExplained") all( head( varExplained, 5 ) == estimateR2( crossCoefs1[1:5,,], statistic="REUC", subset="subsetA", SerialParam() ) )
Similarly, we compute the correlation between REUCs and RSICs for the three subsets of the GTEx data.
estimateCors <- function( cc, ccJR, BPPARAM ){ res <- bplapply( dimnames(cc)[["exon"]], function( ex, ccL, ccJRL ){ cor( as.vector( ccL[ex,,] ), as.vector( ccJRL[ex,,] ) ) }, ccL=cc, ccJR=ccJR, BPPARAM=BPPARAM ) names( res ) <- dimnames(cc)[["exon"]] unlist( res ) }
Similar to other sections, we don't estimate all the correlation coefficients for each exon because it is computationally expensive. Instead the object resultsDF
already contains the pre-computed results.
BPPARAM <- SerialParam() BPPARAM <- SnowParam(6) cors1 <- estimateCors( crossCoefs1, crossCoefsJR1, BPPARAM ) cors2 <- estimateCors( crossCoefs2, crossCoefsJR2, BPPARAM ) cors3 <- estimateCors( crossCoefs3, crossCoefsJR3, BPPARAM ) resultsDF$corRiscReuc <- NA for( i in 1:3 ){ sub <- paste0( "subset", LETTERS[i] ) corOb <- get( paste0( "cors", i ) ) stopifnot( with( resultsDF[resultsDF$label == sub,],paste( gene, exon, sep=":" ) ) == names( corOb ) ) resultsDF[resultsDF$label == sub,"corRiscReuc"] <- corOb } #save( resultsDF, file="../data/resultsDF.RData" )
We verify that the code indeed reproduces identical correlation coefficients as the ones pre-computed.
data("resultsDF") all( resultsDF$corRiscReuc[1:100] == estimateCors( crossCoefs1[1:100,,], crossCoefsJR1[1:100,,], SerialParam() ) )
We load the necessary libraries to continue with the analysis.
library(DEXSeq) library(ggplot2) library(dplyr) library(xtable) library(cowplot) library(scales) data("resultsGeneTable")
Next, we defined a function to plot a barplot of the number of exon-skipping reads for DEU exons.
varPlot <- function( results, subset="" ){ results <- filter( results, label == subset, padj < 0.1, tdu > 1 ) otherMeans <- results$esMeans numExons <- table( cut( otherMeans, c(0, 1, 3, 5, 10, max(otherMeans)), include.lowest=TRUE ) ) df <- data.frame(numExons) colnames( df ) <- c("Exon skipping\nreads", "Number") levels(df$`Exon skipping\nreads`)[5] <- ">10" pl <- ggplot(df, aes(`Exon skipping\nreads`, Number, fill=`Exon skipping\nreads`)) + geom_bar(stat="identity") + ylab("Exonic regions") + xlab("Exon skipping reads") + scale_y_continuous(labels = comma) + theme( legend.position="none", axis.text.x = element_text(angle = 35, hjust = 1) ) pl } fig3PanelA <- varPlot( resultsDF, "subsetA" )
Out of the differentially used exons in each subset of GTEX, what is the evidence for the effect being driven by alternative splicing?
asSupportDF <- do.call(rbind, lapply( c("subsetA", "subsetB", "subsetC"), function(x){ vData <- varPlot( resultsDF, x )$data cbind( Subset=x, vData, Percentage=round(100*vData$Number/sum(vData$Number) ) ) }) ) levels( asSupportDF$Subset ) <- c( "A", "B", "C") colnames( asSupportDF )[2] <- "Exon skipping reads (mean)" colnames( asSupportDF )[3] <- "# of exonic regions" colnames( asSupportDF )[4] <- "% of exonic regions" asSupportDF print( xtable( asSupportDF, display=c("s", "s", "s", "d", "d") , caption="Evidence of alternative splicing for exons with tissue-dependent usage. For each subset of the GTEx data (first column), the number of exonic regions with TDU (third column) is stratified according to their means of normalized sequenced fragments supporting their splicing out from transcripts (second column). The fourth column shows the percentage of exonic regions in each strata for each subset of data", label="tabS:table4", align=c("p{0.05\\textwidth}", "p{0.1\\textwidth}", "p{0.25\\textwidth}", "p{0.25\\textwidth}", "p{0.25\\textwidth}") ), hline.after=c(-1, 0, 5, 10, nrow(asSupportDF) ), format.args = list(big.mark = ",", decimal.mark = "."), digits=0, include.rownames=FALSE, file="table4.tex" )
Out of the genes with at least one exon being differentially used across tissues (on each subset), for how many of them alternative splicing can explain all the exon usage differences across tissues?
for( subset in c("A", "B", "C") ){ lab <- paste0( "subset", subset ) deuAs <- paste0( "deuAsSupport", subset ) asExons <- filter( resultsDF, label == lab, padj < 0.1, tdu > 1, esMeans > 1 ) %>% count(gene) exonsPerGene[[deuAs]] <- 0 exonsPerGene[as.character( asExons$gene ),deuAs] <- asExons$n } asSupportGenes <- sapply( c("A", "B", "C"), function(subset){ deuExons <- paste0("deuExons", subset ) deuExonsAs <- paste0("deuAsSupport", subset ) numDeuEx <- sum( exonsPerGene[,deuExons] > 0 ) numAsDeuEx <- sum( exonsPerGene[,deuExonsAs] == exonsPerGene[,deuExons] & exonsPerGene[,deuExons] > 0 ) c( `# of genes with TDU`=numDeuEx, `# of genes with TDU (fully AS explained)`=numAsDeuEx, `% of genes with TDU (fully AS explained)`=round( numAsDeuEx/numDeuEx*100 ) ) }) asSupportGenes print( xtable( asSupportGenes, display=c("s", "d", "d", "d"), caption="Genes with TDEU that could be explained by alternative splicing. Each column shows data for a subset of the GTEx data. The first row shows the number of genes with tissue-dependent usage in at least one exonic region. The second row shows the subset of genes from the first row in which all the exonic regions that are used in a tissue-dependent manner have evidence of being alternatively spliced (normalized mean across samples of exon-skipping reads larger than 1). The third row shows the same quantity as the second row but expressed in percentage of genes.", label="tabS:table5" ), format.args = list(big.mark = ",", decimal.mark = "."), digits=0, include.rownames=TRUE, file="table5.tex")
We now define a function to plot the $R^{2}$ coefficients of the fits done using the REUCs and the RSICs as response variables.
plotR2R2 <- function( varExplained, results, sub ){ varResults <- filter( varExplained, subset == sub ) results <- filter( results, label == sub ) otherMeans <- results$esMeans names( otherMeans ) <- with( results, paste( gene, exon, sep=":")) tissueSign <- filter( results, padj < 0.1 )[,c("gene", "exon")] tissueSign <- paste( tissueSign$gene, tissueSign$exon, sep=":" ) varResults <- split( varResults, varResults$statistic ) stopifnot( all( varResults[["REUC"]]$exon == varResults[["RSIC"]]$exon ) ) stopifnot( all( names(otherMeans) == varResults[["REUC"]]$exon ) ) dfPlot <- as.data.frame( sapply( varResults, function(x){ percentageTissue=x[,c("tissue")] / rowSums( x[,c("tissue", "individual", "residuals" ) ] ) } ) ) rownames( dfPlot ) <- varResults[[1]]$exon dfPlot$Junctions <- cut( otherMeans, c(0, 1, 3, 5, 10, max(otherMeans) ), include.lowest=TRUE ) levels(dfPlot$Junctions)[5] <- ">10" dfPlot <- dfPlot[tissueSign,] p3 <- ggplot( dfPlot, aes( REUC, RSIC, color=Junctions ) ) + geom_point( alpha=.25, size=.1 ) + xlab( expression(paste( R^{2}, "of REUCs" ) ) ) + ylab( expression(paste( R^{2}, "of RSICs" ) ) ) + guides(colour = guide_legend(title="Exon\nskipping\nreads (mean)") ) + theme(legend.position="none") p3 } fig3PanelB <- plotR2R2( varExplained, resultsDF, "subsetA" )
As an additional check, we also chech the distribution of correlations between REUCs and RSICs for each exon, stratifying by the evidence of alternative splicing (exon-skipping reads).
plotCorDensities <- function( results, subset ){ dfHist <- filter( resultsDF, label == subset ) dfHist <- filter( resultsDF, padj < 0.1, tdu > 1 ) dfHist$Junctions <- cut( dfHist$esMeans, c(0, 1, 3, 5, 10, max( dfHist$esMeans )), include.lowest=TRUE) levels(dfHist$Junctions)[5] <- ">10" gp <- ggplot(dfHist) + stat_ecdf(aes(corRiscReuc, #y=..scaled.., group=Junctions, colour = Junctions), geom="line", lwd=1.3) + xlim(-0.4, 1) + xlab("Correlation") + ylab("Cumulative\ndistribution") + guides( colour = guide_legend( title="Exon skipping reads"), fill = FALSE ) + theme(legend.position="none") gp } fig3PanelC <- plotCorDensities( resultsDF, "subsetA" )
Panel corresponding to figure 3
#save_plot( file.path( path, "plots", "figure3.png"), plot_grid( fig3PanelA, fig3PanelB, fig3PanelC, ncol=3, align='h', labels="AUTO")#, #base_height=2.7, base_aspect_ratio=3 )
Corresponding supplementary fig.
#save_plot( file.path(path, "plots", "FigS6.png"), plot_grid( varPlot( resultsDF, "subsetB" ), plotR2R2( varExplained, resultsDF, "subsetB" ), plotCorDensities( resultsDF, "subsetB" ), varPlot( resultsDF, "subsetC" ), plotR2R2( varExplained, resultsDF, "subsetC" ), plotCorDensities( resultsDF, "subsetC" ), ncol=3, align="hv", labels="AUTO")#, #base_height=5, base_aspect_ratio=1.7 )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.