In areyesq89/HumanTissuesDEU: Analysis of tissue-dependent exon usage in humans

knitr::opts_chunk$set(tidy = FALSE,
                      cache = FALSE,
                      dev = "png",
                      message = FALSE,
                      error = FALSE,
                      warning = TRUE)

Analysis of variance of REUCs and RSICs

Precomputed values

Loading all the data from the HumanDEU package.

library( DEXSeq )
library( dplyr )
library( HumanDEU )
data( "resultsDF" )
data( "crossCoefs1", "crossCoefs2", "crossCoefs3",
     "crossCoefsJR1", "crossCoefsJR2", "crossCoefsJR3" )

With the function below, we fit a linear model on the REUCs and the RSICs using the tissues and individuals as categorical predictors. Then, this function calculates the coefficients of partial determination of these fits on each of the GTEx datasets.

estimateR2 <- function( crossCoefs, statistic, subset, BPPARAM=SerialParam() ){
    df <- data.frame(
        individual=rep( rownames(crossCoefs[1,,]), ncol( crossCoefs[1,,] ) ),
        tissue=rep( colnames(crossCoefs[1,,]), each=nrow( crossCoefs[1,,] ) ),
        stringsAsFactors=FALSE )
    df$reuc <- NA
    r2 <- bplapply(
        seq_along(dimnames(crossCoefs)[["exon"]]),
        function(x, dfL){
            df$reuc <- as.vector(crossCoefs[x,,])
            if( any( is.na( df$reuc ) ) ){
                return(rep(NA, 3))
            }else{
                anovaRes <- anova( lm( reuc ~ individual + tissue, df ) )
                beta <- anovaRes[c("individual", "tissue", "Residuals"),"Sum Sq"]
                return( beta )
            }
        }, dfL=df, BPPARAM=BPPARAM )
    res <- do.call(rbind, r2)
    colnames(res) <- c("individual", "tissue", "residuals")
    rownames(res) <- dimnames(crossCoefs)[["exon"]]
    res <- as.data.frame( res )
    res$statistic <- statistic
    res$subset <- subset
    res$exon <- rownames(res)
    res
}

However, since this calculations would take long to complete, this package provides the precomputed results. These results were generated by running the code below.

#BPPARAM <- SnowParam(7)
BPARAM <- SerialParam()
varExplained1 <- estimateR2( crossCoefs1, "REUC", "subsetA", BPPARAM )
varExplained2 <- estimateR2( crossCoefs2, "REUC", "subsetB", BPPARAM )
varExplained3 <- estimateR2( crossCoefs3, "REUC", "subsetC", BPPARAM )
varExplainedJR1 <- estimateR2( crossCoefsJR1, "RSIC", "subsetA", BPPARAM )
varExplainedJR2 <- estimateR2( crossCoefsJR2, "RSIC", "subsetB", BPPARAM )
varExplainedJR3 <- estimateR2( crossCoefsJR3, "RSIC", "subsetC", BPPARAM )

varExplained <- rbind( varExplained1, varExplained2, varExplained3,
                      varExplainedJR1, varExplainedJR2, varExplainedJR3 )
rownames( varExplained ) <- NULL

The code below makes sure that the output of the function is the same as the object saved in this package.

data("varExplained")

all( head( varExplained, 5 ) ==
        estimateR2( crossCoefs1[1:5,,],
                   statistic="REUC",
                   subset="subsetA",
                   SerialParam() ) )

Similarly, we compute the correlation between REUCs and RSICs for the three subsets of the GTEx data.

estimateCors <- function( cc, ccJR, BPPARAM ){
    res <- bplapply(
        dimnames(cc)[["exon"]],
        function( ex, ccL, ccJRL ){
            cor( as.vector( ccL[ex,,] ), as.vector( ccJRL[ex,,] ) )
        }, ccL=cc, ccJR=ccJR, BPPARAM=BPPARAM )
    names( res ) <- dimnames(cc)[["exon"]]
    unlist( res )
}

Similar to other sections, we don't estimate all the correlation coefficients for each exon because it is computationally expensive. Instead the object resultsDF already contains the pre-computed results.

BPPARAM <- SerialParam()
BPPARAM <- SnowParam(6)

cors1 <- estimateCors( crossCoefs1, crossCoefsJR1, BPPARAM )
cors2 <- estimateCors( crossCoefs2, crossCoefsJR2, BPPARAM )
cors3 <- estimateCors( crossCoefs3, crossCoefsJR3, BPPARAM )

resultsDF$corRiscReuc <- NA

for( i in 1:3 ){
    sub <- paste0( "subset", LETTERS[i] )
    corOb <- get( paste0( "cors", i ) )
    stopifnot(
        with( resultsDF[resultsDF$label == sub,],paste( gene, exon, sep=":" )  ) == names( corOb ) )
    resultsDF[resultsDF$label == sub,"corRiscReuc"] <- corOb
}

#save( resultsDF, file="../data/resultsDF.RData" )

We verify that the code indeed reproduces identical correlation coefficients as the ones pre-computed.

data("resultsDF")

all( resultsDF$corRiscReuc[1:100] ==
    estimateCors(
        crossCoefs1[1:100,,],
        crossCoefsJR1[1:100,,],
        SerialParam() ) )

Print statistics on tables and defining functions

We load the necessary libraries to continue with the analysis.

library(DEXSeq)
library(ggplot2)
library(dplyr)
library(xtable)
library(cowplot)
library(scales)
data("resultsGeneTable")

Next, we defined a function to plot a barplot of the number of exon-skipping reads for DEU exons.

varPlot <- function( results, subset="" ){
    results <- filter( results, label == subset, padj < 0.1, tdu > 1 )
    otherMeans <- results$esMeans
    numExons <- table( cut( otherMeans,
                           c(0, 1, 3, 5, 10, max(otherMeans)),
                           include.lowest=TRUE ) )
    df <- data.frame(numExons)
    colnames( df ) <- c("Exon skipping\nreads", "Number")
    levels(df$`Exon skipping\nreads`)[5] <- ">10"
    pl <- ggplot(df, aes(`Exon skipping\nreads`, Number,
                         fill=`Exon skipping\nreads`)) +
        geom_bar(stat="identity") +
        ylab("Exonic regions") +
        xlab("Exon skipping reads") +
        scale_y_continuous(labels = comma) +
        theme( legend.position="none",
              axis.text.x = element_text(angle = 35, hjust = 1) )
    pl
}

fig3PanelA <- varPlot( resultsDF, "subsetA" )

Out of the differentially used exons in each subset of GTEX, what is the evidence for the effect being driven by alternative splicing?

asSupportDF <-
    do.call(rbind,
            lapply( c("subsetA", "subsetB", "subsetC"),
                   function(x){
                       vData <- varPlot( resultsDF, x )$data
                       cbind( Subset=x, vData,
                             Percentage=round(100*vData$Number/sum(vData$Number) ) )
                   }) )
levels( asSupportDF$Subset ) <- c( "A", "B", "C")
colnames( asSupportDF )[2] <- "Exon skipping reads (mean)"
colnames( asSupportDF )[3] <- "# of exonic regions"
colnames( asSupportDF )[4] <- "% of exonic regions"

asSupportDF

print( xtable( asSupportDF, display=c("s", "s", "s", "d", "d") ,
                caption="Evidence of alternative splicing for exons with
  tissue-dependent usage. For each subset of the GTEx data (first column), the
  number of exonic regions with TDU (third column) is stratified
  according to their means of normalized sequenced fragments supporting
  their splicing out from transcripts (second column). The fourth column
  shows the percentage of exonic regions in each strata for each
  subset of data",
  label="tabS:table4",
  align=c("p{0.05\\textwidth}", "p{0.1\\textwidth}",
          "p{0.25\\textwidth}", "p{0.25\\textwidth}",
          "p{0.25\\textwidth}") ),
  hline.after=c(-1, 0, 5, 10, nrow(asSupportDF) ),
  format.args = list(big.mark = ",", decimal.mark = "."),
  digits=0, include.rownames=FALSE,
  file="table4.tex" )

Out of the genes with at least one exon being differentially used across tissues (on each subset), for how many of them alternative splicing can explain all the exon usage differences across tissues?

for( subset in c("A", "B", "C") ){
    lab <- paste0( "subset", subset )
    deuAs <- paste0( "deuAsSupport", subset )
    asExons <- filter( resultsDF, label == lab,
                      padj < 0.1, tdu > 1,
                      esMeans > 1 ) %>% count(gene)
    exonsPerGene[[deuAs]] <- 0
    exonsPerGene[as.character( asExons$gene ),deuAs] <- asExons$n
}

asSupportGenes <- sapply( c("A", "B", "C"), function(subset){
    deuExons <- paste0("deuExons", subset )
    deuExonsAs <- paste0("deuAsSupport", subset )
    numDeuEx <- sum( exonsPerGene[,deuExons] > 0 )
    numAsDeuEx <- sum( exonsPerGene[,deuExonsAs] == exonsPerGene[,deuExons] &
                       exonsPerGene[,deuExons] > 0 )
    c( `# of genes with TDU`=numDeuEx,
      `# of genes with TDU (fully AS explained)`=numAsDeuEx,
      `% of genes with TDU (fully AS explained)`=round( numAsDeuEx/numDeuEx*100 ) )
})

asSupportGenes

print( xtable( asSupportGenes, display=c("s", "d", "d", "d"),
                caption="Genes with TDEU that could be explained by alternative splicing.
  Each column shows data for a subset of the GTEx data.
  The first row shows the number of genes with tissue-dependent
  usage in at least one exonic region. The second row shows the
  subset of genes from the first row in which all the exonic regions
  that are used in a tissue-dependent manner have evidence of being
  alternatively spliced (normalized mean across samples of
  exon-skipping reads larger than 1). The third row shows the
  same quantity as the second row but expressed in percentage of genes.",
  label="tabS:table5" ),
  format.args = list(big.mark = ",", decimal.mark = "."),
  digits=0, include.rownames=TRUE,
  file="table5.tex")

We now define a function to plot the $R^{2}$ coefficients of the fits done using the REUCs and the RSICs as response variables.

plotR2R2 <- function( varExplained, results, sub ){
    varResults <-
        filter( varExplained, subset == sub )
    results <- filter( results, label == sub )
    otherMeans <- results$esMeans
    names( otherMeans ) <- with( results, paste( gene, exon, sep=":"))
    tissueSign <- filter( results, padj < 0.1 )[,c("gene", "exon")]
    tissueSign <- paste( tissueSign$gene, tissueSign$exon, sep=":" )
    varResults <- split( varResults, varResults$statistic )
    stopifnot( all( varResults[["REUC"]]$exon == varResults[["RSIC"]]$exon ) )
    stopifnot( all( names(otherMeans) == varResults[["REUC"]]$exon ) )
    dfPlot <- as.data.frame( sapply( varResults, function(x){
        percentageTissue=x[,c("tissue")] /
            rowSums( x[,c("tissue", "individual", "residuals" ) ] ) } ) )
    rownames( dfPlot ) <- varResults[[1]]$exon
    dfPlot$Junctions <- cut( otherMeans, c(0, 1, 3, 5, 10, max(otherMeans) ),
                            include.lowest=TRUE )
    levels(dfPlot$Junctions)[5] <- ">10"
    dfPlot <- dfPlot[tissueSign,]
    p3 <- ggplot( dfPlot, aes( REUC, RSIC,
                              color=Junctions ) ) +
        geom_point( alpha=.25, size=.1 ) +
        xlab( expression(paste( R^{2}, "of REUCs" ) ) ) +
        ylab( expression(paste( R^{2}, "of RSICs" ) ) ) +
        guides(colour = guide_legend(title="Exon\nskipping\nreads (mean)") ) +
        theme(legend.position="none")
    p3
}

fig3PanelB <- plotR2R2( varExplained, resultsDF, "subsetA" )

As an additional check, we also chech the distribution of correlations between REUCs and RSICs for each exon, stratifying by the evidence of alternative splicing (exon-skipping reads).

plotCorDensities <- function( results, subset ){
    dfHist <- filter( resultsDF, label == subset )
    dfHist <- filter( resultsDF, padj < 0.1, tdu > 1 )
    dfHist$Junctions <- cut( dfHist$esMeans, c(0, 1, 3, 5, 10, max( dfHist$esMeans )),
                            include.lowest=TRUE)
    levels(dfHist$Junctions)[5] <- ">10"
    gp <- ggplot(dfHist) +
        stat_ecdf(aes(corRiscReuc, #y=..scaled..,
                      group=Junctions, colour = Junctions),
                  geom="line", lwd=1.3) +
        xlim(-0.4, 1) +
        xlab("Correlation") +
        ylab("Cumulative\ndistribution") +
        guides(
            colour = guide_legend( title="Exon skipping reads"),
            fill = FALSE ) +
        theme(legend.position="none")
    gp
}


fig3PanelC <- plotCorDensities( resultsDF, "subsetA" )

Print plots in panels

Panel corresponding to figure 3

#save_plot( file.path( path, "plots", "figure3.png"),
plot_grid( fig3PanelA, fig3PanelB, fig3PanelC, ncol=3,
          align='h', labels="AUTO")#,
                                        #base_height=2.7, base_aspect_ratio=3 )

Corresponding supplementary fig.

#save_plot( file.path(path, "plots", "FigS6.png"),
plot_grid(
    varPlot( resultsDF, "subsetB" ),
    plotR2R2( varExplained, resultsDF, "subsetB" ),
    plotCorDensities( resultsDF, "subsetB" ),
    varPlot( resultsDF, "subsetC" ),
    plotR2R2( varExplained, resultsDF, "subsetC" ),
    plotCorDensities( resultsDF, "subsetC" ), ncol=3,
    align="hv", labels="AUTO")#,
          #base_height=5, base_aspect_ratio=1.7 )