In jgilis/iSEEtranscripts: Unlocking iSEE for transcript-level visualization

knitr::opts_chunk$set(tidy=FALSE, cache=FALSE, dev="png",
                      message=FALSE, error=FALSE, warning=FALSE)

Quantification of data from @alasoo against GENCODE [@gencode] using Salmon [@salmon].

library(macrophage)
dir <- system.file("extdata", package="macrophage")
coldata <- read.csv(file.path(dir, "coldata.csv"))
coldata <- coldata[,c(1,2,3,5)]
names(coldata) <- c("names","id","line","condition")
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
all(file.exists(coldata$files))

suppressPackageStartupMessages(library(SummarizedExperiment))

# This hidden code chunk is only needed for Bioc build machines,
# so that 'fishpond' will build regardless of whether
# the machine can connect to ftp.ebi.ac.uk.
# Using linkedTxomes to point to a GTF that lives in the macrophage pkg.
# The chunk can be skipped if you have internet connection,
# as tximeta will automatically ID the transcriptome and DL the GTF.
library(tximeta)
makeLinkedTxome(
  indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"),
  source="Gencode",
  organism="Homo sapiens",
  release="29",
  genome="GRCh38",
  fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
  gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
  write=FALSE
)

We load in the quantification data with tximeta:

library(dplyr)
coldata <- coldata %>% filter(condition %in% c("naive","IFNg"))
se <- tximeta(coldata, countsFromAbundance="scaledTPM", varReduce=TRUE)

se <- se[seqnames(se) == "chr1",]
se$condition <- factor(se$condition, c("naive","IFNg"))

library(DRIMSeq)
counts <- data.frame(gene_id=sapply(mcols(se)$gene_id, `[`, 1),
                     feature_id=mcols(se)$tx_name,
                     assays(se)[["counts"]])
samples <- as.data.frame(colData(se))
names(samples)[1] <- "sample_id"
d <- dmDSdata(counts=counts, samples=samples)
n <- 12
n.small <- 6
d <- dmFilter(d,
              min_samps_feature_expr=n.small, min_feature_expr=10,
              min_samps_feature_prop=n.small, min_feature_prop=0.1,
              min_samps_gene_expr=n, min_gene_expr=10)
d

library(DEXSeq)
sample.data <- DRIMSeq::samples(d)
count.data <- round(as.matrix(counts(d)[,-c(1:2)]))
dxd <- DEXSeqDataSet(countData=count.data,
                     sampleData=sample.data,
                     design=~sample + exon + condition:exon,
                     featureID=counts(d)$feature_id,
                     groupID=counts(d)$gene_id)
# this takes a little over a minute on my laptop
system.time({
  dxd <- estimateSizeFactors(dxd)
  dxd <- estimateDispersions(dxd, quiet=TRUE)
  dxd <- testForDEU(dxd, reducedModel=~sample + exon)
})
dxr <- DEXSeqResults(dxd, independentFiltering=FALSE)
qval <- perGeneQValue(dxr)
dxr.g <- data.frame(gene=names(qval),qval)
columns <- c("featureID","groupID","pvalue")
dxr2 <- as.data.frame(dxr[,columns])

library(pheatmap)
pheatmap(log10(as.matrix(dxr[dxr$groupID == names(which.min(qval)),"countData"])+1),
         cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, show_colnames=FALSE)

stageR for stagewise testing

library(stageR)
pConfirmation <- matrix(dxr$pvalue,ncol=1)
dimnames(pConfirmation) <- list(dxr$featureID,"transcript")
pScreen <- qval
names(pScreen) <- names(pScreen)
tx2gene <- as.data.frame(dxr[,c("featureID", "groupID")])
stageRObj <- stageRTx(pScreen=pScreen, pConfirmation=pConfirmation,
                      pScreenAdjusted=TRUE, tx2gene=tx2gene)
stageRObj <- stageWiseAdjustment(stageRObj, method="dtu", alpha=0.05)
dex.padj <- getAdjustedPValues(stageRObj, order=TRUE,
                               onlySignificantGenes=FALSE)
head(dex.padj, n=10)

We talked about having the test statistics available within the RangedSummarizedExperiment prior to calling iSEE. For now, I will just add the DEXSeq + stageR adjusted p-values to the rowData.

se <- se[rownames(se) %in% dex.padj$txID,] # filter se based on DRIMSeq filtering? Do we want that as a general rule?
rowData(se)[,c("gene_padj", "tx_padj")] <- dex.padj[match(rownames(se),dex.padj$txID),c("gene", "transcript")]

Visualize raw usages as proportions

library(iSEE)

# Helper to compute proportions (based on DEXSeq's classes.R)
.getTotalCount <- function(countData, tx2gene) {
    geneForEachTx <- as.character(tx2gene$gene_id[match(rownames(countData), tx2gene$tx_name)])
    forCycle <- split(seq_len(nrow(countData)), as.character(geneForEachTx))
    all <- lapply(forCycle, function(i) {
        sct <- countData[i, , drop = FALSE]
        rs <- t(vapply(seq_len(nrow(sct)), function(r) colSums(sct[, , drop = FALSE]), numeric(ncol(countData))))
        # adapted, removed "-r" to get gene-level counts
        rownames(rs) <- rownames(sct)
        rs
    })
    totalCount <- do.call(rbind, all)
    totalCount <- totalCount[rownames(countData), ]
    return(totalCount)
}

# Main function, relies on iSEE::FeatureAssayPlot
ProportionsPlot <- function(se,
                            transcript, 
                            XAxisColumnData, 
                            PanelWidth=6L){

    totalCount <- .getTotalCount(countData = assays(se)$counts, 
                                 tx2gene = rowData(se))
    assays(se)$proportions <- assays(se)$counts/totalCount

    # directly borrow FeatureAssayPlot from iSEE:
    init <- iSEE::FeatureAssayPlot(YAxisFeatureName=transcript, 
                                   Assay="proportions", 
                                   XAxis= "Column data", 
                                   XAxisColumnData = XAxisColumnData, 
                                   PanelWidth=6L)

    # !proportions assay only available in local function environment
    app <- iSEE(se, initial=list(init))
    return(app)
}

app <- ProportionsPlot(se = se,
                       transcript = "ENST00000486652.5", 
                       XAxisColumnData = "condition", 
                       PanelWidth=6L)