FRASER.R
In FRASER: Find RAre Splicing Events in RNA-Seq Data

## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"--------------------
BiocStyle::latex()

## ----parallel, echo=FALSE, cache=FALSE, results="hide"---------------------
library(BiocParallel)
# for speed we run everything in serial mode
register(SerialParam())

## ----knitr, echo=FALSE, cache=FALSE, results="hide"------------------------
library(knitr)
opts_chunk$set(
    tidy=FALSE,
    dev="png",
    fig.width=7, 
    fig.height=7,
    dpi=300,
    message=FALSE,
    warning=FALSE,
    cache=FALSE
)

## ----include=FALSE---------------------------------------------------------
opts_chunk$set(concordance=TRUE)

## ----loadFraser, echo=FALSE------------------------------------------------
suppressPackageStartupMessages({
    library(FRASER)
})

## ----quick_fraser_guide, echo=TRUE-----------------------------------------
# load FRASER library
library(FRASER)

# count data
fds <- createTestFraserSettings()
fds <- countRNAData(fds)
fds

# compute stats
fds <- calculatePSIValues(fds)

# filtering junction with low expression
fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20,
       minDeltaPsi=0.0, filter=TRUE)

# fit the splicing model for each metric 
# with a specific latentsapce dimension
fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3))

# we provide two ways to anntoate introns with the corresponding gene symbols:
# the first way uses TxDb-objects provided by the user as shown here
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgDb <- org.Hs.eg.db
fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)

# alternatively, we also provide a way to use biomart for the annotation:
# fds <- annotateRanges(fds)

# get results: we recommend to use an FDR cutoff 0.05, but due to the small
# dataset size we extract all events and their associated values
# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3)
res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA)
res

# result visualization
plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE)


## ----sampleData Table, echo=TRUE-------------------------------------------
sampleTable <- fread(system.file(
    "extdata", "sampleTable.tsv", package="FRASER", mustWork=TRUE))
head(sampleTable)

## ----FRASER setting example1, echo=TRUE------------------------------------
# convert it to a bamFile list
bamFiles <- system.file(sampleTable[,bamFile], package="FRASER", mustWork=TRUE)
sampleTable[,bamFile:=bamFiles]

# create FRASER object
settings <- FraserDataSet(colData=sampleTable,
        workingDir=tempdir())

# show the FraserSettings object
settings

## ----FRASER setting example2, echo=TRUE------------------------------------
settings <- createTestFraserSettings()
settings

## ----parallelize example, eval=FALSE---------------------------------------
#  # example of how to use parallelization: use 10 cores or the maximal number of
#  # available cores if fewer than 10 are available and use Snow if on Windows
#  if(.Platform$OS.type == "unix") {
#      register(MulticoreParam(workers=min(10, multicoreWorkers())))
#  } else {
#      register(SnowParam(workers=min(10, multicoreWorkers())))
#  }

## ----counting reads, echo=TRUE---------------------------------------------
# count reads
fds <- countRNAData(settings)
fds

## ----create fds with counts, echo=TRUE-------------------------------------
# example sample annoation for precalculated count matrices
sampleTable <- fread(system.file("extdata", "sampleTable_countTable.tsv",
        package="FRASER", mustWork=TRUE))
head(sampleTable)

# get raw counts
junctionCts   <- fread(system.file("extdata", "raw_junction_counts.tsv.gz",
        package="FRASER", mustWork=TRUE))
head(junctionCts)

spliceSiteCts <- fread(system.file("extdata", "raw_site_counts.tsv.gz",
        package="FRASER", mustWork=TRUE))
head(spliceSiteCts)

# create FRASER object
fds <- FraserDataSet(colData=sampleTable, junctions=junctionCts,
        spliceSites=spliceSiteCts, workingDir=tempdir())
fds

## ----calculate psi/zscore values, echo=TRUE--------------------------------
fds <- calculatePSIValues(fds)
fds

## ----filter_junctions, echo=TRUE-------------------------------------------
fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.0, filter=FALSE)

plotFilterExpression(fds, bins=100)

## ----subset to filtered junctions, echo=TRUE-------------------------------
fds_filtered <- fds[mcols(fds, type="j")[,"passed"],]
fds_filtered
# filtered_fds not further used for this tutorial because the example dataset
# is otherwise too small

## ----sample_covariation, echo=TRUE-----------------------------------------
# Heatmap of the sample correlation
plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE)

## ----intron_sample_correlation, echo=TRUE, eval=FALSE----------------------
#  # Heatmap of the intron/sample expression
#  plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE,
#      plotType="junctionSample", topJ=100, minDeltaPsi = 0.01)

## ----model fitting, echo=TRUE----------------------------------------------
# This is computational heavy on real size datasets and can take awhile
fds <- FRASER(fds, q=c(psi5=3, psi3=5, theta=2))

## ----covariation_after_fitting, echo=TRUE----------------------------------
plotCountCorHeatmap(fds, type="psi5", normalized=TRUE, logit=TRUE)

## ----result table, echo=TRUE-----------------------------------------------
# annotate introns with the HGNC symbols of the corresponding gene
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgDb <- org.Hs.eg.db
fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
# fds <- annotateRanges(fds) # alternative way using biomaRt

# retrieve results with default and recommended cutoffs (padj <= 0.05 and
# |deltaPsi| >= 0.3)
res <- results(fds)

## ----result_table, echo=TRUE-----------------------------------------------
# to show result visualization functions for this tuturial, zScore cutoff used
res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1)
res

## ----finding_candidates, echo=TRUE-----------------------------------------
plotVolcano(fds, type="psi5", "sample10")

## ----sample result, echo=TRUE----------------------------------------------
sampleRes <- res[res$sampleID == "sample10"]
sampleRes

## ----plot_expression, echo=TRUE, eval=FALSE--------------------------------
#  plotExpression(fds, type="psi5", result=sampleRes[1])
#  plotExpectedVsObservedPsi(fds, result=sampleRes[1])

## ----save_load, echo=TRUE--------------------------------------------------
# saving a fds
workingDir(fds) <- tempdir()
name(fds) <- "ExampleAnalysis"
saveFraserDataSet(fds, dir=workingDir(fds), name=name(fds))

# two ways of loading a fds by either specifying the directory and anaysis name
# or directly giving the path the to fds-object.RDS file
fds <- loadFraserDataSet(dir=workingDir(fds), name=name(fds))
fds <- loadFraserDataSet(file=file.path(workingDir(fds), 
    "savedObjects", "ExampleAnalysis", "fds-object.RDS"))


## ----control confounders, echo=TRUE----------------------------------------
# Using an alternative way to correct splicing ratios
# here: only 2 iteration to speed the calculation up
# for the vignette, the default is 15 iterations
fds <- fit(fds, q=3, type="psi5", implementation="PCA-BB-Decoder", 
            iterations=2)

## ----findBestQ, echo=TRUE--------------------------------------------------
set.seed(42)
# hyperparameter opimization
fds <- optimHyperParams(fds, type="psi5", plot=FALSE)

# retrieve the estimated optimal dimension of the latent space
bestQ(fds, type="psi5")

## ----figure_findBestQ, echo=TRUE-------------------------------------------
plotEncDimSearch(fds, type="psi5")

## ----p-value calculation, echo=TRUE----------------------------------------
fds <- calculatePvalues(fds, type="psi5")
head(pVals(fds, type="psi5"))

## ----p-adj calculation, echo=TRUE------------------------------------------
fds <- calculatePadjValues(fds, type="psi5", method="BY")
head(padjVals(fds,type="psi5"))

## ----z-score calculation, echo=TRUE----------------------------------------
fds <- calculateZscore(fds, type="psi5")
head(zScores(fds, type="psi5"))

## ----result_visualization, echo=TRUE---------------------------------------
plotAberrantPerSample(fds)

# qq-plot for single junction
plotQQ(fds, result=res[1])

# global qq-plot (on gene level since aggregate=TRUE)
plotQQ(fds, aggregate=TRUE, global=TRUE)

## ----sessionInfo, echo=FALSE-----------------------------------------------
sessionInfo()