inst/doc/CLI_tutorial.R

## ----install, eval=FALSE------------------------------------------------------
#  install.packages("BiocManager")
#  BiocManager::install("psichomics")

## ----load, message=FALSE------------------------------------------------------
library(psichomics)

## ----TCGA options-------------------------------------------------------------
# Available tumour types
cohorts <- getFirebrowseCohorts()

# Available sample dates
date <- getFirebrowseDates()

# Available data types
dataTypes <- getFirebrowseDataTypes()

## ----download, eval=FALSE-----------------------------------------------------
#  # Set download folder
#  folder <- getDownloadsFolder()
#  
#  # Download and load most recent junction quantification and clinical data from
#  # TCGA/Firebrowse for Breast Cancer
#  data <- loadFirebrowseData(folder=folder,
#                             cohort="BRCA",
#                             data=c("clinical", "junction_quantification",
#                                    "RSEM_genes"),
#                             date="2016-01-28")
#  names(data)
#  names(data[[1]])
#  
#  # Select clinical and junction quantification dataset
#  clinical      <- data[[1]]$`Clinical data`
#  sampleInfo    <- data[[1]]$`Sample metadata`
#  junctionQuant <- data[[1]]$`Junction quantification (Illumina HiSeq)`
#  geneExpr      <- data[[1]]$`Gene expression`

## ----prepare examples, include=FALSE------------------------------------------
clinical <- readRDS("BRCA_clinical.RDS")
geneExpr <- readRDS("BRCA_geneExpr.RDS")

## ----normalise gene expression------------------------------------------------
# Check genes with 10 or more counts in at least some samples and 15 or more
# total counts across all samples
filter <- filterGeneExpr(geneExpr, minCounts=10, minTotalCounts=15)

# What normaliseGeneExpression() does:
# 1) Filter gene expression
# 2) Normalise gene expression with edgeR::calcNormFactors (internally) using
#    the trimmed mean of M-values (TMM) method (by default)
# 3) Calculate log2-counts per million (logCPM)
geneExprNorm <- normaliseGeneExpression(geneExpr,
                                        geneFilter=filter,
                                        method="TMM",
                                        log2transform=TRUE)

## ----quantify options---------------------------------------------------------
# Available alternative splicing annotation
annotList <- listSplicingAnnotations()
annotList

## ----prepare to quantify splicing, eval=FALSE---------------------------------
#  # Load Human (hg19/GRCh37 assembly) annotation
#  hg19 <- listSplicingAnnotations(assembly="hg19")[[1]]
#  annotation <- loadAnnotation(hg19)

## ----event types--------------------------------------------------------------
# Available alternative splicing event types (skipped exon, alternative 
# first/last exon, mutually exclusive exons, etc.)
getSplicingEventTypes()

## ----quantify splicing, eval=FALSE--------------------------------------------
#  # Discard alternative splicing quantified using few reads
#  minReads <- 10 # default
#  
#  psi <- quantifySplicing(annotation, junctionQuant, minReads=minReads)

## ----load splicing, echo=FALSE------------------------------------------------
psi <- readRDS("BRCA_psi.RDS")
sampleInfo <- parseTCGAsampleInfo(colnames(psi))

## ----check splicing events----------------------------------------------------
# Check the identifier of the splicing events in the resulting table
events <- rownames(psi)
head(events)

## ----data grouping------------------------------------------------------------
# Group by normal and tumour samples
types  <- createGroupByAttribute("Sample types", sampleInfo)
normal <- types$`Solid Tissue Normal`
tumour <- types$`Primary solid Tumor`

# Group by tumour stage (I, II, III or IV) or normal samples
stages <- createGroupByAttribute(
    "patient.stage_event.pathologic_stage_tumor_stage", clinical)
groups <- list()
for (i in c("i", "ii", "iii", "iv")) {
    stage <- Reduce(union,
           stages[grep(sprintf("stage %s[a|b|c]{0,1}$", i), names(stages))])
    # Include only tumour samples
    stageTumour <- names(getSubjectFromSample(tumour, stage))
    elem <- list(stageTumour)
    names(elem) <- paste("Tumour Stage", toupper(i))
    groups <- c(groups, elem)
}
groups <- c(groups, Normal=list(normal))

# Prepare group colours (for consistency across downstream analyses)
colours <- c("#6D1F95", "#FF152C", "#00C7BA", "#FF964F", "#00C65A")
names(colours) <- names(groups)
attr(groups, "Colour") <- colours

# Prepare normal versus tumour stage I samples
normalVSstage1Tumour <- groups[c("Tumour Stage I", "Normal")]
attr(normalVSstage1Tumour, "Colour") <- attr(groups, "Colour")

# Prepare normal versus tumour samples
normalVStumour <- list(Normal=normal, Tumour=tumour)
attr(normalVStumour, "Colour") <- c(Normal="#00C65A", Tumour="#EFE35C")

## ----perform pca--------------------------------------------------------------
# PCA of PSI between normal and tumour stage I samples
psi_stage1Norm    <- psi[ , unlist(normalVSstage1Tumour)]
pcaPSI_stage1Norm <- performPCA(t(psi_stage1Norm))

## ----plot pca-----------------------------------------------------------------
# Explained variance across principal components
plotPCAvariance(pcaPSI_stage1Norm)

# Score plot (clinical individuals)
plotPCA(pcaPSI_stage1Norm, groups=normalVSstage1Tumour)

# Loading plot (variable contributions)
plotPCA(pcaPSI_stage1Norm, loadings=TRUE, individuals=FALSE,
        nLoadings=100)

## ----pca contribution---------------------------------------------------------
# Table of variable contributions (as used to plot PCA, also)
table <- calculateLoadingsContribution(pcaPSI_stage1Norm)
head(table, 5)

## ----perform and plot remaining pca, eval=FALSE-------------------------------
#  # PCA of PSI between all samples (coloured by tumour stage and normal samples)
#  pcaPSI_all <- performPCA(t(psi))
#  plotPCA(pcaPSI_all, groups=groups)
#  plotPCA(pcaPSI_all, loadings=TRUE, individuals=FALSE)
#  
#  # PCA of gene expression between all samples (coloured by tumour stage and
#  # normal samples)
#  pcaGE_all <- performPCA(t(geneExprNorm))
#  plotPCA(pcaGE_all, groups=groups)
#  plotPCA(pcaGE_all, loadings=TRUE, individuals=FALSE)
#  
#  # PCA of gene expression between normal and tumour stage I samples
#  ge_stage1Norm    <- geneExprNorm[ , unlist(normalVSstage1Tumour)]
#  pcaGE_stage1Norm <- performPCA(t(ge_stage1Norm))
#  plotPCA(pcaGE_stage1Norm, groups=normalVSstage1Tumour)
#  plotPCA(pcaGE_stage1Norm, loadings=TRUE, individuals=FALSE)

## ----diff splicing NUMB exon 12-----------------------------------------------
# Find the right event
ASevents <- rownames(psi)
(tmp     <- grep("NUMB", ASevents, value=TRUE))
NUMBskippedExon12 <- tmp[1]

# Plot the representation of NUMB exon 12 inclusion
plotSplicingEvent(NUMBskippedExon12)

# Plot its PSI distribution
plotDistribution(psi[NUMBskippedExon12, ], normalVStumour)

## ----correlation, warning=FALSE-----------------------------------------------
# Find the right gene
genes <- rownames(geneExprNorm)
(tmp  <- grep("QKI", genes, value=TRUE))
QKI   <- tmp[1] # "QKI|9444"

# Plot its gene expression distribution
plotDistribution(geneExprNorm[QKI, ], normalVStumour, psi=FALSE)
plotCorrelation(correlateGEandAS(
    geneExprNorm, psi, QKI, NUMBskippedExon12, method="spearman"))

## ----exploratory diff analysis, message=FALSE---------------------------------
diffSplicing <- diffAnalyses(psi, normalVSstage1Tumour)

# Filter based on |∆ Median PSI| > 0.1 and q-value < 0.01
deltaPSIthreshold <- abs(diffSplicing$`∆ Median`) > 0.1
pvalueThreshold   <- diffSplicing$`Wilcoxon p-value (BH adjusted)` < 0.01
eventsThreshold <- diffSplicing[deltaPSIthreshold & pvalueThreshold, ]

# Plot results
library(ggplot2)
ggplot(diffSplicing, aes(`∆ Median`, 
                         -log10(`Wilcoxon p-value (BH adjusted)`))) +
    geom_point(data=eventsThreshold,
               colour="orange", alpha=0.5, size=3) + 
    geom_point(data=diffSplicing[!deltaPSIthreshold | !pvalueThreshold, ],
               colour="gray", alpha=0.5, size=3) + 
    theme_light(16) +
    ylab("-log10(q-value)")

# Table of events that pass the thresholds
head(eventsThreshold)

## ----survival-----------------------------------------------------------------
# Events already tested which have prognostic value
events <- c(
    "SE_9_+_6486925_6492303_6492401_6493826_UHRF2",
    "SE_4_-_87028376_87024397_87024339_87023185_MAPK10",
    "SE_2_+_152324660_152324988_152325065_152325155_RIF1",
    "SE_2_+_228205096_228217230_228217289_228220393_MFF",
    "MXE_15_+_63353138_63353397_63353472_63353912_63353987_63354414_TPM1",
    "SE_2_+_173362828_173366500_173366629_173368819_ITGA6",
    "SE_1_+_204957934_204971724_204971876_204978685_NFASC")

# Survival curves based on optimal PSI cutoff
library(survival)

# Assign alternative splicing quantification to patients based on their samples
samples <- colnames(psi)
match <- getSubjectFromSample(samples, clinical, sampleInfo=sampleInfo)

survPlots <- list()
for (event in events) {
    # Find optimal cutoff for the event
    eventPSI <- assignValuePerSubject(psi[event, ], match, clinical,
                                      samples=unlist(tumour))
    opt <- optimalSurvivalCutoff(clinical, eventPSI, censoring="right", 
                                 event="days_to_death", 
                                 timeStart="days_to_death")
    (optimalCutoff <- opt$par)    # Optimal exon inclusion level
    (optimalPvalue <- opt$value)  # Respective p-value
    
    label     <- labelBasedOnCutoff(eventPSI, round(optimalCutoff, 2), 
                                    label="PSI values")
    survTerms <- processSurvTerms(clinical, censoring="right",
                                  event="days_to_death", 
                                  timeStart="days_to_death",
                                  group=label, scale="years")
    surv <- survfit(survTerms)
    pvalue <- testSurvival(survTerms)
    plotSurvivalCurves(surv, pvalue=pvalue, mark=FALSE)
}

## ----exploratory diff expression, message=FALSE, warning=FALSE----------------
# Prepare groups of samples to analyse and further filter unavailable samples in
# selected groups for gene expression
ge           <- geneExprNorm[ , unlist(normalVSstage1Tumour), drop=FALSE]
isFromGroup1 <- colnames(ge) %in% normalVSstage1Tumour[[1]]
design       <- cbind(1, ifelse(isFromGroup1, 0, 1))

# Fit a gene-wise linear model based on selected groups
library(limma)
fit <- lmFit(ge, design)

# Calculate moderated t-statistics and DE log-odds using limma::eBayes
ebayesFit <- eBayes(fit, trend=TRUE)

# Prepare data summary
pvalueAdjust <- "BH" # Benjamini-Hochberg p-value adjustment (FDR)
summary <- topTable(ebayesFit, number=nrow(fit), coef=2, sort.by="none",
                    adjust.method=pvalueAdjust, confint=TRUE)
names(summary) <- c("log2 Fold-Change", "CI (low)", "CI (high)", 
                    "Average expression", "moderated t-statistics", "p-value", 
                    paste0("p-value (", pvalueAdjust, " adjusted)"),
                    "B-statistics")
attr(summary, "groups") <- normalVSstage1Tumour

# Calculate basic statistics
stats <- diffAnalyses(ge, normalVSstage1Tumour, "basicStats", 
                      pvalueAdjust=NULL)
final <- cbind(stats, summary)

# Differential gene expression between breast tumour stage I and normal samples
library(ggplot2)
library(ggrepel)
cognateGenes <- unlist(parseSplicingEvent(events)$gene)
logFCthreshold  <- abs(final$`log2 Fold-Change`) > 1
pvalueThreshold <- final$`p-value (BH adjusted)` < 0.01

final$genes <- gsub("\\|.*$", "\\1", rownames(final))
ggplot(final, aes(`log2 Fold-Change`, 
                  -log10(`p-value (BH adjusted)`))) +
    geom_point(data=final[logFCthreshold & pvalueThreshold, ],
               colour="orange", alpha=0.5, size=3) + 
    geom_point(data=final[!logFCthreshold | !pvalueThreshold, ],
               colour="gray", alpha=0.5, size=3) + 
    geom_text_repel(data=final[cognateGenes, ], aes(label=genes),
                    box.padding=0.4, size=5) +
    theme_light(16) +
    ylab("-log10(q-value)")

## ----UHRF2 exon 10 diff splicing----------------------------------------------
# UHRF2 skipped exon 10's PSI values per tumour stage I and normal samples
UHRF2skippedExon10 <- events[1]
plotDistribution(psi[UHRF2skippedExon10, ], normalVSstage1Tumour)

## ----UHRF2 PSI survival-------------------------------------------------------
# Find optimal cutoff for the event
UHRF2skippedExon10 <- events[1]
eventPSI <- assignValuePerSubject(psi[UHRF2skippedExon10, ], match, clinical,
                                  samples=unlist(tumour))
opt <- optimalSurvivalCutoff(clinical, eventPSI, censoring="right", 
                             event="days_to_death", timeStart="days_to_death")
(optimalCutoff <- opt$par)    # Optimal exon inclusion level
(optimalPvalue <- opt$value)  # Respective p-value

label     <- labelBasedOnCutoff(eventPSI, round(optimalCutoff, 2), 
                                label="PSI values")
survTerms <- processSurvTerms(clinical, censoring="right",
                              event="days_to_death", timeStart="days_to_death",
                              group=label, scale="years")
surv <- survfit(survTerms)
pvalue <- testSurvival(survTerms)
plotSurvivalCurves(surv, pvalue=pvalue, mark=FALSE)

## ----UHRF2 GE diff expression-------------------------------------------------
plotDistribution(geneExprNorm["UHRF2", ], normalVSstage1Tumour)

## ----UHRF2 GE survival--------------------------------------------------------
UHRF2ge <- assignValuePerSubject(geneExprNorm["UHRF2", ], match, clinical, 
                                 samples=unlist(tumour))

# Survival curves based on optimal gene expression cutoff
opt <- optimalSurvivalCutoff(clinical, UHRF2ge, censoring="right",
                             event="days_to_death", timeStart="days_to_death")
(optimalCutoff <- opt$par)    # Optimal exon inclusion level
(optimalPvalue <- opt$value)  # Respective p-value

# Process again after rounding the cutoff
roundedCutoff <- round(optimalCutoff, 2)
label     <- labelBasedOnCutoff(UHRF2ge, roundedCutoff, label="Gene expression")
survTerms <- processSurvTerms(clinical, censoring="right",
                              event="days_to_death", timeStart="days_to_death",
                              group=label, scale="years")
surv   <- survfit(survTerms)
pvalue <- testSurvival(survTerms)
plotSurvivalCurves(surv, pvalue=pvalue, mark=FALSE)

## ----load GTEx, eval=FALSE----------------------------------------------------
#  # Check GTEx tissues available based on the sample attributes
#  getGtexTissues(sampleAttr)
#  
#  tissues <- c("blood", "brain")
#  gtex <- loadGtexData("~/Downloads", tissues=tissues)
#  names(gtex)
#  names(gtex[[1]])

## ----load all GTEx, eval=FALSE------------------------------------------------
#  gtex <- loadGtexData("~/Downloads", tissues=NULL)
#  names(gtex)
#  names(gtex[[1]])

## ----load recount, eval=FALSE-------------------------------------------------
#  library(recount)
#  View(recount_abstract)
#  sra <- loadSRAproject("SRP053101")
#  names(sra)
#  names(sra[[1]])

## ----load local, eval=FALSE---------------------------------------------------
#  folder <- "~/Downloads/GTEx/"
#  ignore <- c(".aux.", ".mage-tab.") # File patterns to ignore
#  data <- loadLocalFiles(folder, ignore=ignore)[[1]]
#  names(data)
#  names(data[[1]])
#  
#  # Select clinical and junction quantification dataset
#  clinical      <- data[["Clinical data"]]
#  sampleInfo    <- data[["Sample metadata"]]
#  geneExpr      <- data[["Gene expression"]]
#  junctionQuant <- data[["Junction quantification"]]

Try the psichomics package in your browser

Any scripts or data that you put into this service are public.

psichomics documentation built on Nov. 8, 2020, 5:44 p.m.