inst/doc/recount3-quickstart.R

## ----'knitr_options', include = FALSE-----------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE----------------
## Track time spent on making the vignette
startTime <- Sys.time()

## Bib setup
library("knitcitations")

## Load knitcitations with a clean bibliography
cleanbib()
cite_options(hyperlink = "to.doc", citation_format = "text", style = "html")
# Note links won't show for now due to the following issue
# https://github.com/cboettig/knitcitations/issues/63

## Write bibliography information
bib <- c(
    R = citation(),
    BiocFileCache = citation("BiocFileCache"),
    BiocStyle = citation("BiocStyle"),
    knitcitations = citation("knitcitations"),
    knitr = citation("knitr")[3],
    recount3 = citation("recount3")[1],
    recount3paper = citation("recount3")[2],
    rmarkdown = citation("rmarkdown"),
    sessioninfo = citation("sessioninfo"),
    SummarizedExperiment = RefManageR::BibEntry(
        bibtype = "manual",
        key = "SummarizedExperiment",
        author = "Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès",
        title = "SummarizedExperiment: SummarizedExperiment container",
        year = 2019, doi = "10.18129/B9.bioc.SummarizedExperiment"
    ),
    testthat = citation("testthat"),
    covr = citation("covr"),
    RefManageR = citation("RefManageR"),
    pryr = citation("pryr"),
    interactiveDisplayBase = citation("interactiveDisplayBase"),
    rtracklayer = citation("rtracklayer"),
    S4Vectors = citation("S4Vectors"),
    RCurl = citation("RCurl"),
    data.table = citation("data.table"),
    R.utils = citation("R.utils"),
    Matrix = citation("Matrix"),
    GenomicRanges = citation("GenomicRanges"),
    recount2paper = citation("recount")[1],
    recount2workflow = citation("recount")[2],
    recount1paper = citation("recount")[5],
    recountbrain = citation("recount")[6],
    recount2fantom = citation("recount")[7],
    bioconductor2015 = RefManageR::BibEntry(
        bibtype = "Article",
        key = "bioconductor2015",
        author = "Wolfgang Huber and Vincent J Carey and Robert Gentleman and Simon Anders and Marc Carlson and Benilton S Carvalho and Hector Corrada Bravo and Sean Davis and Laurent Gatto and Thomas Girke and Raphael Gottardo and Florian Hahne and Kasper D Hansen and Rafael A Irizarry and Michael Lawrence and Michael I Love and James MacDonald and Valerie Obenchain and Andrzej K Oles and Hervé Pagès and Alejandro Reyes and Paul Shannon and Gordon K Smyth and Dan Tenenbaum and Levi Waldron and Martin Morgan",
        title = "Orchestrating high-throughput genomic analysis with Bioconductor",
        year = 2015, doi = "10.1038/nmeth.3252",
        journal = "Nature Methods",
        journaltitle = "Nat Methods"
    )
)

write.bibtex(bib, file = "quickstartRef.bib")

## ----'install', eval = FALSE--------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  BiocManager::install("recount3")
#  
#  ## Check that you have a valid Bioconductor installation
#  BiocManager::valid()

## ----'install_dev', eval = FALSE----------------------------------------------
#  BiocManager::install("LieberInstitute/recount3")

## ----'citation'---------------------------------------------------------------
## Citation info
citation("recount3")

## ----'start', message=FALSE---------------------------------------------------
## Load recount3 R package
library("recount3")

## ----'quick_example'----------------------------------------------------------
## Find all available human projects
human_projects <- available_projects()

## Find the project you are interested in,
## here we use SRP009615 as an example
proj_info <- subset(
    human_projects,
    project == "SRP009615" & project_type == "data_sources"
)

## Create a RangedSummarizedExperiment (RSE) object at the gene level
rse_gene_SRP009615 <- create_rse(proj_info)

## Explore that RSE object
rse_gene_SRP009615

## ----"interactive_display", eval = FALSE--------------------------------------
#  ## Note that you can interactively explore the available projects
#  proj_info_interactive <- interactiveDisplayBase::display(human_projects)
#  
#  ## Select a single row, then hit "send". The following code checks this.
#  stopifnot(nrow(proj_info_interactive) == 1)
#  
#  ## Then create the RSE object
#  rse_gene_interactive <- create_rse(proj_info_interactive)

## ----"tranform_counts"--------------------------------------------------------
## Once you have your RSE object, you can transform the raw coverage
## base-pair coverage counts using transform_counts().
## For RPKM, TPM or read outputs, check the details in transform_counts().
assay(rse_gene_SRP009615, "counts") <- transform_counts(rse_gene_SRP009615)

## ----"recountWorkflowFig1", out.width="100%", fig.align="center", fig.cap = "Overview of the data available in recount2 and recount3. Reads (pink boxes) aligned to the reference genome can be used to compute a base-pair coverage curve and identify exon-exon junctions (split reads). Gene and exon count matrices are generated using annotation information providing the gene (green boxes) and exon (blue boxes) coordinates together with the base-level coverage curve. The reads spanning exon-exon junctions (jx) are used to compute a third count matrix that might include unannotated junctions (jx 3 and 4). Without using annotation information, expressed regions (orange box) can be determined from the base-level coverage curve to then construct data-driven count matrices. DOI: < https://doi.org/10.12688/f1000research.12223.1>.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/LieberInstitute/recountWorkflow/master/vignettes/Figure1.png")

## ----recountWorkflowFig2, out.width="100%", fig.align="center", fig.cap = "recount2 and recount3 provide coverage count matrices in RangedSummarizedExperiment (rse) objects. Once the rse object has been downloaded and loaded into R (using `recount3::create_rse()` or `recount2::download_study()`), the feature information is accessed with `rowRanges(rse)` (blue box), the counts with assays(rse) (pink box) and the sample metadata with `colData(rse)` (green box). The sample metadata stored in `colData(rse)` can be expanded with custom code (brown box) matching by a unique sample identifier such as the SRA Run ID. The rse object is inside the purple box and matching data is highlighted in each box. DOI: < https://doi.org/10.12688/f1000research.12223.1>.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/LieberInstitute/recountWorkflow/master/vignettes/Figure2.png")

## ----"human_projects"---------------------------------------------------------
human_projects <- available_projects()
dim(human_projects)
head(human_projects)

## Select a study of interest
project_info <- subset(
    human_projects,
    project == "SRP009615" & project_type == "data_sources"
)
project_info

## ----"gtex_projects"----------------------------------------------------------
subset(human_projects, file_source == "gtex" & project_type == "data_sources")

## ----"select_interactively", eval = FALSE-------------------------------------
#  ## Alternatively, interactively browse the human projects,
#  ## select one, then hit send
#  selected_study <- interactiveDisplayBase::display(human_projects)

## ----"display_proj_info"------------------------------------------------------
project_info[, c("project", "organism", "project_home")]

## ----"annotations"------------------------------------------------------------
annotation_options("human")
annotation_options("mouse")

## ----"rse_gencode_v26"--------------------------------------------------------
## Create a RSE object at the gene level
rse_gene_SRP009615 <- create_rse(project_info)

## Explore the resulting RSE gene object
rse_gene_SRP009615

## ----"metadata_rse"-----------------------------------------------------------
## Information about how this RSE object was made
metadata(rse_gene_SRP009615)

## ----"row_ranges"-------------------------------------------------------------
## Number of genes by number of samples
dim(rse_gene_SRP009615)

## Information about the genes
rowRanges(rse_gene_SRP009615)

## ----"recount3_meta_cols"-----------------------------------------------------
## Sample metadata
recount3_cols <- colnames(colData(rse_gene_SRP009615))

## How many are there?
length(recount3_cols)

## View the first few ones
head(recount3_cols)

## Group them by source
recount3_cols_groups <- table(gsub("\\..*", "", recount3_cols))

## Common sources and number of columns per source
recount3_cols_groups[recount3_cols_groups > 1]

## Unique columns
recount3_cols_groups[recount3_cols_groups == 1]

## ----"explore_all", eval = FALSE----------------------------------------------
#  ## Explore them all
#  recount3_cols

## ----"expand attributes"------------------------------------------------------
rse_gene_SRP009615_expanded <-
    expand_sra_attributes(rse_gene_SRP009615)
colData(rse_gene_SRP009615_expanded)[, ncol(colData(rse_gene_SRP009615)):ncol(colData(rse_gene_SRP009615_expanded))]

## ----"raw_counts"-------------------------------------------------------------
assayNames(rse_gene_SRP009615)

## ----"scaling_counts"---------------------------------------------------------
## Once you have your RSE object, you can transform the raw coverage
## base-pair coverage counts using transform_counts().
## For RPKM, TPM or read outputs, check the details in transform_counts().
assay(rse_gene_SRP009615, "counts") <- transform_counts(rse_gene_SRP009615)

## ----"rse_exon"---------------------------------------------------------------
## Create a RSE object at the exon level
rse_exon_SRP009615 <- create_rse(
    proj_info,
    type = "exon"
)

## Explore the resulting RSE exon object
rse_exon_SRP009615

## Explore the object
dim(rse_exon_SRP009615)

## ----"exon_ranges"------------------------------------------------------------
## Exon annotation information
rowRanges(rse_exon_SRP009615)

## Exon ids are repeated because a given exon can be part of
## more than one transcript
length(unique(rowRanges(rse_exon_SRP009615)$exon_id))

## ----"exon_memory"------------------------------------------------------------
## Check how much memory the gene and exon RSE objects use
pryr::object_size(rse_gene_SRP009615)
pryr::object_size(rse_exon_SRP009615)

## ----"rse_jxn"----------------------------------------------------------------
## Create a RSE object at the exon-exon junction level
rse_jxn_SRP009615 <- create_rse(
    proj_info,
    type = "jxn"
)

## Explore the resulting RSE exon-exon junctions object
rse_jxn_SRP009615
dim(rse_jxn_SRP009615)

## Exon-exon junction information
rowRanges(rse_jxn_SRP009615)

## Memory used
pryr::object_size(rse_jxn_SRP009615)

## ----"BigWigURLs"-------------------------------------------------------------
## BigWig file names
## The full URL is stored in BigWigUrl
basename(head(rse_gene_SRP009615$BigWigURL))

## ----"BiocFileCache"----------------------------------------------------------
## List the URLs of the recount3 data that you have downloaded
recount3_cache_files()

## ----"remove_files", eval = FALSE---------------------------------------------
#  ## Delete the recount3 files from your cache
#  recount3_cache_rm()
#  
#  ## Check that no files are listed
#  recount3_cache_files()

## ----createVignette, eval=FALSE-----------------------------------------------
#  ## Create the vignette
#  library("rmarkdown")
#  system.time(render("recount3-quickstart.Rmd", "BiocStyle::html_document"))
#  
#  ## Extract the R code
#  library("knitr")
#  knit("recount3-quickstart.Rmd", tangle = TRUE)

## ----createVignette2----------------------------------------------------------
## Clean up
file.remove("quickstartRef.bib")

## ----reproduce1, echo=FALSE---------------------------------------------------
## Date the vignette was generated
Sys.time()

## ----reproduce2, echo=FALSE---------------------------------------------------
## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits = 3)

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library("sessioninfo")
options(width = 120)
session_info()

## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE, message = FALSE---------------------------------
## Print bibliography
bibliography()

Try the recount3 package in your browser

Any scripts or data that you put into this service are public.

recount3 documentation built on Feb. 13, 2021, 2 a.m.