knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
This vignette aims to guide users through the process of analyzing raw reads obtained from a bulk RNA-Seq experiment using the Isoformic package. We will cover the essential steps from data preprocessing to isoform-level analysis, ensuring that users can leverage the full capabilities of Isoformic for their RNA-Seq data.
By the end of this vignette, you will be able to:
{fishpond}
.{isoformic}
.library(isoformic)
We are going to access the previously quantified data included in the {macrophage}
R package.
Paraphrasing the @swish vignette:
The data was quantified using Salmon [@salmon] 0.12.0 against the Gencode v29 human reference transcripts [@gencode]. For more details and all code used for quantification, refer to the macrophage package vignette.
Importantly,
--numGibbsSamples 20
was used to generate 20 inferential replicates with Salmon's Gibbs sampling procedure. We also recommend to use--gcBias
when running Salmon to protect against common sample-specific biases present in RNA-seq data.
For more details refer to: https://thelovelab.github.io/fishpond/articles/swish.html
# usethis::use_package("macrophage", type = "suggests") if (!rlang::is_installed("macrophage")) { pak::pkg_install("bioc::macrophage") } base_dir <- fs::path_package("macrophage", "extdata") coldata <- readr::read_csv(fs::path(base_dir, "coldata.csv")) head(coldata)
coldata <- coldata[, c(1, 2, 3, 5)] names(coldata) <- c("names", "id", "line", "condition") coldata$files <- fs::path(base_dir, "quants", coldata$names, "quant.sf.gz") all(file.exists(coldata$files))
suppressPackageStartupMessages(library(SummarizedExperiment)) coldata <- coldata[coldata$condition %in% c("naive", "IFNg"), ] coldata$condition <- factor(coldata$condition, levels = c("naive", "IFNg") )
If you used @salmon for the RNA-Seq quantification. The same reference files should ge used to create the reference here.
If GENCODE annotation was used with @salmon, tximeta()
can automatically identify the relevant transcript and gene annotations.
NOTE: If your experimental design do not allow for the use of
{tximeta}
automatically, check how to manually create aLinkedTxome
object for your organism or annotation with?tximeta::makeLinkedTxome()
.
Also check out vignette on how to use a custom annotation: Isoformic Reference Annotation vignette.
# vignette(topic="allelic", package="fishpond") # vignette(topic="swish", package="fishpond") # vignette(topic = "isoformic-intro", package = "isoformic") vignette(topic = "isoformic-annotation", package = "isoformic")
# library(tximeta) se_tx <- tximeta::tximeta( coldata = coldata, type = "salmon", txOut = TRUE, useHub = FALSE ) download_reference( version = "46", reference = "gencode", file_type = "gtf", organism = "human", output_path = base_dir ) download_reference( version = "46", reference = "gencode", file_type = "fasta", organism = "human", output_path = base_dir ) download_reference( version = "46", reference = "gencode", file_type = "gff", organism = "human", output_path = base_dir ) str(se_tx@metadata) mcols(se_tx) se_gene <- tximport::summarizeToGene(se_tx)
gencode_gtf_path <- fs::path("data-raw", "gencode.v46.annotation.gtf.gz") gencode_gff_path <- fs::path("data-raw", "gencode.v46.annotation.gff3.gz") gencode_fasta_path <- fs::path("data-raw", "gencode.v46.transcripts.fa.gz") # pak::pkg_install("bioc::txdbmaker") txdb <- txdbmaker::makeTxDbFromGFF( file = gencode_gtf_path, format = "gtf", organism = "Homo sapiens", dataSource = "localGENCODE", metadata = list( Genome = "GRCh38" ) ) library(rtracklayer) tictoc::tic("rtracklayer::import()") gtf_obj <- rtracklayer::import(gencode_gtf_path) tictoc::toc() tictoc::tic("vroom GTF") gtf_df <- vroom::vroom( gencode_gtf_path, delim = "\t" ) tictoc::toc() rm(gtf_df) tictoc::tic("vroom GFF") gff_df <- vroom::vroom( gencode_gff_path, delim = "\t" ) tictoc::toc() lobstr::obj_sizes(gff_df) rm(gff_df) gtf_df[1, ] lobstr::obj_sizes(gtf_df) gtf_obj gtf_obj$type == "transcript" readr::read_lines_raw(gencode_gtf_path, n_max = 10) lobstr::obj_size(gtf_obj) table(gtf_obj$type) make_tx_to_gene() # readr:: txdbInfo <- metadata(txdb)$value names(txdbInfo) <- metadata(txdb)$name isoformic::make_tx_to_gene() isoformic::prepare_exon_annotation() vroom::vroom(file = gencode_gtf_path, delim = "\t", comment = "#", n_max = 10, col_names = FALSE)
head(annot_gff_df)
head(annot_gtf_df)
rowRanges(se_gene) columns(txdb) AnnotationDbi::select(txdb, keys = keys(txdb), columns = columns(txdb)) select(tx) GenomicFeatures::genes(txdb) GenomicFeatures::transcripts(txdb) GenomicFeatures::features(txdb) AnnotationDbi::saveDb(txdb, file = "data-raw/txdb_gencode_v46.sqlite") AnnotationDbi::loadDb(file = "data-raw/txdb_gencode_v46.sqlite") mae@ExperimentList$transcript |> rowData() |> nrow() # pak::pkg_install("bioc::AnnotationForge") # AnnotationForge::makeOrgPackage()
se_experiment_level(se_gene) se_experiment_level(se_tx)
We will rename our SummarizedExperiment
to y
for the statistical analysis. For speed of the vignette, we subset to the transcripts on chromosome 4.
Note on factor levels: The
swish()
function compares expression level across factors such that log2 fold changes are reported as the non-reference level over the reference level. By default, R will choose a reference level for factors based on alphabetical order, unless levels are explicitly set. It is recommended to set the factors levels, as in the above code chunk, with the reference level coming first in the character vector, so that log2 fold changes correspond to the comparison of interest.
{fishpond}
# library(fishpond) # y <- y[seqnames(y) == "chr4",] se_tx <- run_swish_pairwise(se_tx, contrast_var = "condition") se_gene <- run_swish_pairwise(se_gene, contrast_var = "condition") # number of genes that pass # + `fishpond::labelKeep(se, minCount = 10, minN = 3, x = contrast_var)` names(se_tx@metadata)
Differential expression results can be seen at
metadata(se_tx)$isoformic$dea metadata(se_gene)$isoformic$dea
lobstr::obj_size() se_gene |> S4Vectors::mcols() |> base::as.data.frame() |> tibble::as_tibble(rownames = "row_names") |> lobstr::obj_size() base::as.data.frame() |> tibble::as_tibble(rownames = "row_names") #|> # dplyr::select(row_names, )
S4Vectors::mcols(se_gene) metadata(se_tx) S4Vectors::mcols(se_tx) str(se_tx) sum(mcols(se_tx)$keep) sum(mcols(se_gene)$keep)
S4Vectors::mcols(se_tx)
SummarizedExperiment
objects into MultiAssayExperiment
library(MultiAssayExperiment) isoformic_mae <- create_isoformic_mae_from_se(se_tx, se_gene)
MultiAssayExperiment
metadata(isoformic_mae)[["isoformic"]] class(isoformic_mae) isoformic_mae@ExperimentList$transcript@assays@data$abundance
isoformic()
# isoformic( # method = "fishpond", # annotation = "gencode" # ) experiments(isoformic_mae)$transcript |> rowData() validate_isoformic_mae(isoformic_mae)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.