knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Introduction

This vignette aims to guide users through the process of analyzing raw reads obtained from a bulk RNA-Seq experiment using the Isoformic package. We will cover the essential steps from data preprocessing to isoform-level analysis, ensuring that users can leverage the full capabilities of Isoformic for their RNA-Seq data.

Objectives

By the end of this vignette, you will be able to:

  1. Differential isoform expression analysis with {fishpond}.
  2. Interpret and visualize the results of your analysis with {isoformic}.
library(isoformic)

Isoform-level quantification

We are going to access the previously quantified data included in the {macrophage} R package.

Paraphrasing the @swish vignette:

The data was quantified using Salmon [@salmon] 0.12.0 against the Gencode v29 human reference transcripts [@gencode]. For more details and all code used for quantification, refer to the macrophage package vignette.

Importantly, --numGibbsSamples 20 was used to generate 20 inferential replicates with Salmon's Gibbs sampling procedure. We also recommend to use --gcBias when running Salmon to protect against common sample-specific biases present in RNA-seq data.

For more details refer to: https://thelovelab.github.io/fishpond/articles/swish.html

# usethis::use_package("macrophage", type = "suggests")
if (!rlang::is_installed("macrophage")) {
  pak::pkg_install("bioc::macrophage")
}
base_dir <- fs::path_package("macrophage", "extdata")
coldata <- readr::read_csv(fs::path(base_dir, "coldata.csv"))
head(coldata)
coldata <- coldata[, c(1, 2, 3, 5)]
names(coldata) <- c("names", "id", "line", "condition")
coldata$files <- fs::path(base_dir, "quants", coldata$names, "quant.sf.gz")
all(file.exists(coldata$files))
suppressPackageStartupMessages(library(SummarizedExperiment))
coldata <- coldata[coldata$condition %in% c("naive", "IFNg"), ]
coldata$condition <- factor(coldata$condition,
  levels = c("naive", "IFNg")
)

Preparing the annotation

If you used @salmon for the RNA-Seq quantification. The same reference files should ge used to create the reference here.

If GENCODE annotation was used with @salmon, tximeta() can automatically identify the relevant transcript and gene annotations.

NOTE: If your experimental design do not allow for the use of {tximeta} automatically, check how to manually create a LinkedTxome object for your organism or annotation with ?tximeta::makeLinkedTxome().

Also check out vignette on how to use a custom annotation: Isoformic Reference Annotation vignette.

# vignette(topic="allelic", package="fishpond")
# vignette(topic="swish", package="fishpond")

# vignette(topic = "isoformic-intro", package = "isoformic")
vignette(topic = "isoformic-annotation", package = "isoformic")
# library(tximeta)
se_tx <- tximeta::tximeta(
  coldata = coldata,
  type = "salmon",
  txOut = TRUE,
  useHub = FALSE
)

download_reference(
  version = "46",
  reference = "gencode",
  file_type = "gtf",
  organism = "human",
  output_path = base_dir
)
download_reference(
  version = "46",
  reference = "gencode",
  file_type = "fasta",
  organism = "human",
  output_path = base_dir
)
download_reference(
  version = "46",
  reference = "gencode",
  file_type = "gff",
  organism = "human",
  output_path = base_dir
)

str(se_tx@metadata)
mcols(se_tx)

se_gene <- tximport::summarizeToGene(se_tx)
gencode_gtf_path <- fs::path("data-raw", "gencode.v46.annotation.gtf.gz")
gencode_gff_path <- fs::path("data-raw", "gencode.v46.annotation.gff3.gz")
gencode_fasta_path <- fs::path("data-raw", "gencode.v46.transcripts.fa.gz")
# pak::pkg_install("bioc::txdbmaker")
txdb <- txdbmaker::makeTxDbFromGFF(
  file = gencode_gtf_path,
  format = "gtf",
  organism = "Homo sapiens",
  dataSource = "localGENCODE",
  metadata = list(
    Genome = "GRCh38"
  )
)

library(rtracklayer)
tictoc::tic("rtracklayer::import()")
gtf_obj <- rtracklayer::import(gencode_gtf_path)
tictoc::toc()
tictoc::tic("vroom GTF")
gtf_df <- vroom::vroom(
  gencode_gtf_path,
  delim = "\t"
)
tictoc::toc()
rm(gtf_df)
tictoc::tic("vroom GFF")
gff_df <- vroom::vroom(
  gencode_gff_path,
  delim = "\t"
)
tictoc::toc()
lobstr::obj_sizes(gff_df)
rm(gff_df)
gtf_df[1, ]

lobstr::obj_sizes(gtf_df)
gtf_obj
gtf_obj$type == "transcript"

readr::read_lines_raw(gencode_gtf_path, n_max = 10)

lobstr::obj_size(gtf_obj)
table(gtf_obj$type)

make_tx_to_gene()

# readr::
txdbInfo <- metadata(txdb)$value
names(txdbInfo) <- metadata(txdb)$name
isoformic::make_tx_to_gene()

isoformic::prepare_exon_annotation()

vroom::vroom(file = gencode_gtf_path, delim = "\t", comment = "#", n_max = 10, col_names = FALSE)
head(annot_gff_df)
head(annot_gtf_df)
rowRanges(se_gene)
columns(txdb)

AnnotationDbi::select(txdb, keys = keys(txdb), columns = columns(txdb))
select(tx)
GenomicFeatures::genes(txdb)
GenomicFeatures::transcripts(txdb)
GenomicFeatures::features(txdb)

AnnotationDbi::saveDb(txdb, file = "data-raw/txdb_gencode_v46.sqlite")

AnnotationDbi::loadDb(file = "data-raw/txdb_gencode_v46.sqlite")
mae@ExperimentList$transcript |>
  rowData() |>
  nrow()
# pak::pkg_install("bioc::AnnotationForge")
# AnnotationForge::makeOrgPackage()
se_experiment_level(se_gene)
se_experiment_level(se_tx)

We will rename our SummarizedExperiment to y for the statistical analysis. For speed of the vignette, we subset to the transcripts on chromosome 4.

Note on factor levels: The swish() function compares expression level across factors such that log2 fold changes are reported as the non-reference level over the reference level. By default, R will choose a reference level for factors based on alphabetical order, unless levels are explicitly set. It is recommended to set the factors levels, as in the above code chunk, with the reference level coming first in the character vector, so that log2 fold changes correspond to the comparison of interest.

Differential transcript expression using {fishpond}

# library(fishpond)
# y <- y[seqnames(y) == "chr4",]
se_tx <- run_swish_pairwise(se_tx, contrast_var = "condition")
se_gene <- run_swish_pairwise(se_gene, contrast_var = "condition")

# number of genes that pass
# + `fishpond::labelKeep(se, minCount = 10, minN = 3, x = contrast_var)`
names(se_tx@metadata)

Differential expression results can be seen at

metadata(se_tx)$isoformic$dea
metadata(se_gene)$isoformic$dea
lobstr::obj_size()

se_gene |>
  S4Vectors::mcols() |>
  base::as.data.frame() |>
  tibble::as_tibble(rownames = "row_names") |>
  lobstr::obj_size()

base::as.data.frame() |>
  tibble::as_tibble(rownames = "row_names") #|>
# dplyr::select(row_names, )
S4Vectors::mcols(se_gene)

metadata(se_tx)

S4Vectors::mcols(se_tx)
str(se_tx)

sum(mcols(se_tx)$keep)
sum(mcols(se_gene)$keep)
S4Vectors::mcols(se_tx)

Merging SummarizedExperiment objects into MultiAssayExperiment

library(MultiAssayExperiment)
isoformic_mae <- create_isoformic_mae_from_se(se_tx, se_gene)

Exploring the Isoformic MultiAssayExperiment

metadata(isoformic_mae)[["isoformic"]]

class(isoformic_mae)
isoformic_mae@ExperimentList$transcript@assays@data$abundance

Running isoformic()

# isoformic(
#   method = "fishpond",
#   annotation = "gencode"
# )

experiments(isoformic_mae)$transcript |>
  rowData()
validate_isoformic_mae(isoformic_mae)

Session information

sessionInfo()

References



luciorq/txomics documentation built on Dec. 23, 2024, 10:36 a.m.