#' Import Salmon quant.sf files to SummarizedExperiment
#'
#' Import Salmon quant.sf files to SummarizedExperiment
#'
#' This function is intended to automate the process of importing
#' a series of `quant.sf` files, then generating `SummarizedExperiment`
#' objects at the transcript and gene level. It optionally includes
#' sample annotation provided as a `data.frame` in argument `curation_txt`.
#' It also includes transcript and gene annotations through either
#' `data.frame` from argument `tx2gene`, or it derives `tx2gene`
#' from a GTF file from argument `gtf`. The GTF file option then calls
#' `splicejam::makeTx2geneFromGtf()`.
#'
#' This function can optionally process data that includes full length
#' gene body regions, annotated with `"gene_body"`. This option is specific
#' for Salmon quantitation where the transcripts include full length
#' gene body for multi-exon genes, for example to measure unspliced
#' transcript abundance.
#'
#' * `import_types="gene"` summarizes only the proper transcripts,
#' excluding `"gene_body"` entries.
#' * `import_types="gene_body"` summarizes all transcript
#' and full gene entries into one summary transcript abundance.
#' * `import_types="gene_tx"` summarizes proper transcript to gene level,
#' and separately represents `"gene_body"` entries for comparison.
#'
#' The current recommendation is to use default values for `import_types`
#' which imports all the following types of data:
#' 1. `TxSE` as transcripts per row
#' 2. `GeneSE` as genes per row, excluding unspliced `gene_body` transcripts.
#' 3. `GeneBodySE` as genes per row, summarizing unspliced
#' and spliced `"gene_body"` transcripts together for each gene.
#' 4. `GeneTxSE` as spliced and unspliced genes per row, so that
#' unspliced, spliced, or both can be analyzed together.
#'
#' We typically use `GeneBodySE` and define a subset of `rownames(GeneBodySE)`
#' to use only spliced transcripts during analysis. Optionally we may
#' run `limma::diffSplice()` to compare the unspliced:spliced ratio
#' across experiment groups.
#'
#' @return `list` with `SummarizedExperiment` objects, each of which
#' contain assay names `c("counts", "abundance", "length)`, where
#' `c("counts", "abundance")` are transformed with `log2(1 + x)`.
#' The transform can be reversed with `10^x - 1`.
#' The `SummarizedExperiment` objects by name:
#' * `"TxSE"`: transcript-level values imported from `quant.sf`.
#' * `"GeneSE"`: gene-level summary values, excluding
#' `"gene_body"` entries.
#' * `"GeneBodySE"`: gene-level summary values, summarizing unspliced
#' and spliced `"gene_body"` transcripts together for each gene.
#' * `"GeneTxSE"`: gene-level summary values, where transcripts are
#' combined to gene level, and `"gene_body"` entries are represented
#' separately, with suffix `"_gene_body"` added to the gene name.
#'
#' @family jam import functions
#'
#' @param salmonOut_paths `character` vectors to each individual folder
#' that contains the `"quant.sf"` output file for Salmon.
#' @param import_types `character` indicating which type or types of
#' data to return. Note that the distinction between `gene` and
#' `gene_body` is only relevant when there are transcript entries
#' defined with `transcript_type="gene_body"`. These entries specifically
#' represent unspliced transcribed regions for a gene locus, and
#' only for multi-exon genes.
#' * `tx`: transcript quantitation, direct import of `quant.sf` files.
#' * `gene`: gene quantitation after calling `tximport::summarizeToGene()`,
#' excluding `transcript_type="gene_body"`.
#' * `gene_body`: gene quantitation after calling `tximport::summarizeToGene()`,
#' including `transcript_type="gene_body"`.
#' @param tx2gene `character` path to file, or `data.frame` with at
#' least two columns matching `tx_colname` and `gene_colname` below.
#' When supplied, the `gtf` argument is ignored, unless the file
#' path is not accessible, or the data is not `data.frame`.
#' @param gtf `character` path to a GTF file, used only when `tx2gene`
#' is not supplied. When used, `splicejam::makeTx2geneFromGtf()` is
#' called to create a `data.frame` object `tx2gene`.
#' @param curation_txt `data.frame` whose first column should match the
#' sample column headers found in the PD abundance columns, and
#' subsequent columns contain associated sample annotations.
#' If `curation_txt` is not supplied, then values will be split into
#' columns by `_` underscore or `" "` whitespace characters.
#' @param tx_colname,gene_colname `character` strings indicating colnames
#' in `tx2gene` that should be used.
#' * `tx_colname` represents unique identifier for each transcript,
#' usually `"transcript_id"`.
#' * `gene_colname` represents a gene label associated with gene
#' summarized expression values, typically `"gene_name"`.
#' @param geneFeatureType,txFeatureType `character` arguments passed to
#' `splicejam::makeTx2geneFromGtf()` only when supplying argument
#' `gtf` with a path to a GTF file.
#' @param countsFromAbundance `character` string passed to
#' `tximport::summarizeToGene()` to define the method for calculating
#' abundance.
#' @param gene_body_ids `character` optional vector with specific row
#' identifiers that should be considered `transcript_type="gene_body"`
#' entries, relevant to argument `import_types` above. When `gene_body_ids`
#' is defined, these entries are used directly without using `tx2gene`.
#' When `gene_body_ids` is not defined, `tx2gene$transcript_type` is used
#' if present. If that column is not present, or does not contain any
#' entries with `"gene_body"`, then all transcripts are used for
#' `import_types="gene"`, and `import_types="gene_body"` is not valid
#' and therefore is not returned.
#' @param trim_tx_from,trim_tx_to `character` vector with one or more
#' regular expression patterns used to curate the values in `tx_colname`
#' prior to assigning them as `rownames()`, and back to `tx_colname`.
#' These values are joined to `tx2gene[[tx_colname]]` to assign
#' additional gene annotations.
#' * The default as of version 0.0.79.900 is to leave strand information
#' `"(-)"` and `"(+)"` without removing it, since in some rare cases a
#' gene's unspliced transcripts can be present on two strands.
#' * The previous default (version <= 0.0.78.900) was to remove
#' `"(-)"` and `"(+)"` from the transcript_id `tx_colname` column.
#' @param verbose `logical` indicating whether to print verbose output.
#' @param ... additional arguments are passed to supporting functions.
#'
#' @export
import_salmon_quant <- function
(salmonOut_paths,
import_types=c("tx",
"gene",
"gene_body",
"gene_tx"),
gtf=NULL,
tx2gene=NULL,
curation_txt=NULL,
tx_colname="transcript_id",
gene_colname="gene_name",
gene_body_colname="transcript_type",
geneFeatureType="exon",
txFeatureType="exon",
countsFromAbundance="lengthScaledTPM",
gene_body_ids=NULL,
trim_tx_from=NULL,#c("[(][-+][)]"),
trim_tx_to=NULL,
verbose=FALSE,
...)
{
#
import_types <- match.arg(import_types,
several.ok=TRUE);
if (!jamba::check_pkg_installed("SummarizedExperiment")) {
stop("The Bioconductor package SummarizedExperiment is required.")
}
# tx2gene
if (length(tx2gene) > 0) {
if (length(tx2gene) == 1 && file.exists(tx2gene)) {
tx2gene <- data.table::fread(tx2gene,
data.table=FALSE);
} else if (!"data.frame" %in% class(tx2gene)) {
warning(paste("tx2gene should be a file path, or data.frame.",
"Ignoring tx2gene."));
tx2gene <- NULL;
}
}
# gtf
if (length(tx2gene) == 0) {
if (length(gtf) == 1 && file.exists(gtf)) {
# create tx2gene data.frame
gtf_tx2gene <- gsub("[.]gtf(|[.].*)$",
".tx2gene.txt",
ignore.case=TRUE,
gtf);
if (file.exists(gtf_tx2gene)) {
tx2gene <- data.table::fread(gtf_tx2gene,
data.table=FALSE);
rownames(tx2gene) <- tx2gene$transcript_id;
} else {
tx2gene <- splicejam::makeTx2geneFromGtf(gtf,
geneFeatureType=geneFeatureType,
txFeatureType=txFeatureType,
...);
rownames(tx2gene) <- tx2gene[[tx_colname]];
# save for re-use
tryCatch({
data.table::fwrite(x=tx2gene,
file=gtf_tx2gene,
sep="\t",
row.names=FALSE);
}, error=function(e){
warning(paste("Could not write tx2gene to file for re-use: ",
gtf_tx2gene));
});
}
} else {
stop(paste("Input GTF must be a single accessible filename."));
}
}
# tximport
if (length(names(salmonOut_paths)) == 0) {
names(salmonOut_paths) <- jamba::makeNames(
gsub("_salmonOut$",
"",
basename(salmonOut_paths)));
}
salmon_files <- file.path(salmonOut_paths, "quant.sf")
names(salmon_files) <- names(salmonOut_paths);
if (any(!file.exists(salmon_files))) {
warning(paste("Note some files were not accessible: ",
jamba::cPaste(salmon_files[!file.exists(salmon_files)], sep=", ")));
salmon_files <- salmon_files[file.exists(salmon_files)];
}
if (length(salmon_files) == 0) {
stop("No salmon quant.sf files were accessible.");
}
# Import transcript quant.sf
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"importing tx data: ",
"TxSE");
if (verbose > 1) {
print(data.frame(salmon_files));
}
}
txiTx <- tximport::tximport(salmon_files,
type="salmon",
importer=data.table::fread,
countsFromAbundance=countsFromAbundance,
txOut=TRUE);
isamples <- colnames(txiTx[[1]]);
# optionally curate transcript rownames
if (length(trim_tx_from) > 0) {
if (length(trim_tx_to) == 0) {
trim_tx_to <- "";
}
trim_tx_to <- rep(trim_tx_to,
length.out=length(trim_tx_from));
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"trimming rownames using: ",
c("trim_tx_from", "trim_tx_to"));
}
for (itype in c("counts", "abundance", "length")) {
for (iseq in seq_along(trim_tx_from)) {
rownames(txiTx[[itype]]) <- gsub(trim_tx_from[[iseq]],
trim_tx_to[[iseq]],
rownames(txiTx[[itype]]))
}
}
}
rownames(tx2gene) <- tx2gene[[tx_colname]];
# import sample annotations
if (length(curation_txt) > 0) {
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"applying sample annotations via curation_txt.");
}
# sample_df
sample_df <- curate_to_df_by_pattern(
isamples,
df=curation_txt,
...);
isamples1 <- rownames(sample_df);
if (!all(isamples1 %in% isamples)) {
filename_colname <- head(jamba::vigrep("^filename$",
colnames(sample_df)),
1);
isamples_match <- match(sample_df[[filename_colname]],
isamples);
isamples_from <- isamples[isamples_match];
isamples_to <- isamples1;
if (verbose > 1) {
jamba::printDebug("import_salmon_quant(): ",
"renaming sample colnames to match sample_df:");
print(data.frame(isamples_from,
isamples_to));
}
for (i in names(txiTx)) {
if ("matrix" %in% class(txiTx[[i]])) {
txiTx[[i]] <- jamba::renameColumn(txiTx[[i]],
from=isamples_from,
to=isamples_to);
}
}
}
isamples <- isamples1;
if (verbose) {
if (verbose > 1) {
jamba::printDebug("import_salmon_quant(): ",
"sample_df:");
print(sample_df);
}
}
} else {
# create default
sample_df <- data.frame(check.names=FALSE,
stringsAsFactors=FALSE,
sample_id=isamples);
rownames(sample_df) <- isamples;
}
ret_list <- list();
if ("tx" %in% import_types) {
# rowData for transcripts
# table(rownames(txiTx[[1]]) %in% tx2gene$transcript_id)
rowData_tx <- tx2gene[match(rownames(txiTx[[1]]), tx2gene[[tx_colname]]),,drop=FALSE];
# TODO: debug workarounds when this step fails,
# usually occurs when rownames do not match tx2gene, which is
# often the version numbers ENST000012.1_1
rownames(rowData_tx) <- rowData_tx[[tx_colname]];
# Create tx SummarizedExperiment
TxSE <- SummarizedExperiment(
assays=list(
counts=log2(1+txiTx$counts)[,isamples, drop=FALSE],
abundance=log2(1+txiTx$abundance)[,isamples, drop=FALSE],
length=txiTx$length[,isamples, drop=FALSE]
),
rowData=rowData_tx,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiTx$countsFromAbundance)
);
ret_list$TxSE <- TxSE;
}
# optionally define gene_body_ids
if (any(c("gene", "gene_body", "gene_tx") %in% import_types)) {
if (length(gene_body_ids) == 0) {
gene_body_colname <- head(intersect(gene_body_colname,
colnames(tx2gene)), 1)
if (length(gene_body_colname) > 0) {
gene_body_ids <- subset(tx2gene, tx2gene[[gene_body_colname]] %in% "gene_body")[[tx_colname]];
}
}
gene_body_ids <- intersect(gene_body_ids,
rownames(txiTx$counts));
} else {
gene_body_ids <- NULL
}
# remove gene_body entries from transcript rows used here
tx_ids <- setdiff(rownames(txiTx$counts), gene_body_ids)
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"length(tx_ids): ",
jamba::formatInt(length(tx_ids)))
jamba::printDebug("import_salmon_quant(): ",
"length(gene_body_ids): ",
jamba::formatInt(length(gene_body_ids)))
}
if ("gene" %in% import_types) {
if (length(tx_ids) == 0) {
jamba::printDebug("import_salmon_quant(): ",
"no tx_ids exist for import_types='gene'")
} else {
# summarize transcript to gene level
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"summarizing tx data to gene level: ",
"GeneSE");
}
txiTx_use <- lapply(txiTx[c("abundance", "counts", "length")], function(i){
i[match(tx_ids, rownames(i)), , drop=FALSE]
});
tx2gene_use <- tx2gene[match(tx_ids, tx2gene[[tx_colname]]),
c(tx_colname, gene_colname), drop=FALSE]
txiGene <- tximport::summarizeToGene(
txiTx_use,
countsFromAbundance=countsFromAbundance,
tx2gene=tx2gene_use);
igenes <- rownames(txiGene[[1]]);
# create gene SummarizedExperiment
# rowData for genes
gene_colnames <- setdiff(c(gene_colname,
jamba::unvigrep("transcript|tx", colnames(tx2gene))),
tx_colname);
rowData_gene <- tx2gene[match(igenes, tx2gene[[gene_colname]]),
gene_colnames, drop=FALSE];
rownames(rowData_gene) <- igenes;
GeneSE <- SummarizedExperiment::SummarizedExperiment(
assays=list(
counts=log2(1+txiGene$counts)[, isamples, drop=FALSE],
abundance=log2(1+txiGene$abundance)[, isamples, drop=FALSE],
length=txiGene$length[, isamples, drop=FALSE]
),
rowData=rowData_gene,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiGene$countsFromAbundance)
);
ret_list$GeneSE <- GeneSE;
}
}
if ("gene_body" %in% import_types) {
if (length(gene_body_ids) == 0) {
jamba::printDebug("import_salmon_quant(): ",
"no gene_body_ids exist for import_types='gene_body'")
} else {
# summarize transcript to gene level
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"summarizing tx and gene_body data to gene level: ",
"GeneBodySE");
}
txiGeneBody <- tximport::summarizeToGene(
txiTx,
countsFromAbundance=countsFromAbundance,
tx2gene=tx2gene[,c(tx_colname, gene_colname), drop=FALSE]);
igenes <- rownames(txiGeneBody[[1]]);
# create gene SummarizedExperiment
# rowData for genes
gene_colnames <- setdiff(c(gene_colname,
jamba::unvigrep("transcript|tx", colnames(tx2gene))),
tx_colname);
rowData_gene <- tx2gene[match(igenes, tx2gene[[gene_colname]]),
gene_colnames, drop=FALSE];
rownames(rowData_gene) <- igenes;
GeneBodySE <- SummarizedExperiment::SummarizedExperiment(
assays=list(
counts=log2(1+txiGeneBody$counts)[, isamples, drop=FALSE],
abundance=log2(1+txiGeneBody$abundance)[, isamples, drop=FALSE],
length=txiGeneBody$length[, isamples, drop=FALSE]
),
rowData=rowData_gene,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiGeneBody$countsFromAbundance)
);
ret_list$GeneBodySE <- GeneBodySE;
}
}
if ("gene_tx" %in% import_types) {
if (length(gene_body_ids) == 0) {
jamba::printDebug("import_salmon_quant(): ",
"no gene_body_ids exist for import_types='gene_tx'")
} else {
# summarize transcript to gene level
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"summarizing tx data to gene level, with gene_body data separately: ",
"GeneTxSE");
}
# prepare custom tx2gene
tx2gene_tx <- tx2gene;
tx2gene_tx[match(gene_body_ids, rownames(tx2gene_tx)), gene_colname] <- paste0(
tx2gene_tx[match(gene_body_ids, rownames(tx2gene_tx)), gene_colname],
"_gene_body")
txiGeneTx <- tximport::summarizeToGene(
txiTx,
countsFromAbundance=countsFromAbundance,
tx2gene=tx2gene_tx[,c(tx_colname, gene_colname), drop=FALSE]);
igenes <- rownames(txiGeneTx[[1]]);
# create gene SummarizedExperiment
# rowData for genes
gene_colnames <- setdiff(c(gene_colname,
jamba::unvigrep("transcript|tx", colnames(tx2gene_tx))),
tx_colname);
rowData_gene <- tx2gene_tx[match(igenes, tx2gene_tx[[gene_colname]]),
gene_colnames, drop=FALSE];
rownames(rowData_gene) <- igenes;
rowData_gene$gene_body <- ifelse(grepl("_gene_body$", igenes),
"gene_body",
"transcript");
rowData_gene$base_gene_name <- gsub("_gene_body$", "", igenes)
rowData_gene$has_gene_body <- igenes %in% c(tx2gene[gene_body_ids, gene_colname],
paste0(tx2gene[gene_body_ids, gene_colname], "_gene_body"));
GeneTxSE <- SummarizedExperiment::SummarizedExperiment(
assays=list(
counts=log2(1+txiGeneTx$counts)[, isamples, drop=FALSE],
abundance=log2(1+txiGeneTx$abundance)[, isamples, drop=FALSE],
length=txiGeneTx$length[, isamples, drop=FALSE]
),
rowData=rowData_gene,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiGeneTx$countsFromAbundance)
);
ret_list$GeneTxSE <- GeneTxSE;
}
}
return(ret_list);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.