#' import plate-seq runs from Kallisto into a SingleCellExperiment
#'
#' Automated (more or less) importation of STORM-seq and SMART-seq[123] data
#' for downstream attempts at merging, index visualization, etc. Assumes that
#' users will quantify against ENSEMBL transcripts instead of ridiculous crap
#' like GENCODE and similar. Default is ERCC spikes, ENSembl transcripts, and
#' if found, intronic quantifications of the same. Everything else gets labeled
#' as 'viruses_and_repeats' since in our transcriptomes, that's what it is. If
#' altExps is set to FALSE, no splitting will be attempted.
#'
#' @param runs a vector of .tsv file paths (usually abundance.tsv) w/names
#' @param what columns to import (default: tpm, est_counts, eff_length)
#' @param altExps split spliced, unspliced, spikes, viruses & repeats? (TRUE)
#' @param asgenes collapse transcripts to genes? (implies altExps & t2g)
#' @param t2g data.frame with tx2gene mappings (required if asgenes)
#' @param BPPARAM optional BiocParallel parameter object for bplapply()
#' @param ... additional parameters to pass to .split_altExps
#'
#' @return A SingleCellExperiment
#'
#'
#' @examples
#'
#' t2g <- read.delim("../hs_mm_spikes_repeats_ens100.trimmed.velo.tx2gene.tsv",
#' sep="\t", row.names=1, head=FALSE)
#' t2g <- t2g[grep("ENS", rownames(t2g)), , drop=FALSE]
#'
#' @import SingleCellExperiment
#' @import BiocParallel
#'
#' @export
import_kallisto_txis <- function(runs,
what=c(tpm="tpm",
counts="est_counts",
eff_length="eff_length"),
altExps=TRUE,
asgenes=FALSE,
t2g=NULL,
BPPARAM=bpparam(),
...) {
stopifnot(!is.null(names(runs)))
message("Processing ", length(runs), " runs.")
message("Importing first run as a prototype to check transcript names.")
proto <- .import_kallisto(runs[1])[, c("est_counts", "eff_length", "tpm")]
# yes this would be faster in C/C++. No, I don't care
message("Importing all ", length(runs), " runs in parallel...")
imported <- bplapply(runs, .import_kallisto, proto=proto, BPPARAM=BPPARAM)
mats <- lapply(what, .extract_columns, imported=imported, BPPARAM=BPPARAM)
message("Constructing SingleCellExperiment...")
sce <- SingleCellExperiment(mats)
if (asgenes) sce <- .collapse_genes(sce, t2g=t2g)
if (altExps) sce <- .split_altExps(sce, ...)
return(sce)
}
# helper fn
.import_kallisto_tsv <- function(x, ...) {
read.table(x, sep="\t", header=TRUE, row.names=1,
colClasses=c(target_id="character",
length="integer",
eff_length="numeric",
est_counts="numeric",
tpm="numeric"))
}
# helper fn
.import_kallisto <- function(x, proto=NULL) {
message("Importing ", x, " ... ", appendLF=FALSE)
res <- .import_kallisto_tsv(x)
message("done.")
if (!is.null(proto)) return(res[rownames(proto), colnames(proto)])
else return(res)
}
# helper fn
.extract_columns <- function(imported, what="tpm", BPPARAM=bpparam()) {
getcol <- function(x) x[, what, drop=FALSE]
message("Extracting ", what, " for each run...")
res <- do.call(cbind, bplapply(imported, getcol, BPPARAM=BPPARAM))
rownames(res) <- rownames(imported[[1]])
colnames(res) <- names(imported)
return(as.matrix(res))
}
# helper fn
.split_altExps <- function(sce,
tx="ENS",
sep="\\-",
unspliced="\\-I",
spikes="ERCC",
t2g=NULL) {
message("Categorizing transcripts...")
txs <- grep(unspliced, invert=TRUE, value=TRUE,
grep(tx, rownames(sce), value=TRUE))
txus <- grep(unspliced, value=TRUE,
grep(tx, rownames(sce), value=TRUE))
txu2tx <- sapply(strsplit(txus, sep), `[`, 1)
names(txu2tx) <- txus
rowData(sce)$type <- "viruses_and_repeats"
rowData(sce[txs, ])$type <- "spliced"
rowData(sce[txus, ])$type <- "unspliced"
rowData(sce[grep(spikes, rownames(sce), value=TRUE),])$type <- "spikes"
splitAltExps(sce, rowData(sce)$type, "spliced")
}
# helper fn; run for spliced and unspliced
.collapse_genes <- function(sce, t2g) {
mappable <- grep("ENS", rownames(sce))
for (asy in assayNames(sce)) {
message("Collapsing ", asy, " from transcripts to genes...")
stop("Not done.")
}
}
# helper fn; quantify biotypes
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.