R/tximport-wrapper.R

Defines functions readTxResults

Documented in readTxResults

## Wrapper for tximport package for reading in transcript quantification data

#' Read transcript quantification data with tximport package
#' 
#' After generating transcript/feature abundance results using kallisto, Salmon,
#' Sailfish or RSEM for a batch of samples, read these abundance values into an
#' \code{SCESet} object.
#'
#' @param samples character vector providing a set of sample names to use for
#' the abundance results.
#' @param files character vector providing a set of filenames containing
#' kallisto abundance results to be read in.
#' @param log list (optional), generated by \code{runKallisto}. If provided, then
#' \code{samples} and \code{files} arguments are ignored.
#' @param type character, the type of software used to generate the abundances.
#' Options are "kallisto", "salmon", "sailfish", "rsem". This argument is passed
#' to \code{\link[tximport]{tximport}}.
#' @param txOut logical, whether the function should just output transcript-level (default TRUE)
#' @param logExprsOffset numeric scalar, providing the offset used when doing
#' log2-transformations of expression data to avoid trying to take logs of zero.
#' Default offset value is \code{1}.
#' @param verbose logical, should function provide output about progress?
#' @param ... optional parameters passed to \code{\link[tximport]{tximport}}. 
#' See documentation for \code{\link[tximport]{tximport}} for options and 
#' details.
#' 
#' @details Note: tximport does not import bootstrap estimates from kallisto, 
#' Salmon, or Sailfish. If you want bootstrap estimates use the 
#' \code{\link{readKallistoResults}} or \code{\link{readSalmonResults}} 
#' functions.
#' 
#' @return an \code{SCESet} object containing the abundance, count and feature
#' length data from the supplied samples.
#' 
#' @importFrom tximport tximport
#' @export
#' @references 
#' Soneson C, Love MI, Robinson MD. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences. F1000Res. 2015;4: 1521.
#' 
#' @examples 
#' \dontrun{
#' ## this example requires installation of the tximportData package from 
#' ## Bioconductor 
#' library(tximportData)
#' dir <- system.file("extdata", package = "tximportData")
#' list.files(dir)
#' samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
#' samples
#' directories <- file.path(dir, "kallisto", samples$run)
#' names(directories) <- paste0("sample", 1:6)
#' files <- file.path(directories, "abundance.tsv")
#' sce_example <- readTxResults(samples = names(directories), 
#' files = files, type = "kallisto")
#' 
#' ## for faster reading of results use the read_tsv function from the readr pkg
#' library(readr)
#' sce_example <- readTxResults(samples = names(directories), 
#' files = files, type = "kallisto", reader = read_tsv)
#' }
#' 
readTxResults <- function(samples = NULL, files = NULL, 
                          log = NULL, type = "kallisto", txOut = TRUE, 
                          logExprsOffset = 1, verbose = TRUE, ...) {
    ## allow for some failed runs
    tx_fail <- rep(FALSE, length(samples))
    
    ## Checks on arguments
    if ( !is.null(log) && verbose ) {
        cat("Using log argument to define samples and results directories.")
        if ( !is.list(log) )
            stop("The kallisto_log argument should be a list returned by runKallisto()")
        samples <- names(log)
        directories <- vapply(log, function(x) {x$output_dir}, character(1))
        logs <- lapply(log, function(x) {x$kallisto_log})
        
        ## Can only check kallisto fail if log provided
        tx_fail <- vapply(logs, function(x) {
            any(grepl("[wW]arning|[eE]rror", x))}, logical(1))
        if ( any(tx_fail) ) {
            warning(paste0("The transcript quantification job failed for the following samples:\n ",
                           paste0(names(logs)[tx_fail], collapse = "\n"),
                           "\n It is recommended that you inspect the log object for these samples."))
        }
        
        
    } else {
        if ( verbose )
            cat("Kallisto log not provided - assuming all runs successful")
        if ( is.null(samples) | is.null(files) )
            stop("If kallisto_log argument is not used, then both samples and files must be provided.")
        if ( length(samples) != length(files) )
            stop("samples and files arguments must be the same length")
        }

    if ( !is.null(log) ) {
        ## define sample names and directories
        samples <- samples[!tx_fail]
        directories <- directories[!tx_fail]
        files <- file.path(directories, "abundance.tsv")
    }
    
    ## Read tx results into results objects
    names(samples) <- NULL # found that named vector can mess things up
    if ( !all(file.exists(files)) )
        stop("Some of the files provided do not exist!")
    txi <- tximport::tximport(files, type = type, txOut = txOut, ...)
    
    ## define pdata and fdata 
    nsamples <- length(samples)
    nfeatures <- nrow(txi$abundance)
    nbootstraps <- rep(0, nsamples)
    
    ## Set up results objects
    pdata <- data.frame(n_features = rep(nfeatures, nsamples),
                        n_bootstraps = nbootstraps)
    rownames(pdata) <- samples
    fdata <- data.frame(feature_id = rownames(txi$abundance),
                        ave_feature_length = rowMeans(txi$length))
    rownames(fdata) <- rownames(txi$abundance)
    est_counts <- txi$counts
    tpm <- txi$abundance
    feature_length <- txi$length
    colnames(est_counts) <- colnames(tpm) <- colnames(feature_length) <- samples
    rownames(est_counts) <- rownames(tpm) <- rownames(feature_length) <- 
        rownames(txi$abundance)

    ## Produce SCESet object
    pdata <- new("AnnotatedDataFrame", pdata)
    fdata <- new("AnnotatedDataFrame", fdata)
    sce_out <- newSCESet(exprsData = log2(tpm + logExprsOffset),
                         phenoData = pdata, featureData = fdata,
                         countData = est_counts,
                         logExprsOffset = logExprsOffset,
                         lowerDetectionLimit = 0)
    tpm(sce_out) <- tpm
    set_exprs(sce_out, "feature_length") <- feature_length
    if ( verbose )
        cat("Using log2(TPM + logExprsOffset) as 'exprs' values in output.")
    ## Return SCESet object
    sce_out
}
dynverse/scaterlegacy documentation built on Feb. 17, 2020, 5:07 a.m.