R/kallisto-wrapper.R

Defines functions runKallisto .call_kallisto readKallistoResultsOneSample readKallistoResults

Documented in readKallistoResults readKallistoResultsOneSample runKallisto

## Wrappers for kallisto quantification of abundance of RNA-seq reads

################################################################################
#' Run kallisto on FASTQ files to quantify feature abundance
#'
#' Run the abundance quantification tool \code{kallisto} on a set of FASTQ
#' files. Requires \code{kallisto} (\url{http://pachterlab.github.io/kallisto/})
#' to be installed and a kallisto feature index must have been generated prior
#' to using this function. See the kallisto website for installation and basic
#' usage instructions.
#'
#' @param targets_file character string giving the path to a tab-delimited text
#' file with either 2 columns (single-end reads) or 3 columns (paired-end reads)
#' that gives the sample names (first column) and FastQ file names (column 2 and
#' if applicable 3). The file is assumed to have column headers, although these
#' are not used.
#' @param transcript_index character string giving the path to the kallisto
#' index to be used for the feature abundance quantification.
#' @param single_end logical, are single-end reads used, or paired-end reads?
#' @param output_prefix character string giving the prefix for the output folder
#' that will contain the kallisto results. The default is \code{"output"} and
#' the sample name (column 1 of \code{targets_file}) is appended (preceded by an
#' underscore).
#' @param fragment_length scalar integer or numeric giving the estimated
#' average fragment length. Required argument if \code{single_end} is \code{TRUE},
#' optional if \code{FALSE} (kallisto default for paired-end data is that the
#' value is estimated from the input data).
#' @param fragment_standard_deviation scalar numeric giving the estimated 
#' standard deviation of read fragment length. Required argument if 
#' \code{single_end} is \code{TRUE}, optional if \code{FALSE} (kallisto default 
#' for paired-end data is that the value is estimated from the input data).
#' @param n_cores integer giving the number of cores (nodes/threads) to use for
#' the kallisto jobs. The package \code{parallel} is used. Default is 2 cores.
#' @param n_bootstrap_samples integer giving the number of bootstrap samples
#' that kallisto should use (default is 0). With bootstrap samples, uncertainty
#' in abundance can be quantified.
#' @param bootstrap_seed scalar integer or numeric giving the seed to use for
#' the bootstrap sampling (default used by kallisto is 42). Optional argument.
#' @param correct_bias logical, should kallisto's option to model and correct
#' abundances for sequence specific bias? Requires kallisto version 0.42.2 or
#' higher.
#' @param plaintext logical, if \code{TRUE} then bootstrapping results are
#' returned in a plain text file rather than an HDF5
#' \url{https://www.hdfgroup.org/HDF5/} file.
#' @param kallisto_version character string indicating whether or not the
#' version of kallisto to be used is \code{"pre-0.42.2"} or \code{"current"}.
#' This is required because the kallisto developers changed the output file
#' extensions and added features in version 0.42.2.
#' @param verbose logical, should timings for the run be printed?
#' @param dry_run logical, if \code{TRUE} then a list containing the kallisto
#' commands that would be run and the output directories is returned. Can be
#' used to read in results if kallisto is run outside an R session or to produce
#'  a script to run outside of an R session.
#' @param kallisto_cmd (optional) string giving full command to use to call
#' kallisto, if simply typing "kallisto" at the command line does not give the
#' required version of kallisto or does not work. Default is simply "kalliso".
#' If used, this argument should give the full path to the desired kallisto
#' binary.
#'
#' @details A kallisto transcript index can be built from a FASTA file:
#' \code{kallisto index [arguments] FASTA-file}. See the kallisto documentation
#' for further details.
#'
#' @return A list containing three elements for each sample for which feature
#' abundance has been quantified: (1) \code{kallisto_call}, the call used for
#' kallisto, (2) \code{kallisto_log} the log generated by kallisto, and (3)
#' \code{output_dir} the directory in which the kallisto results can be found.
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel stopCluster
#' @importFrom parallel parLapply
#' @importFrom utils read.delim
#' @export
#' @examples
#' \dontrun{
#' ## If in kallisto's 'test' directory, then try these calls:
#' ## Generate 'targets.txt' file:
#' write.table(data.frame(Sample="sample1", File1="reads_1.fastq.gz", File2="reads_1.fastq.gz"),
#'  file="targets.txt", quote=FALSE, row.names=FALSE, sep="\t")
#' kallisto_log <- runKallisto("targets.txt", "transcripts.idx", single_end=FALSE,
#'          output_prefix="output", verbose=TRUE, n_bootstrap_samples=10,
#'          dry_run = FALSE)
#' }
runKallisto <- function(targets_file, transcript_index, single_end = TRUE,
                        output_prefix = "output", fragment_length = NULL,
                        fragment_standard_deviation = NULL,
                        n_cores = 2, n_bootstrap_samples = 0,
                        bootstrap_seed = NULL, correct_bias = TRUE,
                        plaintext = FALSE, kallisto_version = "current",
                        verbose = TRUE, dry_run = FALSE,
                        kallisto_cmd = "kallisto") {
    targets_dir <- paste0(dirname(targets_file), "/")
    targets <- read.delim(targets_file, stringsAsFactors = FALSE, header = TRUE)
    if ( !(ncol(targets) == 2 || ncol(targets) == 3) )
        stop("Targets file must have either 2 columns (single-end reads) or
             3 columns (paired-end reads). File should be tab-delimited with
             column headers")
    if ( ncol(targets) == 2 && !single_end ) {
        warning("targets only has two columns; proceeding assuming single-end
reads")
        single_end <- TRUE
    }
    if ( ncol(targets) == 3 && single_end ) {
        warning("targets only has three columns, but 'single_end' was TRUE;
proceeding assuming paired-end reads")
        single_end <- FALSE
    }
    samples <- targets[,1]
    ## If we have single-end reads then fragment_length must be defined
    if ( single_end && is.null(fragment_length) )
        stop("If single-end reads are used, then fragment_length must be
defined. Either a scalar giving the average fragment length to use for all
samples, or a vector providing the ave fragment length for each sample.")
    else
        paired_end <- !single_end
    if ( single_end && is.null(fragment_standard_deviation) )
        stop("If single-end reads are used, then fragment_standard_deviation must be defined.")
    else
        paired_end <- !single_end
    if ( correct_bias && kallisto_version == "pre-0.42.2" )
        stop("Bias correction requires kallisto version 0.42.2 or higher.")
    ## Make sure that we'll be able to find the fastq files
    targets[, 2] <- paste0(targets_dir, targets[, 2])
    if ( ncol(targets) == 3 )
        targets[, 3] <- paste0(targets_dir, targets[, 3])
    ## Generate calls to kallisto
    output_dirs <- paste(output_prefix, samples, sep = "_")
    kallisto_args <- paste("quant -i", transcript_index, "-o",
                            output_dirs, "-b", n_bootstrap_samples)
    names(kallisto_args) <- samples
    if ( single_end )
        kallisto_args <- paste(kallisto_args, "--single")
    if ( !is.null(fragment_length) )
        kallisto_args <- paste(kallisto_args, "-l", fragment_length)
    if ( !is.null(fragment_standard_deviation) )
        kallisto_args <- paste(kallisto_args, "-s", fragment_standard_deviation)
    if ( !is.null(bootstrap_seed) )
        kallisto_args <- paste0(kallisto_args, " --seed=", bootstrap_seed)
    if ( correct_bias && kallisto_version != "pre-0.42.2" )
        kallisto_args <- paste0(kallisto_args, " --bias")
    if ( plaintext ) # output bootstrap results in a plaintext file
        kallisto_args <- paste0(kallisto_args, " --plaintext" )
    kallisto_args <- paste(kallisto_args, targets[, 2])
    if (paired_end)
        kallisto_args <- paste(kallisto_args, targets[, 3])
    ##
    if ( dry_run ) {
        kallisto_log <- vector("list", length(samples))
        names(kallisto_log) <- samples
        kallisto_calls <- paste("kallisto", kallisto_args)
        for (i in seq_len(length(kallisto_calls))) {
            kallisto_log[[i]]$output_dir <- output_dirs[i]
            kallisto_log[[i]]$kallisto_call <- kallisto_calls[i]
            kallisto_log[[i]]$kallisto_log <-
                "Dry run: kallisto commands not executed."
        }
    } else {
        if (verbose)
            print(paste("Analysis started: ", Sys.time()))
        cl <- parallel::makeCluster(n_cores)
        # one or more parLapply calls to kallisto
        kallisto_log <- parallel::parLapply(cl, kallisto_args, .call_kallisto,
                                            kallisto_cmd, verbose)
        parallel::stopCluster(cl)
        ## Return log of kallisto jobs, so user knows where to find results
        names(kallisto_log) <- samples
        if (verbose) {
            print(paste("Analysis completed: ", Sys.time()))
            print(paste("Processed", length(samples), "samples"))
        }
        for (i in seq_len(length(kallisto_log))) {
            kallisto_log[[i]]$output_dir <- output_dirs[i]
        }
    }
    kallisto_log
}

.call_kallisto <- function(kcall, kallisto_cmd, verbose=TRUE) {
    out <- tryCatch(ex <- system2(kallisto_cmd, kcall, stdout = TRUE,
                                  stderr = TRUE),
                    warning = function(w){w}, error = function(e){e})
    list(kallisto_call = paste(kallisto_cmd, kcall), kallisto_log = out)
}

### Possible test for kallisto wrapper, but seems to leave tests hanging (not clear why)
# context("kallisto tests on inputs")
#
# test_that("tests for presence of average fragment length if single-end reads", {
#     expect_that(runKallisto("../extdata/targets.txt",
#                             "../extdata/transcripts.idx",
#                             output_prefix="../extdata/output", n_cores=12,
#                             fragment_length=NULL, verbose=TRUE),
#                 gives_warning("targets only has three columns, but 'single_end' was TRUE; proceeding assuming paired-end reads"))
# })
#
#

################################################################################
#' Read kallisto results for a single sample into a list
#'
#' @param directory character string giving the path to the directory containing
#' the kallisto results for the sample.
#' @param read_h5 logical, if \code{TRUE} then read in bootstrap results from
#' the HDF5 object produced by kallisto.
#' @param kallisto_version character string indicating whether or not the
#' version of kallisto to be used is \code{"pre-0.42.2"} or \code{"current"}.
#' This is required because the kallisto developers changed the output file
#' extensions and added features in version 0.42.2.
#'
#' @details The directory is expected to contain results for just a single
#' sample. Putting more than one sample's results in the directory will result
#' in unpredictable behaviour with this function. The function looks for the
#' files (with the default names given by kallisto) 'abundance.txt',
#' 'run_info.json' and (if \code{read_h5=TRUE}) 'abundance/h5'. If these files
#' are missing, or if results files have different names, then this function
#' will not find them.
#'
#' @return A list with two elements: (1) a data.frame \code{abundance} with
#' columns for 'target_id' (feature, transcript, gene etc), 'length' (feature
#' length), 'eff_length' (effective feature length), 'est_counts' (estimated
#' feature counts), 'tpm' (transcripts per million) and possibly many columns
#' containing bootstrap estimated counts; and (2) a list \code{run_info} with
#' details about the kallisto run that generated the results.
#'
#' @importFrom data.table fread
#' @importFrom rhdf5 h5read
#' @importFrom rhdf5 H5close
#' @importFrom rjson fromJSON
#' @export
#' @examples
#' # If kallisto results are in the directory "output", then call:
#' # readKallistoResultsOneSample("output")
readKallistoResultsOneSample <- function(directory, read_h5=FALSE,
                                         kallisto_version = "current") {
    ## Read in abundance information for the sample
    if ( kallisto_version == "pre-0.42.2" )
        file_to_read <- paste0(directory, "/abundance.txt")
    else
        file_to_read <- paste0(directory, "/abundance.tsv")
    if ( file.exists(file_to_read) ) {
        abundance <- data.table::fread(file_to_read, colClasses = c("numeric", "numeric", "numeric", "character", "character"), sep = "\t")
        abundance$est_counts <- as.numeric(abundance$est_counts)
        abundance$tpm <- as.numeric(abundance$tpm)
        abundance$eff_length <- as.numeric(abundance$eff_length)
    } else
        stop(paste("File", file_to_read, "not found or does not exist. Please check directory is correct."))
    ## Read in run information
    run_info <- rjson::fromJSON(file = paste0(directory, "/run_info.json"))
    ## Read in HDF5 data file with bootstrap results
    if ( read_h5 ) {
        file_to_read <- paste0(directory, "/abundance.h5")
        if ( file.exists(file_to_read) ) {
            h5 <- rhdf5::h5read(file_to_read, "/")
            rhdf5::H5close()
        } else {
            stop(paste("File", file_to_read, "not found or does not exist. Please check directory is correct and that you want to read results in HDF5 format."))
        }
        if ( h5$aux$num_bootstrap > 0 ) {
            boot_mat <- data.frame(matrix(unlist(h5$bootstrap),
                                          nrow = length(h5$est_counts),
                                          ncol = h5$aux$num_bootstrap))
            colnames(boot_mat) <- names(h5$bootstrap)
            abundance <- cbind(abundance, boot_mat)
        }
    }
    list(abundance = abundance, run_info = run_info)
}


################################################################################
#' Read kallisto results from a batch of jobs
#'
#' After generating transcript/feature abundance results using kallisto for a
#' batch of samples, read these abundance values into an \code{SCESet} object.
#'
#' @param kallisto_log list, generated by \code{runKallisto}. If provided, then
#' \code{samples} and \code{directories} arguments are ignored.
#' @param samples character vector providing a set of sample names to use for
#' the abundance results.
#' @param directories character vector providing a set of directories containing
#' kallisto abundance results to be read in.
#' @param read_h5 logical, should the bootstrap results be read in from the HDF5
#' objects produced by kallisto?
#' @param kallisto_version character string indicating whether or not the
#' version of kallisto to be used is \code{"pre-0.42.2"} or \code{"current"}.
#' This is required because the kallisto developers changed the output file
#' extensions and added features in version 0.42.2.
#' @param logExprsOffset numeric scalar, providing the offset used when doing
#' log2-transformations of expression data to avoid trying to take logs of zero.
#' Default offset value is \code{1}.
#' @param verbose logical, should function provide output about progress?
#'
#' @details This function expects to find only one set of kallisto abundance
#' results per directory; multiple adundance results in a given directory will
#' be problematic.
#'
#' @return an SCESet object
#'
#' @export
#' @examples
#' \dontrun{
#' kallisto_log <- runKallisto("targets.txt", "transcripts.idx", single_end=FALSE,
#'          output_prefix="output", verbose=TRUE, n_bootstrap_samples=10)
#' sceset <- readKallistoResults(kallisto_log)
#' }
#'
readKallistoResults <- function(kallisto_log = NULL, samples = NULL,
                                directories = NULL, read_h5 = FALSE,
                                kallisto_version = "current",
                                logExprsOffset = 1,
                                verbose = TRUE) {
    kallisto_fail <- rep(FALSE, length(samples))

    ## Checks on arguments
    if ( !is.null(kallisto_log) ) {
        cat("Using kallisto_log argument to define samples and results directories.")
        if ( !is.list(kallisto_log) )
            stop("The kallisto_log argument should be a list returned by runKallisto()")
        samples <- names(kallisto_log)
        directories <- sapply(kallisto_log, function(x) {x$output_dir})
        logs <- lapply(kallisto_log, function(x) {x$kallisto_log})

        ## Can only check kallisto fail if log provided
        kallisto_fail <- sapply(logs, function(x) {
          any(grepl("[wW]arning|[eE]rror", x))})
        if ( any(kallisto_fail) ) {
          warning(paste0("The kallisto job failed for the following samples:\n ",
                         paste0(names(logs)[kallisto_fail], collapse = "\n"),
                         "\n It is recommended that you inspect kallisto_log for these samples."))
        }

    } else {
      cat("Kallisto log not provided - assuming all runs successful")
        if ( is.null(samples) | is.null(directories) )
            stop("If kallisto_log argument is not used, then both samples and directories must be provided.")
        if ( length(samples) != length(directories) )
            stop("samples and directories arguments must be the same length")
    }


    samples <- samples[!kallisto_fail]
    directories <- directories[!kallisto_fail]

    if ( !all(dir.exists(directories)) )
        stop("Some of the desired directories to import do not exist!")
    
    ## Read first file to get size of feature set
    s1 <- readKallistoResultsOneSample(directories[1], read_h5 = read_h5,
                                       kallisto_version = kallisto_version)
    nsamples <- length(samples)
    nfeatures <- nrow(s1$abundance)
    nbootstraps <- s1$run_info$n_bootstraps
    navec_samples <- rep(NA, nsamples)

    ## Set up results objects
    pdata <- data.frame(n_targets = navec_samples, n_bootstraps = navec_samples,
                        kallisto_version = navec_samples,
                        index_version = navec_samples, start_time = navec_samples,
                        call = navec_samples)
    rownames(pdata) <- samples
    fdata <- data.frame(feature_id = s1$abundance$target_id,
                        feature_length = s1$abundance$length,
                        feature_eff_length = s1$abundance$eff_length)
    rownames(fdata) <- s1$abundance$target_id
    est_counts <- tpm <- feat_eff_len <- 
        matrix(NA, nrow = nfeatures, ncol = nsamples)
    colnames(est_counts) <- colnames(tpm) <- colnames(feat_eff_len) <- samples
    rownames(est_counts) <- rownames(tpm) <- rownames(feat_eff_len) <- 
        s1$abundance$target_id
    
    if ( read_h5 ) {
        bootstraps <- array(NA, dim = c(nfeatures, nsamples, nbootstraps))
        rownames(bootstraps) <- s1$abundance$target_id
        colnames(bootstraps) <- samples
    }

    ## Read kallisto results into results objects
    if ( verbose )
        cat(paste("\nReading results for", nsamples, "samples:\n"))
    for (i in seq_len(nsamples)) {
        tmp_samp <- readKallistoResultsOneSample(
            directories[i], read_h5 = read_h5,
            kallisto_version = kallisto_version)
        ## counts
        if ( length(tmp_samp$abundance$est_counts) != nfeatures )
            warning(paste("Results for directory", directories[i], "do not match dimensions of other samples."))
        else
            est_counts[,i] <- tmp_samp$abundance$est_counts
        ## tpm
        if ( length(tmp_samp$abundance$est_counts) == nfeatures )
            tpm[,i] <- tmp_samp$abundance$tpm
        if ( length(tmp_samp$abundance$est_counts) == nfeatures )
            tpm[,i] <- tmp_samp$abundance$tpm
        ## feature effective length
        if ( length(tmp_samp$abundance$eff_length) == nfeatures )
            feat_eff_len[, i] <- tmp_samp$abundance$eff_length
        
        ## run info
        pdata$n_targets[i] <- tmp_samp$run_info$n_targets
        pdata$n_processed[i] <- tmp_samp$run_info$n_processed
        pdata$n_bootstraps[i] <- tmp_samp$run_info$n_bootstraps
        pdata$kallisto_version[i] <- tmp_samp$run_info$kallisto_version
        pdata$index_version[i] <- tmp_samp$run_info$index_version
        pdata$start_time[i] <- tmp_samp$run_info$start_time
        pdata$call[i] <- tmp_samp$run_info$call
        ## bootstraps
        if ( read_h5 )
            bootstraps[,i,] <- as.matrix(tmp_samp$abundance[,-c(1:5)])
        if ( verbose ) {
            cat(".")
            if ( i %% 80 == 0)
                cat("\n")
        }
    }
    ## Add median feature effective length to fData
    fdata$median_effective_length <- matrixStats::rowMedians(feat_eff_len)
    
    if ( verbose )
        cat("\n")
    ## Produce SCESet object
    pdata <- new("AnnotatedDataFrame", pdata)
    fdata <- new("AnnotatedDataFrame", fdata)
    sce_out <- newSCESet(exprsData = log2(tpm + logExprsOffset),
                         phenoData = pdata, featureData = fdata,
                         countData = est_counts,
                         logExprsOffset = logExprsOffset,
                         lowerDetectionLimit = 0)
    tpm(sce_out) <- tpm
    set_exprs(sce_out, "feature_effective_length") <- feat_eff_len
    if ( verbose )
        cat("Using log2(TPM + 1) as 'exprs' values in output.")
    if ( read_h5 )
        bootstraps(sce_out) <- bootstraps
    ## Return SCESet object
    sce_out
}
dynverse/scaterlegacy documentation built on Feb. 17, 2020, 5:07 a.m.