R/mergeKallisto.R

Defines functions .extractTranscriptomeFromCall mergeKallisto

Documented in mergeKallisto

#' Merge multiple Kallisto results, yielding a KallistoExperiment.
#'
#' If no transcriptomes are specified (or the ones specified are not found),
#' the resulting KallistoExperiment will have a rowData/rowRanges object 
#' which consists entirely of unannotated transcripts from chromosome "Unknown".
#' If transcriptomes are specified and annotations for those transcriptomes are
#' found, the transcripts described in the annotations will be fully annotated
#' for transcript ID, gene ID, gene name, entrez ID, and transcript biotype, 
#' provided that these fields are supported in the annotation resources.
#'
#' FIXME: automatically determine which transcriptomes were used (in process!)
#'
#' @param outputDirs      character: directories holding Kallisto results (NULL)
#' @param outputPath      character: base path to the outputDirs (default is .)
#' @param covariates      data.frame or DataFrame: per-sample covariates (NULL)
#' @param annotate        boolean: auto-annotate the transcripts? (FALSE)
#' @param collapse        string: collapsing string for indices ("_mergedWith_")
#' @param transcriptomes  string: collapsing string for indices ("_mergedWith_")
#' @param parallel        boolean: try to run the merge in parallel? (TRUE)
#' @param summarize       boolean: summarize bootstraps if found? (FALSE) 
#' @param ...             further arguments for KallistoExperiment constructor
#' @importFrom parallel mclapply
#' @importFrom S4Vectors DataFrame 
#' @export
mergeKallisto <- function(outputDirs=NULL,
                          outputPath=".",
                          covariates=NULL,
                          annotate=FALSE, 
                          collapse="_mergedWith_",
                          transcriptomes=NULL,
                          parallel=TRUE,
                          summarize=FALSE,
                          ...) { 

  
  if (is.null(outputDirs) & is.null(covariates)) {
    stop("At least one of outputDirs or covariates must be non-null to proceed")
  } 

  if (is.null(outputDirs)) {
    if (!"outputDir" %in% names(covariates)) { 
      stop("Your covariates need to have a column $outputDir for each sample")
    } else { 
      outputDirs <- covariates$outputDir
    }
  }
  targets <- paste0(path.expand(outputPath), "/", outputDirs)
  stopifnot(all(targets %in% list.dirs(outputPath)))
  names(targets) <- outputDirs

  if (parallel == TRUE) { 
    res <- mclapply(targets, fetchKallisto, 
                    transcriptomes=transcriptomes,
                    summarize=summarize)
  } else {
    res <- lapply(targets, fetchKallisto, 
                  transcriptomes=transcriptomes,
                  summarize=summarize)
  }
 
  ## check and make sure all the results came from the same Kallisto version,
  kversion <- sapply(res, attr, "kallisto_version")
  if (length(unique(kversion)) > 1) {
    stop("Your runs are from different Kallisto versions! Aborting merge.")
  } else {
    kallistoVersion <- unique(kversion)
  } 

  cnames <- Reduce(base::intersect, lapply(res, colnames))
  names(cnames) <- cnames
  asys <- lapply(cnames, function(j) 
                 do.call(cbind, lapply(res, function(x) x[,j])))
  stopifnot(all(sapply(asys, is, "matrix")))
  stopifnot(identical(colnames(asys[[1]]), colnames(asys[[2]])))
  if (!"est_counts_mad" %in% names(asys)) asys$est_counts_mad <- NULL

  coldat <- DataFrame(ID=outputDirs)
  rownames(coldat) <- coldat$ID

  ktxs <- sapply(res, .extractTranscriptomeFromCall)
  if (length(unique(ktxs)) > 1) {
    print(table(ktxs))
    stop("Your runs are against different transcriptomes! Aborting merge.")
  } 
 
  # infer txomes if not provided 
  if (is.null(transcriptomes)) {
    message("Setting transcriptome automatically from Kallisto call string.")
    transcriptomes <- attr(res[[1]], "transcriptomes")
    if (is.null(transcriptomes)) {
      transcriptomes <- c(unknown=attr(res[[1]], "fastaFiles"))
    }
  } else {
    message("Transcriptomes set from provided argument:")
    message(paste(transcriptomes, collapse=", "))
  } 
  data(unannotatedTranscript, package="arkas")
  rowdat <- rep(unannotatedTranscript, nrow(asys$est_counts))
  names(rowdat) <- rownames(asys$est_counts)
  kexp <- KallistoExperiment(est_counts=asys$est_counts,
                             eff_length=asys$eff_length,
                             est_counts_mad=asys$est_counts_mad,
                             transcriptomes=transcriptomes,
                             kallistoVersion=kallistoVersion,
                             covariates=coldat,
                             features=rowdat,
                             ...)
  colnames(kexp) <- kexp$ID
  if(!is.null(transcriptomes) && annotate == TRUE) {
    feats <- rowRanges(kexp)
    mapped <- annotateFeatures(kexp, level="transcript", what="GRanges") 
    feats[names(mapped)] <- mapped
    rowRanges(kexp) <- feats
  }
  return(kexp)
}

.extractTranscriptomeFromCall <- function(x) extractIndexName(attr(x, "call"))
RamsinghLab/arkas documentation built on March 14, 2021, 11:39 a.m.