sleuth: Tools for investigating RNA-Seq

Documented in read_kallisto read_kallisto_h5 read_kallisto_tsv

#
#    sleuth: inspect your RNA-Seq with a pack of kallistos
#
#    Copyright (C) 2015  Harold Pimentel, Nicolas Bray, Pall Melsted, Lior Pachter
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

#' Read kallisto output
#'
#' This is a general driver function to read kallisto output. It can read either
#' H5 or tsv.
#' @param either the kallisto directory name or the file name of a h5 or tsv output from kallisto
#' @param read_bootstrap if \code{TRUE}, bootstraps will be read (h5 only)
#' @param max_bootstrap an integer denoting the number of bootstraps to read.
#' if \code{NULL} read everything available (h5 only)
#' @return a \code{kallisto} object
#' @export
read_kallisto <- function(path, read_bootstrap = TRUE, max_bootstrap = NULL) {
  stopifnot(is(path, "character"))

  kal_path <- get_kallisto_path(path)

  if ( kal_path$ext == "tsv" && read_bootstrap ) {
    warning("You specified to read bootstraps, but we won't do so for plaintext")
  }

  result <- NULL
  if ( kal_path$ext == "h5" ) {
    result <- read_kallisto_h5(kal_path$path, read_bootstrap = read_bootstrap,
      max_bootstrap = max_bootstrap)
  } else {
    result <- read_kallisto_tsv(kal_path$path)
  }

  result
}

#' Read a kallisto object from an HDF5 file
#'
#' Read a kallisto object from an HDF5 file.
#'
#' @param fname the file name for the HDF5 file
#' @param read_bootstrap if \code{TRUE} load bootstraps, otherwise do not
#' @param max_bootstrap an integer denoting the number of bootstraps to read.
#' if \code{NULL} read everything available
#' @return a \code{kallisto} object
#' @export
read_kallisto_h5 <- function(fname, read_bootstrap = TRUE, max_bootstrap = NULL) {
  stopifnot(is(fname, "character"))
  stopifnot( is.null(max_bootstrap) ||
    is(max_bootstrap, "numeric") ||
    is(max_bootstrap, "integer") )

  fname <- path.expand(fname)

  if (!file.exists(fname)) {
    stop("Can't find file: '", fname, "'")
  }

  target_id <- as.character(rhdf5::h5read(fname, "aux/ids"))
  if ( length(target_id) != length(unique(target_id))) {
    tid_counts <- table(target_id)
    warning(
      'Some target_ids in your kallisto index are exactly the same.',
      ' We will make these unique but strongly suggest you change the names',
      ' of the FASTA and recreate the index.',
      ' These are the repeats: ',
      paste(names(tid_counts[which(tid_counts > 1)]), collapse = ', '))
    rm(tid_counts)

    target_id <- make.unique(target_id, sep = '_')
  }

  abund <- adf(target_id = target_id)
  abund$est_counts <- as.numeric(rhdf5::h5read(fname, "est_counts"))
  abund$eff_len <- as.numeric(rhdf5::h5read(fname, "aux/eff_lengths"))
  abund$len <- as.numeric(rhdf5::h5read(fname, "aux/lengths"))

  num_processed <- if ( h5check(fname, '/aux', 'num_processed') ) { # nolint
    as.integer(rhdf5::h5read(fname, 'aux/num_processed'))
  } else {
    NA_integer_
  }

  fld <- if ( h5check(fname, '/aux', 'fld') ) { # nolint
    as.integer(rhdf5::h5read(fname, 'aux/fld'))
  } else {
    NA_integer_
  }

  bias_observed <- NA
  bias_normalized <- NA
  if ( h5check(fname, '/aux', 'bias_observed') ) { # nolint
    bias_observed <- rhdf5::h5read(fname, 'aux/bias_observed')
    bias_normalized <- rhdf5::h5read(fname, 'aux/bias_normalized')
  }

  bs_samples <- list()
  num_bootstrap <- as.integer(rhdf5::h5read(fname, "aux/num_bootstrap"))
  if (!is.null(max_bootstrap) && max_bootstrap < num_bootstrap) {
    num_bs <- max_bootstrap
  } else {
    num_bs <- num_bootstrap
  }
  if (read_bootstrap) {
    if (num_bootstrap > 0) {
      msg("Found ", num_bootstrap, " bootstrap samples")
      if (!is.null(max_bootstrap) && max_bootstrap < num_bootstrap) {
        msg("Only reading ", max_bootstrap, " bootstrap samples")
      }
      bs_samples <- lapply(0:(num_bs[1] - 1),
        function(i) {
          .read_bootstrap_hdf5(fname, i, abund)
        })
    } else {
      msg("No bootstrap samples found")
    }
  }

  abund$tpm <- counts_to_tpm(abund$est_counts, abund$eff_len)

  res <- list(
    abundance = abund,
    bias_normalized = bias_normalized,
    bias_observed = bias_observed,
    bootstrap = bs_samples,
    fld = fld,
    excluded_ids = character()
    )
  class(res) <- 'kallisto'

  attr(res, 'index_version') <- rhdf5::h5read(fname, 'aux/index_version')
  attr(res, 'kallisto_version') <- rhdf5::h5read(fname, 'aux/kallisto_version')
  attr(res, 'start_time') <- rhdf5::h5read(fname, 'aux/start_time')
  attr(res, 'num_targets') <- nrow(abund)
  attr(res, 'original_num_targets') <- nrow(abund)
  attr(res, 'num_mapped') <- sum(abund$est_counts)
  attr(res, 'num_processed') <- num_processed
  attr(res, 'num_bootstrap_found') <- num_bootstrap
  attr(res, 'num_bootstrap_used') <- num_bs
  invisible(res)
}

h5check <- function(fname, group, name) {
  objs <- rhdf5::h5ls(fname)
  objs <- dplyr::rename(objs, grp = group, nm = name)
  objs <- dplyr::filter(objs, grp == group, nm == name)

  nrow(objs) == 1
}

# read a bootstrap from an HDF5 file and return a \code{data.frame}
.read_bootstrap_hdf5 <- function(fname, i, main_est) {
  bs <- adf( target_id = main_est$target_id )
  bs$est_counts <- as.numeric(rhdf5::h5read(fname, paste0("bootstrap/bs", i)))
  bs$tpm <- counts_to_tpm(bs$est_counts, main_est$eff_len)
  bs
}

# @return a matrix with each row being a bootstrap sample
read_bootstrap_mat <- function(fname,
  num_bootstraps,
  num_transcripts,
  est_count_sf) {
  bs_mat <- matrix(ncol = num_bootstraps, nrow = num_transcripts)
  for (i in 1:ncol(bs_mat)) {
    bs_mat[, i] <- rhdf5::h5read(fname, paste0("bootstrap/bs", i - 1)) / est_count_sf
  }
  bs_mat <- t(bs_mat)
  target_id <- as.character(rhdf5::h5read(fname, "aux/ids"))
  colnames(bs_mat) <- target_id

  bs_mat
}

#' Read kallisto plaintext output
#'
#' This function reads kallisto plaintext output. Note, it cannot be used with
#' sleuth. It also does not read bootstraps since reading plaintext bootstraps
#' is quite slow.
#' @param fname the filename for the tsv file
#' @return a \code{kallisto} object (currently missing attributes and
#' bootstraps)
#' @export
read_kallisto_tsv <- function(fname) {
  stopifnot(is(fname, "character"))

  fname <- path.expand(fname)

  if (!file.exists(fname)) {
    stop("Can't find file: '", fname, "'")
  }

  abundance <- suppressWarnings(data.table::fread(fname, data.table = FALSE))
  abundance <- dplyr::rename(abundance,
    len = length,
    eff_len = eff_length)
  abundance <- dplyr::arrange(abundance, target_id)
  abundance$target_id <- as.character(abundance$target_id)

  result <- list(abundance = abundance, bootstrap = NULL)
  class(result) <- 'kallisto'

  invisible(result)
}

# this function takes a path and tries to infer whether it is a h5, tsv, or neither
get_kallisto_path <- function(path) {
  output <- list()

  if ( dir.exists(path) ) {
    if (file.exists(file.path(path, "abundance.h5"))) {
      # standard case where the user has not changed the filename
      output$ext <- "h5"
      output$path <- file.path(path, "abundance.h5")
    } else if ( file.exists(file.path(path, 'abundance.tsv')) ){
      # HDF5 doesn't exist, but we have plaintext
      output$ext <- "tsv"
      output$path <- file.path(path, "abundance.tsv")
    } else {
      stop(path, ' exists, but does not contain kallisto output (abundance.h5)')
    }
  } else if ( file.exists(path) ){
    # make an assumption that the user has kept the correct extension
    base <- basename(path)
    s <- strsplit(base, '\\.')
    ext <- s[[1]][length(s[[1]])]

    if (ext == 'h5') {
      output$ext <- 'h5'
    } else if (ext == 'tsv') {
      output$ext <- 'tsv'
    } else {
      stop("'", path, "' exists, but does not have a recognized extension")
    }
    output$path <- path
  } else {
    stop("'", path, "' does not exist.")
  }

  output
}

#' @export
print.sleuth <- function(obj) {
  cat("\tsleuth object\n")
  cat("\n")
  cat("bears:", length(obj$kal), "\n")
  cat("design:", deparse(obj$full_formula), "\n")
}


# @importFrom dplyr %>%
# @export
kv_vec_to_df <- function(x, cols = c("gene_id", "transcript_id")) {
  stopifnot(length(x) %% 2 == 0)
  `%>%` <- dplyr::`%>%`

  key_idx <- seq(1, length(x), 2)
  val_idx <- key_idx + 1

  vals <- x[val_idx]
  vals <- vals %>%
    gsub('"', "", .) %>%
    gsub(";", "", .)

  res <- setNames(as.list(vals), x[key_idx])
  res[cols]
}

# @importFrom dplyr %>%
# @export
gtf_attributes_to_gene_trans <- function(gtf_attr) {
  stopifnot(is(gtf_attr, "character"))
  `%>%` <- dplyr::`%>%`

  lapply(strsplit(gtf_attr, " "), kv_vec_to_df) %>%
    rbind_all()
}

# @importFrom dplyr %>%
# @export
gtf_gene_names <- function(gtf_attr) {
  stopifnot(is(gtf_attr, "character"))
  all_attr <- strsplit(gtf_attr, " ")
  `%>%` <- dplyr::`%>%`

  gene_id <- vector("character", length(all_attr))
  trans_id <- vector("character", length(all_attr))

  for (i in 1:length(all_attr)) {
    j <- 1
    while ( (nchar(gene_id[i]) < 1 || nchar(trans_id[i]) < 1) &&
      j <= length(all_attr[[i]]) ) {
      if (all_attr[[i]][j] == "gene_id") {
        gene_id[i] <- all_attr[[i]][j + 1] %>%
          gsub('"', "", .) %>%
          sub(";", "", .)
        j <- j + 2
      } else if (all_attr[[i]][j] == "transcript_id") {
        trans_id[i] <- all_attr[[i]][j + 1] %>%
          gsub('"', "", .) %>%
          sub(";", "", .)
        j <- j + 2
      } else {
        j <- j + 1
      }
    }
  }

  data.frame(gene_id = gene_id, transcript_id = trans_id)
}

# @export
read_gtf <- function(fname) {
  gtf <- data.table::fread(fname, sep = "\t", header = FALSE,
    data.table = FALSE)

  gtf_colnames <- c("seqname", "source", "feature", "start", "end", "score",
    "strand", "frame", "attribute")

  data.table::setnames(gtf, colnames(gtf), gtf_colnames)


  gtf
}

# @export
trans_to_genes_from_gtf <- function(fname) {
  trans <- rtracklayer::import(fname)
  trans <- data.frame(GenomicRanges::mcols(trans), stringsAsFactors = FALSE)
  trans <- dplyr::select(trans, transcript_id, gene_id)
  trans <- dplyr::distinct(trans)

  trans
}

#' Write a kallisto object to HDF5
#'
#' Write a kallisto object to HDF5.
#'
#' @param kal the kallisto object to write out
#' @param fname the file name to write out to
#' @param overwrite whether the file should be overwritten if it exists
#' @param write_bootstrap whether the bootstraps should be written to file
#' @param compression an integer between 0 and 7 that indicates the level of compression
#'    to use, with 0 being no compression and 7 being the highest supported by this method.
#'    The default of 6 is a good choice for most applications.
#' @return the kallisto object \code{kal} invisibly.
#' @importFrom rhdf5 h5write
#' @export
write_kallisto_hdf5 <- function(kal, fname, overwrite = TRUE,
                                write_bootstrap = TRUE, compression = 6L) {
  stopifnot(is(kal, "kallisto"))
  stopifnot(is(fname, "character"))
  stopifnot(is(compression, "integer"))
  stopifnot(length(compression) == 1)

  # TODO: ensure that all bootstraps are sorted according to abundance
  if (compression < 0 || compression > 7 ) {
    stop("'compression' must be an integer between 0 and 7")
  }

  fname <- path.expand(fname)

  if (file.exists(fname)) {
    if (overwrite) {
      warning(paste0("'", fname, "' already exists. Overwriting."))
      file.remove(fname)
    } else {
      stop(paste0("'", fname, "' already exists."))
    }
  }

  if (!rhdf5::h5createFile(fname)) {
    stop(paste0("Error: Couldn't open '", fname, "' to write out."))
  }

  dims <- c(nrow(kal$abundance), 1)
  cat("dims: ", dims, "\n")

  # write out auxilary info
  rhdf5::h5createGroup(fname, "aux")

  stopifnot(rhdf5::h5createDataset(fname, "aux/ids", dims = dims,
                                   storage.mode = "character", size = 100,
                                   level = compression))

  if (write_bootstrap) {
    rhdf5::h5write(length(kal$bootstrap), fname, "aux/num_bootstrap")
  } else {
    rhdf5::h5write(0L, fname, "aux/num_bootstrap")
  }

  rhdf5::h5write(kal$abundance$target_id, fname, "aux/ids")
  rhdf5::h5write(kal$abundance$eff_len, fname, "aux/eff_lengths")
  rhdf5::h5write(kal$abundance$len, fname, "aux/lengths")
  rhdf5::h5write(kal$fld, fname, "aux/fld")
  rhdf5::h5write(kal$bias_normalized, fname, "aux/bias_normalized")
  rhdf5::h5write(kal$bias_observed, fname, "aux/bias_observed")
  rhdf5::h5write(attributes(kal)$num_processed, fname, "aux/num_processed")
  rhdf5::h5write(attributes(kal)$index_version, fname, "aux/index_version")
  rhdf5::h5write(attributes(kal)$kallisto_version, fname, "aux/kallisto_version")
  rhdf5::h5write(attributes(kal)$start_time, fname, "aux/start_time")
  rhdf5::h5write(kal$abundance$est_counts, fname, "est_counts")

  if (write_bootstrap && length(kal$bootstrap) > 0) {
    rhdf5::h5createGroup(fname, "bootstrap")
    for (i in seq_along(kal$bootstrap)) {
      bs <- kal$bootstrap[[i]]$est_counts
      bs_name <- paste0("bootstrap/bs", i-1)
      rhdf5::h5write(bs, fname, bs_name)
    }
  }

  invisible(kal)
}

#' save a sleuth object
#'
#' save a sleuth object
#'
#' @param obj a \code{sleuth} object
#' @param the location to save the object to
#' @seealso \code{\link{sleuth_load}}, \code{\link{sleuth_deploy}}
#' @export
sleuth_save <- function(obj, file) {
  if (!is(obj, 'sleuth')) {
    stop('please provide a sleuth object')
  }
  saveRDS(obj, file=file)
}

#' load a sleuth object
#'
#' load a sleuth object previously saved with \code{sleuth_save}
#'
#' @param file the file to load
#' @return a \code{sleuth} object
#' @seealso \code{\link{sleuth_save}}, \code{\link{sleuth_deploy}}
#' @export
sleuth_load <- function(file) {
  readRDS(file)
}