sleuth-ALR: Compositional Approach to Sleuth Data Analysis

Documented in choose_denom

#' Choose denominator
#'
#' This function selects the best denominator for an ALR
#' transformation based on which feature has the most consistent
#' proportion across all samples. Currently, only "coefficient of variation"
#' (choose_method "cov") is implemented. Users can supply a "sleuth" object, or
#' alternatively can supply a sample_to_covariates table. It runs a truncated
#' form of sleuth_prep to get raw counts and TPMs to calculate the metric.
#'
#' @param obj a sleuth object. This or sample_info must be specified. If both
#'   are specified, the sleuth object will take precedence.
#' @param sample_info a sample_to_covariates table that must match sleuth API.
#'   see ?sleuth::sleuth_prep for more information
#' @param target_mapping a table that matches each target_id to gene identifiers
#'   or other metadata. If aggregation_column is specified, this table must be
#'   provided, and it must have aggregation_column as one of its named columns.
#' @param aggregation_column If you plan to examine gene-level information,
#'   you must provide an aggregation_column to aggregate transcripts to the gene
#'   level. You must provide a target_mapping table with a column whose name is
#'   this argument or a sleuth object with that table.
#' @param gene_mode If you plan to examine gene-level information and want to do
#'   count aggregation to select a denominator, this must be set to \code{TRUE}.
#'   Transcript-level selection will be used if \code{FALSE}.
#' @param num_cores the number of cores that parallel should use to process samples
#' @param num_denoms the number of features to select for normalization. The default is one.
#' @param denom_var must be one of "est_counts", "tpm", or "scaled_reads_per_base" (for gene-level counts).
#' @param min_value the minimum threshold for the mean of 'denom_var' for the candidate denominator.
#' @param choose_method the metric used to select the denominator. Currently only "cov" is implemented.
#' @param filter_length boolean to filter possible denominators by length. This requires that 'length'
#'   is a column in the target_mapping. If \code{TRUE}, then all features with a length less
#'   than 300 bases is excluded from consideration. This is recommended when modeling TPMs,
#'   and is thus \code{TRUE} by default since TPMs are modeled by default.
#'
#' @return a character vector with one or more denominators for use. If multiple
#'   features have ties for the metric of choice, currently all of them are returned so
#'   that their geometric mean can be used as the denominator.
#' @importFrom data.table as.data.table
#' @importFrom dplyr select_
#' @importFrom methods is
#' @importFrom sleuth sleuth_prep sleuth_to_matrix
#' @importFrom stats formula
#' @importFrom matrixStats rowAnys rowSds
#' @export
choose_denom <- function(obj = NULL, sample_info = NULL, target_mapping = NULL,
                         aggregation_column = NULL, gene_mode = FALSE, num_cores = 1,
                         num_denoms = 1, denom_var = "tpm", min_value = 5,
                         choose_method = "cov", filter_length = TRUE) {
  stopifnot(denom_var %in% c("est_counts", "scaled_reads_per_base", "tpm"))
  if (is.null(obj)) {
    if (is.null(sample_info)) {
      stop("You must provide either a sleuth object to 'obj', ",
           "or a sample_to_covariates data frame to 'sample_info'.")
    } else if (!is.null(aggregation_column) & is.null(target_mapping)) {
      stop("If an aggregation_column is provided, you must also ",
           "provide target_mapping to do the aggregation on.")
    } else if (gene_mode && !is.null(aggregation_column)) {
      stop("If 'gene_mode' is TRUE, an 'aggregation_column' must be provided")
    }
    message("Preparing the sleuth object to select the best denominator")
    if (gene_mode) {
      obj <- suppressMessages(sleuth::sleuth_prep(sample_info,
                                                  target_mapping = target_mapping,
                                                  aggregation_column = aggregation_column,
                                                  gene_mode = gene_mode,
                                                  norm_fun_counts = norm_identity,
                                                  norm_fun_tpm = norm_identity,
                                                  max_bootstrap = 2, num_cores = 1,
                                                  full_model = stats::formula("~1")))
    } else {
      obj <- suppressMessages(sleuth::sleuth_prep(sample_info,
                                                  target_mapping = target_mapping,
                                                  num_cores = 1,
                                                  normalize = FALSE))
    }
  } else {
    if(!is(obj, "sleuth")) {
      stop("You must provide either a sleuth object to 'obj', ",
           "or a sample_to_covariates data frame to 'sample_info'.")
    } else if (!is.null(sample_info)) {
      warning("You provided both a sleuth object and a ",
              "sample_to_covariates data frame. The s2c data frame will be ignored.")
    }
  }

  if (gene_mode && denom_var == "est_counts") {
    denom_var <- "scaled_reads_per_base"
  } else if (is.null(aggregation_column) && denom_var == "scaled_reads_per_base") {
    denom_var <- "est_counts"
  }

  if (filter_length) {
    message("filtering by length")
    if(!("length" %in% names(target_mapping))) {
      stop("The 'target_mapping' table is missing a 'length' column. This is required to filter features by length.")
    }
    if(gene_mode) {
      transcript_lengths <- data.table::as.data.table(dplyr::select_(target_mapping, aggregation_column, "length"))
      length_bool <- transcript_lengths[, list(bool = all(length >= 300)), by = list(eval(parse(text = aggregation_column)))]
      length_bool_ids <- length_bool[[aggregation_column]][length_bool$bool]
    } else {
      transcript_lengths <- dplyr::select_(target_mapping, "target_id", "length")
      length_bool <- transcript_lengths$length >= 300
      length_bool_ids <- transcript_lengths$target_id[length_bool]
    }
  }

  if (choose_method == "cov") {
    message("Calculating the coefficient of variation of all targets")
    if (is.null(aggregation_column)) {
      mat <- sleuth::sleuth_to_matrix(obj, 'obs_raw', denom_var)
    } else {
      mat <- sleuth::sleuth_to_matrix(obj, 'obs_norm', denom_var)
    }
    # only have a matrix containing filtered values, to remove low expressing transcripts
    # which often have low coefficients of variation
    # gene-level, using the normalized matrix, is already filtered
    allNonZero <- !matrixStats::rowAnys(mat, value = 0)
    mat <- mat[obj$filter_bool & allNonZero, ]

    if (filter_length) mat <- mat[row.names(mat) %in% length_bool_ids, ]
    mean_vals <- rowMeans(mat, na.rm = T)
    cov <- matrixStats::rowSds(mat, na.rm = T) / rowMeans(mat, na.rm = T)

    if (num_denoms == 1) {
      min_cov <- min(cov[which(mean_vals >= min_value & cov > 0)], na.rm = T)
      min_index <- which(cov == min_cov)
    } else {
      filt_cov <- cov[which(mean_vals >= min_value & cov > 0)]
      ordered_cov <- filt_cov[order(filt_cov)]
      min_cov <- ordered_cov[1:num_denoms]
      min_index <- which(cov %in% min_cov)
    }
    denom_name <- names(cov)[min_index]
  } else {
    stop("Currently, 'cov' is the only implemented choose_method. ",
         "Contact the developer if you are interested in another choose_method.")
  }
  denom_name
}