R/helpers.R

Defines functions delayed_matrix_multiply subtract_vector_from_each_column .handle_overdispersion estimate_size_factors .handle_size_factors .handle_data_parameter

Documented in estimate_size_factors .handle_data_parameter

#' Take any kind of data and extract the matrix
#'
#'
#' Adapted from glmGamPoi:::handle_data_parameter
#'
#' @return A matrix.
#'
#' @keywords internal
.handle_data_parameter <- function(data, on_disk, allow_sparse = TRUE){
  if(is.vector(data)){
    data <- matrix(data, nrow = 1)
  }
  if(is.matrix(data)){
    if(! is.numeric(data)){
      stop("The data argument must consist of numeric values and not of ", mode(data), " values")
    }
    if(is.null(on_disk) || isFALSE(on_disk)){
      data_mat <- data
    }else if(isTRUE(on_disk)){
      data_mat <- HDF5Array::writeHDF5Array(data)
    }else{
      stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'")
    }
  }else if(is(data, "DelayedArray")){
    if(is.null(on_disk) || isTRUE(on_disk)){
      data_mat <- data
    }else if(isFALSE(on_disk)){
      data_mat <- as.matrix(data)
    }else{
      stop("Illegal argument type for on_disk. Can only handle 'NULL', 'TRUE', or 'FALSE'")
    }
  }else if(is(data, "SummarizedExperiment")){
    data_mat <- .handle_data_parameter(SummarizedExperiment::assay(data, "counts"), on_disk, allow_sparse)
  }else if(canCoerce(data, "SummarizedExperiment")){
    se <- as(data, "SummarizedExperiment")
    data_mat <- .handle_data_parameter(SummarizedExperiment::assay(se, "counts"), on_disk, allow_sparse)
  }else if(is(data, "dgCMatrix") || is(data, "dgTMatrix")) {
    if(isTRUE(on_disk)){
      data_mat <- HDF5Array::writeHDF5Array(data)
    }else if(isFALSE(on_disk)){
      if(allow_sparse){
        data_mat <- data
      }else{
        data_mat <- as.matrix(data)
      }
    }else{
      if(allow_sparse){
        data_mat <- data
      }else{
        stop("transformGamPoi does not yet support sparse input data of type '", class(data),"'. ",
             "Please explicitly set the 'on_disk' parameter to force a conversion to a dense format either in-memory ('on_disk = FALSE') ",
             "or on-disk ('on_disk = TRUE')")
      }
    }
  }else if(inherits(data, "glmGamPoi")){
    data_mat <- .handle_data_parameter(data$data, on_disk, allow_sparse)
  }else{
    stop("Cannot handle data of class '", class(data), "'.",
         "It must be of a matrix object (i.e., a base matrix or DelayedArray),",
         " or a container for such a matrix (for example: SummarizedExperiment).")
  }
  data_mat
}




.handle_size_factors <- function(size_factors, Y, verbose = FALSE){
  n_samples <- ncol(Y)

  if(isTRUE(size_factors) || is.character(size_factors)){
    method <- if(isTRUE(size_factors)){
      "normed_sum"
    }else if(all(size_factors == c("normed_sum", "deconvolution", "poscounts"))){
      "normed_sum"
    }else if(length(size_factors) == 1 && size_factors %in% c("normed_sum", "deconvolution", "poscounts")){
      size_factors
    }else{
      stop("Cannot handle size factor ", paste0(size_factors, collapse = ", "))
    }
    if(verbose){ message("Calculate Size Factors (", method, ")") }
    sf <- estimate_size_factors(Y, method = method, verbose = verbose)
  }else if(isFALSE(size_factors)){
    sf <- rep(1, n_samples)
  }else{
    stopifnot(is.numeric(size_factors) && (length(size_factors) == 1 || length(size_factors) == n_samples))
    if(any(size_factors < 0)){
      stop("size factor 'size_factors' must be larger than 0")
    }
    if(length(size_factors) == 1){
      sf <- rep(size_factors, n_samples)
    }else{
      sf <- size_factors
    }
  }
  sf
}







#' Estimate the Size Factors
#'
#' @param Y any matrix-like object (\code{base::matrix()}, \code{DelayedArray}, \code{HDF5Matrix},
#'   \code{Matrix::Matrix()}, etc.)
#' @param method one of \code{c("normed_sum", "deconvolution", "poscounts")}
#'
#'
#' @return a vector with one size factor per column of `Y`
#'
#' @keywords internal
estimate_size_factors <- function(Y, method, verbose = FALSE){
  if(nrow(Y) <= 1){
    if(verbose) {
      message("Skipping size factor estimation, because there is only a single gene.",
              call. = FALSE)
    }
    return(rep(1, ncol(Y)))
  }
  stopifnot(length(method) == 1 && is.character(method))

  if(method == "poscounts"){
    # Accept any matrix-like object
    log_geometric_means <- MatrixGenerics::rowMeans2(log(Y + 0.5))
    Y2 <- Y
    Y2[Y2 == 0] <- NA
    sf <- exp(MatrixGenerics::colMedians(subtract_vector_from_each_column(log(Y2), log_geometric_means), na.rm = TRUE))
  }else if(method == "deconvolution"){
    if(requireNamespace("scran", quietly = TRUE)){
      tryCatch({
        sf <- scran::calculateSumFactors(Y)
      }, error = function(err){
        stop("Error in size factor estimation with 'size_factors = \"deconvolution\"'.\n",
             "Alternative size factor estimation procedures are: \"normed_sum\" or \"poscounts\"\n",
             "'scran::calculateSumFactors(Y)' threw the following error: \n\t",
             err, call. = FALSE)
      })
    }else{
      stop("To use the \"deconvolution\" method for size factor calculation, you need to install the ",
           "'scran' package from Bioconductor. Alternatively, you can use \"normed_sum\" or \"poscounts\"",
           "to calculate the size factors.", call. = FALSE)
    }
  }else if(method == "normed_sum"){
    sf <- colSums2(Y)
    # sf <- matrixStats::colSums2(as.matrix(Y))
  }else{
    stop("Unknown size factor estimation method: ", method)
  }



  # stabilize size factors to have geometric mean of 1
  all_zero_column <- is.nan(sf) | sf <= 0
  sf[all_zero_column] <- NA
  if(any(all_zero_column)){
    sf <- sf/exp(mean(log(sf), na.rm=TRUE))
    sf[all_zero_column] <- 0.001
  }else{
    sf <- sf/exp(mean(log(sf)))
  }
  sf
}


.handle_overdispersion <- function(overdispersion, counts, verbose = FALSE){
  stopifnot(length(overdispersion) == 1 || length(overdispersion) == nrow(counts) || all(dim(overdispersion) == dim(counts)))
  if(length(overdispersion) == 1){
    rep(overdispersion, nrow(counts))
  }else{
    overdispersion
  }

  if(verbose && any(overdispersion > 1)){
    warning("The overdispersion at position", paste0(head(which(overdispersion > 1)), collapse = ", "), " seems unusually large.")
  }
  overdispersion
}


subtract_vector_from_each_column <- function(matrix, vector){
  stopifnot(length(vector) == 1 || length(vector) == nrow(matrix))
  matrix - vector
}



delayed_matrix_multiply <- function(x, y){
  res_sink <- HDF5Array::HDF5RealizationSink(c(nrow(x), ncol(y)))
  on.exit({
    DelayedArray::close(res_sink)
  }, add = TRUE)

  res_grid <- DelayedArray::defaultAutoGrid(res_sink)

  row_ticks <- cumsum(vapply(seq_len(dim(res_grid)[1]), function(idx){
    dim(res_grid[[idx, 1L]])[1]
  }, FUN.VALUE = 0L))
  col_ticks <- cumsum(vapply(seq_len(dim(res_grid)[2]), function(idx){
    dim(res_grid[[1L, idx]])[2]
  }, FUN.VALUE = 0L))



  x_grid <- DelayedArray::ArbitraryArrayGrid(tickmarks = list(row_ticks, ncol(x)))
  y_grid <- DelayedArray::ArbitraryArrayGrid(tickmarks = list(nrow(y), col_ticks))


  for (coord1 in seq_len(ncol(res_grid))) {
    for(coord2 in seq_len(nrow(res_grid))){
      x_block <- DelayedArray::read_block(x, x_grid[[coord2]])
      y_block <- DelayedArray::read_block(y, y_grid[[coord1]])
      res_block <- x_block %*% y_block
      DelayedArray::write_block(res_sink, res_grid[[coord2, coord1]], res_block)
    }
  }

  as(res_sink, "DelayedArray")
}
const-ae/transformGamPoi documentation built on April 14, 2023, 11:33 p.m.