knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "README-"
)

DelayedMatrixStats

DelayedMatrixStats is a port of the matrixStats API to work with DelayedMatrix objects from the DelayedArray package.

For a DelayedMatrix, x, the simplest way to apply a function, f(), from matrixStats ismatrixStats::f(as.matrix(x)). However, this "realizes" x in memory as a base::matrix, which typically defeats the entire purpose of using a DelayedMatrix for storing the data.

The DelayedArray package already implements a clever strategy called "block-processing" for certain common "matrix stats" operations (e.g. colSums(), rowSums()). This is a good start, but not all of the matrixStats API is currently supported. Furthermore, certain operations can be optimized with additional information about x. I'll refer to these "seed-aware" implementations.

Installation

You can install DelayedMatrixStats from Bioconductor with:

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("DelayedMatrixStats")

Example

This example compares two ways of computing column sums of a DelayedMatrix object:

  1. DelayedMatrix::colSums(): The 'block-processing strategy', implemented in the DelayedArray package. The block-processing strategy works for any DelayedMatrix object, regardless of the type of seed.
  2. DelayedMatrixStats::colSums2(): The 'seed-aware' strategy, implemented in the DelayedMatrixStats package. The seed-aware implementation is optimized for both speed and memory but only for DelayedMatrix objects with certain types of seed.
devtools::load_all()
library(sparseMatrixStats)
library(microbenchmark)
library(profmem)
library(DelayedMatrixStats)
library(sparseMatrixStats)
library(microbenchmark)
library(profmem)
set.seed(666)

# Fast column sums of DelayedMatrix with matrix seed
dense_matrix <- DelayedArray(matrix(runif(20000 * 600), nrow = 20000,
                                    ncol = 600))
class(seed(dense_matrix))
dense_matrix
microbenchmark(DelayedArray::colSums(dense_matrix),
               DelayedMatrixStats::colSums2(dense_matrix),
               times = 10)
profmem::total(profmem::profmem(DelayedArray::colSums(dense_matrix)))
profmem::total(profmem::profmem(DelayedMatrixStats::colSums2(dense_matrix)))

# Fast, low-memory column sums of DelayedMatrix with sparse matrix seed
sparse_matrix <- seed(dense_matrix)
zero_idx <- sample(length(sparse_matrix), 0.6 * length(sparse_matrix))
sparse_matrix[zero_idx] <- 0
sparse_matrix <- DelayedArray(Matrix::Matrix(sparse_matrix, sparse = TRUE))
class(seed(sparse_matrix))
sparse_matrix
microbenchmark(DelayedArray::colSums(sparse_matrix),
               DelayedMatrixStats::colSums2(sparse_matrix),
               times = 10)
profmem::total(profmem::profmem(DelayedArray::colSums(sparse_matrix)))
profmem::total(profmem::profmem(DelayedMatrixStats::colSums2(sparse_matrix)))

# Fast column sums of DelayedMatrix with Rle-based seed
rle_matrix <- RleArray(Rle(sample(2L, 200000 * 6 / 10, replace = TRUE), 100),
                       dim = c(2000000, 6))
class(seed(rle_matrix))
rle_matrix
microbenchmark(DelayedArray::colSums(rle_matrix),
               DelayedMatrixStats::colSums2(rle_matrix),
               times = 10)
profmem::total(profmem::profmem(DelayedArray::colSums(rle_matrix)))
profmem::total(profmem::profmem(DelayedMatrixStats::colSums2(rle_matrix)))

Benchmarking

An extensive set of benchmarks is under development at http://peterhickey.org/BenchmarkingDelayedMatrixStats/.

API coverage

matrixStats <- sort(
  c("colsum", "rowsum", grep("^(col|row)", 
                             getNamespaceExports("matrixStats"), 
                             value = TRUE)))
sparseMatrixStats <- getNamespaceExports("sparseMatrixStats")
DelayedMatrixStats <- getNamespaceExports("DelayedMatrixStats")
DelayedArray <- getNamespaceExports("DelayedArray")

api_df <- data.frame(
  Method = paste0("`", matrixStats, "()`"),
  `Block processing` = ifelse(
    matrixStats %in% DelayedMatrixStats,
    "✔",
    ifelse(matrixStats %in% c(DelayedArray, sparseMatrixStats), "☑️", "❌")),
  `_base::matrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "matrix_OR_array_OR_table_OR_numeric"), 
           "✔", 
           "❌"),
  `_Matrix::dgCMatrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "xgCMatrix") | sapply(matrixStats, existsMethod, signature = "dgCMatrix"), 
           "✔", 
           "❌"),
  `_Matrix::lgCMatrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "xgCMatrix") | sapply(matrixStats, existsMethod, signature = "lgCMatrix"), 
           "✔", 
           "❌"),
  `_DelayedArray::RleArray_ (_SolidRleArraySeed_) optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "SolidRleArraySeed"),
           "✔", 
           "❌"),
  `_DelayedArray::RleArray_  (_ChunkedRleArraySeed_) optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "ChunkedRleArraySeed"),
           "✔", 
           "❌"),
  `_HDF5Array::HDF5Matrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "HDF5ArraySeed"),
           "✔", 
           "❌"),
  `_base::data.frame_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "data.frame"),
           "✔", 
           "❌"),
  `_S4Vectors::DataFrame_ optimized` =
    ifelse(sapply(matrixStats, existsMethod, signature = "DataFrame"),
           "✔", 
           "❌"), 
  check.names = FALSE)
knitr::kable(api_df, row.names = FALSE)


PeteHaitch/DelayedMatrixStats documentation built on May 6, 2024, 10:25 p.m.