knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "README-" )
DelayedMatrixStats is a port of the matrixStats API to work with DelayedMatrix objects from the DelayedArray package.
For a DelayedMatrix, x
, the simplest way to apply a function, f()
, from
matrixStats ismatrixStats::f(as.matrix(x))
. However, this "realizes"
x
in memory as a base::matrix, which typically defeats the entire purpose
of using a DelayedMatrix for storing the data.
The DelayedArray package already implements a clever strategy called
"block-processing" for certain common "matrix stats" operations (e.g.
colSums()
, rowSums()
). This is a good start, but not all of the
matrixStats API is currently supported. Furthermore, certain operations can
be optimized with additional information about x
. I'll refer to these
"seed-aware" implementations.
You can install DelayedMatrixStats from Bioconductor with:
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("DelayedMatrixStats")
This example compares two ways of computing column sums of a DelayedMatrix object:
DelayedMatrix::colSums()
: The 'block-processing strategy', implemented in the DelayedArray package. The block-processing strategy works for any DelayedMatrix object, regardless of the type of seed.DelayedMatrixStats::colSums2()
: The 'seed-aware' strategy, implemented in the
DelayedMatrixStats package. The seed-aware implementation is optimized for both speed and memory but only for DelayedMatrix objects with certain types of seed.devtools::load_all() library(sparseMatrixStats) library(microbenchmark) library(profmem)
library(DelayedMatrixStats) library(sparseMatrixStats) library(microbenchmark) library(profmem)
set.seed(666) # Fast column sums of DelayedMatrix with matrix seed dense_matrix <- DelayedArray(matrix(runif(20000 * 600), nrow = 20000, ncol = 600)) class(seed(dense_matrix)) dense_matrix microbenchmark(DelayedArray::colSums(dense_matrix), DelayedMatrixStats::colSums2(dense_matrix), times = 10) profmem::total(profmem::profmem(DelayedArray::colSums(dense_matrix))) profmem::total(profmem::profmem(DelayedMatrixStats::colSums2(dense_matrix))) # Fast, low-memory column sums of DelayedMatrix with sparse matrix seed sparse_matrix <- seed(dense_matrix) zero_idx <- sample(length(sparse_matrix), 0.6 * length(sparse_matrix)) sparse_matrix[zero_idx] <- 0 sparse_matrix <- DelayedArray(Matrix::Matrix(sparse_matrix, sparse = TRUE)) class(seed(sparse_matrix)) sparse_matrix microbenchmark(DelayedArray::colSums(sparse_matrix), DelayedMatrixStats::colSums2(sparse_matrix), times = 10) profmem::total(profmem::profmem(DelayedArray::colSums(sparse_matrix))) profmem::total(profmem::profmem(DelayedMatrixStats::colSums2(sparse_matrix))) # Fast column sums of DelayedMatrix with Rle-based seed rle_matrix <- RleArray(Rle(sample(2L, 200000 * 6 / 10, replace = TRUE), 100), dim = c(2000000, 6)) class(seed(rle_matrix)) rle_matrix microbenchmark(DelayedArray::colSums(rle_matrix), DelayedMatrixStats::colSums2(rle_matrix), times = 10) profmem::total(profmem::profmem(DelayedArray::colSums(rle_matrix))) profmem::total(profmem::profmem(DelayedMatrixStats::colSums2(rle_matrix)))
An extensive set of benchmarks is under development at http://peterhickey.org/BenchmarkingDelayedMatrixStats/.
matrixStats <- sort( c("colsum", "rowsum", grep("^(col|row)", getNamespaceExports("matrixStats"), value = TRUE))) sparseMatrixStats <- getNamespaceExports("sparseMatrixStats") DelayedMatrixStats <- getNamespaceExports("DelayedMatrixStats") DelayedArray <- getNamespaceExports("DelayedArray") api_df <- data.frame( Method = paste0("`", matrixStats, "()`"), `Block processing` = ifelse( matrixStats %in% DelayedMatrixStats, "✔", ifelse(matrixStats %in% c(DelayedArray, sparseMatrixStats), "☑️", "❌")), `_base::matrix_ optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "matrix_OR_array_OR_table_OR_numeric"), "✔", "❌"), `_Matrix::dgCMatrix_ optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "xgCMatrix") | sapply(matrixStats, existsMethod, signature = "dgCMatrix"), "✔", "❌"), `_Matrix::lgCMatrix_ optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "xgCMatrix") | sapply(matrixStats, existsMethod, signature = "lgCMatrix"), "✔", "❌"), `_DelayedArray::RleArray_ (_SolidRleArraySeed_) optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "SolidRleArraySeed"), "✔", "❌"), `_DelayedArray::RleArray_ (_ChunkedRleArraySeed_) optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "ChunkedRleArraySeed"), "✔", "❌"), `_HDF5Array::HDF5Matrix_ optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "HDF5ArraySeed"), "✔", "❌"), `_base::data.frame_ optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "data.frame"), "✔", "❌"), `_S4Vectors::DataFrame_ optimized` = ifelse(sapply(matrixStats, existsMethod, signature = "DataFrame"), "✔", "❌"), check.names = FALSE) knitr::kable(api_df, row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.