inst/doc/DelayedMatrixStatsOverview.R

## ---- include = FALSE, setup--------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, comment = "#>", collapse = TRUE,
                      message = FALSE)

## ----data_sim, message = FALSE------------------------------------------------
library(DelayedArray)

x <- do.call(cbind, lapply(1:20, function(j) {
  rpois(n = 10000, lambda = sample(20:40, 10000, replace = TRUE))
}))
colnames(x) <- paste0("S", 1:20)
x <- realize(x, "HDF5Array")
x

## ----apply--------------------------------------------------------------------
system.time(row_sds <- apply(x, 1, sd))
head(row_sds)

## ----matrixStats, error = TRUE------------------------------------------------
matrixStats::rowSds(x)

## ----realization--------------------------------------------------------------
system.time(row_sds <- matrixStats::rowSds(as.matrix(x)))
head(row_sds)

## ----DelayedMatrixStats-------------------------------------------------------
library(DelayedMatrixStats)

system.time(row_sds <- rowSds(x))
head(row_sds)

## ----API, echo = FALSE--------------------------------------------------------
matrixStats <- sort(
  c("colsum", "rowsum", grep("^(col|row)", 
                             getNamespaceExports("matrixStats"), 
                             value = TRUE)))
sparseMatrixStats <- getNamespaceExports("sparseMatrixStats")
DelayedMatrixStats <- getNamespaceExports("DelayedMatrixStats")
DelayedArray <- getNamespaceExports("DelayedArray")

api_df <- data.frame(
  Method = paste0("`", matrixStats, "()`"),
  `Block processing` = ifelse(
    matrixStats %in% DelayedMatrixStats,
    "✔",
    ifelse(matrixStats %in% c(DelayedArray, sparseMatrixStats), "☑️", "❌")),
  `_base::matrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "matrix_OR_array_OR_table_OR_numeric"), 
           "✔", 
           "❌"),
  `_Matrix::dgCMatrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "xgCMatrix") | sapply(matrixStats, existsMethod, signature = "dgCMatrix"), 
           "✔", 
           "❌"),
  `_Matrix::lgCMatrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "xgCMatrix") | sapply(matrixStats, existsMethod, signature = "lgCMatrix"), 
           "✔", 
           "❌"),
  `_DelayedArray::RleArray_ (_SolidRleArraySeed_) optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "SolidRleArraySeed"),
           "✔", 
           "❌"),
  `_DelayedArray::RleArray_  (_ChunkedRleArraySeed_) optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "ChunkedRleArraySeed"),
           "✔", 
           "❌"),
  `_HDF5Array::HDF5Matrix_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "HDF5ArraySeed"),
           "✔", 
           "❌"),
  `_base::data.frame_ optimized` = 
    ifelse(sapply(matrixStats, existsMethod, signature = "data.frame"),
           "✔", 
           "❌"),
  `_S4Vectors::DataFrame_ optimized` =
    ifelse(sapply(matrixStats, existsMethod, signature = "DataFrame"),
           "✔", 
           "❌"), 
  check.names = FALSE)
knitr::kable(api_df, row.names = FALSE)

## ----benchmarking, message = FALSE, echo = TRUE, error = TRUE-----------------
library(DelayedMatrixStats)
library(sparseMatrixStats)
library(microbenchmark)
library(profmem)

set.seed(666)

# -----------------------------------------------------------------------------
# Dense with values in (0, 1)
# Fast, memory-efficient column sums of DelayedMatrix with ordinary matrix seed
#

# Generate some data
dense_matrix <- matrix(runif(20000 * 600), 
                       nrow = 20000,
                       ncol = 600)

# Benchmark
dm_matrix <- DelayedArray(dense_matrix)
class(seed(dm_matrix))
dm_matrix
microbenchmark(
  block_processing = colSums2(dm_matrix, force_block_processing = TRUE),
  seed_aware = colSums2(dm_matrix),
  times = 10)
total(profmem(colSums2(dm_matrix, force_block_processing = TRUE)))
total(profmem(colSums2(dm_matrix)))

# -----------------------------------------------------------------------------
# Sparse (60% zero) with values in (0, 1)
# Fast, memory-efficient column sums of DelayedMatrix with ordinary matrix seed
#

# Generate some data
sparse_matrix <- dense_matrix
zero_idx <- sample(length(sparse_matrix), 0.6 * length(sparse_matrix))
sparse_matrix[zero_idx] <- 0

# Benchmark
dm_dgCMatrix <- DelayedArray(Matrix(sparse_matrix, sparse = TRUE))
class(seed(dm_dgCMatrix))
dm_dgCMatrix
microbenchmark(
  block_processing = colSums2(dm_dgCMatrix, force_block_processing = TRUE),
  seed_aware = colSums2(dm_dgCMatrix),
  times = 10)
total(profmem(colSums2(dm_dgCMatrix, force_block_processing = TRUE)))
total(profmem(colSums2(dm_dgCMatrix)))

# -----------------------------------------------------------------------------
# Dense with values in {0, 100} featuring runs of identical values
# Fast, memory-efficient column sums of DelayedMatrix with Rle-based seed
#

# Generate some data
runs <- rep(sample(100, 500000, replace = TRUE), rpois(500000, 100))
runs <- runs[seq_len(20000 * 600)]
runs_matrix <- matrix(runs, 
                      nrow = 20000,
                      ncol = 600)

# Benchmark
dm_rle <- RleArray(Rle(runs),
                   dim = c(20000, 600))
class(seed(dm_rle))
dm_rle
microbenchmark(
  block_processing = colSums2(dm_rle, force_block_processing = TRUE),
  seed_aware = colSums2(dm_rle),
  times = 10)
total(profmem(colSums2(dm_rle, force_block_processing = TRUE)))
total(profmem(colSums2(dm_rle)))

## ----sin----------------------------------------------------------------------
system.time(sin_dm_matrix <- sin(dm_matrix))

## ----colSums2_sin-------------------------------------------------------------
all.equal(colSums2(sin_dm_matrix), colSums(sin(as.matrix(dm_matrix))))

Try the DelayedMatrixStats package in your browser

Any scripts or data that you put into this service are public.

DelayedMatrixStats documentation built on Feb. 5, 2021, 2:04 a.m.