Parallel computing is supported in Signac through the future package, making it easy to specify different parallelization options. Here we demonstrate parallelization of the FeatureMatrix function and show some benchmark results to get a sense for the amount of speedup you might expect.

The Seurat package also uses future for parallelization, and you can see the Seurat vignette for more information.

The following functions currently enable parallelization in Signac:

How to enable parallelization in Signac

Parallelization can be enabled simply by importing the future package and setting the plan.

library(future)
plan()

By default the plan is set to sequential processing (no parallelization). We can change this to multicore or multisession to get asynchronous processing, and set the number of workers to change the number of cores used.

plan("multicore", workers = 10)
plan()

You might also need to increase the maximum memory usage:

options(future.globals.maxSize = 50 * 1024 ^ 3) # for 50 Gb RAM

Note that as of future version 1.14.0, forked processing is disabled when running in RStudio. To enable parallel computing in RStudio, you will need to select the "multisession" option.

Benchmarking

Here we demonstrate the runtime of FeatureMatrix run on 144,023 peaks for 9,688 human PBMCs under different parallelization options:

View benchmarking code

The following code was run on REHL with Intel Platinum 8268 CPU @ 2.00GHz

```{bash eval=FALSE}

download data

wget https://cf.10xgenomics.com/samples/cell-atac/2.0.0/atac_pbmc_10k_nextgem/atac_pbmc_10k_nextgem_fragments.tsv.gz wget https://cf.10xgenomics.com/samples/cell-atac/2.0.0/atac_pbmc_10k_nextgem/atac_pbmc_10k_nextgem_fragments.tsv.gz.tbi wget https://cf.10xgenomics.com/samples/cell-atac/2.0.0/atac_pbmc_10k_nextgem/atac_pbmc_10k_nextgem_peaks.bed wget https://cf.10xgenomics.com/samples/cell-atac/2.0.0/atac_pbmc_10k_nextgem/atac_pbmc_10k_nextgem_singlecell.csv

```r
library(Signac)

# load data
fragments <- "../vignette_data/atac_pbmc_10k_nextgem_fragments.tsv.gz"
peaks.10k <- read.table(
  file = "../vignette_data/atac_pbmc_10k_nextgem_peaks.bed",
  col.names = c("chr", "start", "end")
)
peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks.10k)
md <- read.csv("../vignette_data/atac_pbmc_10k_nextgem_singlecell.csv", row.names = 1, header = TRUE)[-1, ]
cells <- rownames(md[md[['is__cell_barcode']] == 1, ])

fragments <- CreateFragmentObject(path = fragments, cells = cells, validate.fragments = FALSE)

# set number of replicates
nrep <- 5
results <- data.frame()
process_n <- 2000

# run sequentially
timing.sequential <- c()
for (i in seq_len(nrep)) {
  start <- Sys.time()
  fmat <- FeatureMatrix(fragments = fragments, features = peaks, cells = cells, process_n = process_n)
  timing.sequential <- c(timing.sequential, as.numeric(Sys.time() - start, units = "secs"))
}
res <- data.frame(
  "setting" = rep("Sequential", nrep),
  "cores" = rep(1, nrep),
  "replicate" = seq_len(nrep),
  "time" = timing.sequential
)
results <- rbind(results, res)

# 4 core
library(future)
plan("multicore", workers = 4)
options(future.globals.maxSize = 100000 * 1024^2)

timing.4core <- c()
for (i in seq_len(nrep)) {
  start <- Sys.time()
  fmat <- FeatureMatrix(fragments = fragments, features = peaks, cells = cells, process_n = process_n)
  timing.4core <- c(timing.4core, as.numeric(Sys.time() - start, units = "secs"))
}
res <- data.frame(
  "setting" = rep("Parallel", nrep),
  "cores" = rep(4, nrep),
  "replicate" = seq_len(nrep),
  "time" = timing.4core
)
results <- rbind(results, res)

# 10 core
plan("multicore", workers = 10)

timing.10core <- c()
for (i in seq_len(nrep)) {
  start <- Sys.time()
  fmat <- FeatureMatrix(fragments = fragments, features = peaks, cells = cells, process_n = process_n)
  timing.10core <- c(timing.10core, as.numeric(Sys.time() - start, units = "secs"))
}
res <- data.frame(
  "setting" = rep("Parallel", nrep),
  "cores" = rep(10, nrep),
  "replicate" = seq_len(nrep),
  "time" = timing.10core
)
results <- rbind(results, res)

# save results
write.table(
  x = results,
  file = paste0("../vignette_data/pbmc10k/timings_", Sys.Date(), ".tsv"),
  quote = FALSE,
  row.names = FALSE
)

library(ggplot2)
results <- read.table("../vignette_data/pbmc10k/timings_2023-03-21.tsv", header = TRUE)
results$cores <- factor(results$cores, levels = c("1", "4", "10"))

p <- ggplot(results, aes(x = cores, y = time/60, color = cores)) +
  geom_jitter(width = 1/10) +
  ylim(c(0, 8)) +
  ylab("Time (min)") +
  xlab("Number of cores") +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("FeatureMatrix runtime")
p

Session Info

sessionInfo()



timoast/signac documentation built on April 5, 2024, 1:34 a.m.