We use data.table
to find the dominant CTSS in tag clusters. I am
implementing a similar function for consensus clusters. Shall I do it
with Bioconductor core functions, data.table
or dplyr
?
We have CTSSes and clusters. The clusters may overlap with each other.
library(CAGEr) |> suppressPackageStartupMessages() ce <- exampleCAGEexp ctss <- CTSScoordinatesGR(ce) score(ctss) <- CTSSnormalizedTpmDF(ce) |> DelayedArray::DelayedArray() |> rowSums() clusters <- consensusClustersGR(ce) mcols(clusters) <- mcols(clusters)[,c("score", "dominant_ctss", "tpm.dominant_ctss")] # Simplify clusters.orig <- clusters mcols(clusters) <- NULL
The functions are given a lookup table associating CTSSes with clusters, and a function to find the dominant CTSS and break ties by selecting the one at the center.
o <- findOverlaps(clusters, ctss) find.dominant.idx <- function (x) { # which.max is breaking ties by taking the last, but this will give slightly # different biases on plus an minus strands. w <- which(x == max(x)) w[ceiling(length(w)/2)] }
bioC <- function () { s <- extractList(score(ctss), o) m <- sapply(s, find.dominant.idx) grl <- extractList(granges(ctss), o) dom <- mapply(`[`, grl, m) |> GRangesList() |> unlist() clusters$dominant_ctss <- dom clusters$tpm.dominant_ctss <- max(s) clusters } bioC() -> bioC_results
R
bioC2 <- function () { rl <- rle(queryHits(o))$length cluster_start_idx <- cumsum(c(1, head(rl, -1))) # Where each run starts grouped_scores <- extractList(score(ctss), o) local_max_idx <- sapply(grouped_scores, find.dominant.idx) -1 # Start at zero global_max_ids <- cluster_start_idx + local_max_idx clusters$dominant_ctss <- granges(ctss)[subjectHits(o)][global_max_ids] clusters$tpm.dominant_ctss <- score(ctss)[subjectHits(o)][global_max_ids] clusters } bioC2() -> bioC_results2
data.table
waylibrary("data.table") |> suppressPackageStartupMessages() dataTable <- function() { dt <- ctss |> as.data.frame() |> data.table::as.data.table() dt$id <- dt$cluster |> as.factor() |> as.integer() dom <- dt[ , list( seqnames[1] , strand[1] , pos[find.dominant.idx(score)] , max(score)) , by = id] setnames(dom, c( "cluster", "chr", "strand" , "dominant_ctss", "tpm.dominant_ctss")) clusters$dominant_ctss <- GRanges(dom$chr, dom$dominant_ctss, dom$strand) seqinfo(clusters$dominant_ctss) <- seqinfo(ctss) clusters$tpm.dominant_ctss <- dom$tpm.dominant_ctss clusters } dataTable() -> dataTable_results
dplyr
waydplyr <- function() { tb <- ctss |> as.data.frame() |> tibble::as_tibble() tb$id <- tb$cluster |> as.factor() |> as.integer() dom <- tb |> dplyr::group_by(id) |> dplyr::summarise(seqnames = unique(seqnames), strand = unique(strand), tpm.dominant_ctss = max(score), dominant_ctss = pos[find.dominant.idx(score)]) clusters$dominant_ctss <- GRanges(dom$seqnames, dom$dominant_ctss, dom$strand) seqinfo(clusters$dominant_ctss) <- seqinfo(ctss) clusters$tpm.dominant_ctss <- dom$tpm.dominant_ctss clusters } dplyr() -> dplyr_results
bioC_results identical(bioC_results, bioC_results2) identical(bioC_results, dataTable_results) identical(bioC_results, dplyr_results) (benchmark <- microbenchmark::microbenchmark(bioC(), bioC2(), dataTable(), dplyr())) library("ggplot2") |> suppressPackageStartupMessages() ggplot(benchmark, aes(x = time / 1e9, y = expr, color = expr)) + # Plot performance comparison geom_boxplot() + scale_x_log10("time (seconds)")
The approach combining findOverlaps
from Bionductor and cumulative sums from
base R works the best on test data, with comparable performance with dplyr
and data.table
. This opens the way to the removal of the dependency to
data.table
. In the future, if we start to import dplyr
for reasons related
to ggplot2
or plyranges
, we may might replace the Bioc + base R version
with a dplyr
one, that is probably easier to read for most contributors.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.