title: "Fastest way to find dominant CTSS" author: "Charles Plessy" date: 'February 21, 2022' output: html_document: keep_md: yes editor_options: chunk_output_type: console
We use data.table
to find the dominant CTSS in tag clusters. I am
implementing a similar function for consensus clusters. Shall I do it
with Bioconductor core functions, data.table
or dplyr
?
We have CTSSes and clusters. The clusters may overlap with each other.
library(CAGEr) |> suppressPackageStartupMessages()
ce <- exampleCAGEexp
ctss <- CTSScoordinatesGR(ce)
score(ctss) <- CTSSnormalizedTpmDF(ce) |> DelayedArray::DelayedArray() |> rowSums()
clusters <- consensusClustersGR(ce)
mcols(clusters) <- mcols(clusters)[,c("score", "dominant_ctss", "tpm.dominant_ctss")] # Simplify
clusters.orig <- clusters
mcols(clusters) <- NULL
The functions are given a lookup table associating CTSSes with clusters, and a function to find the dominant CTSS and break ties by selecting the one at the center.
o <- findOverlaps(clusters, ctss)
find.dominant.idx <- function (x) {
# which.max is breaking ties by taking the last, but this will give slightly
# different biases on plus an minus strands.
w <- which(x == max(x))
w[ceiling(length(w)/2)]
}
bioC <- function () {
s <- extractList(score(ctss), o)
m <- sapply(s, find.dominant.idx)
grl <- extractList(granges(ctss), o)
dom <- mapply(`[`, grl, m) |> GRangesList() |> unlist()
clusters$dominant_ctss <- dom
clusters$tpm.dominant_ctss <- max(s)
clusters
}
bioC() -> bioC_results
R
bioC2 <- function () {
rl <- rle(queryHits(o))$length
cluster_start_idx <- cumsum(c(1, head(rl, -1))) # Where each run starts
grouped_scores <- extractList(score(ctss), o)
local_max_idx <- sapply(grouped_scores, find.dominant.idx) -1 # Start at zero
global_max_ids <- cluster_start_idx + local_max_idx
clusters$dominant_ctss <- granges(ctss)[subjectHits(o)][global_max_ids]
clusters$tpm.dominant_ctss <- score(ctss)[subjectHits(o)][global_max_ids]
clusters
}
bioC2() -> bioC_results2
data.table
waylibrary("data.table") |> suppressPackageStartupMessages()
dataTable <- function() {
dt <- ctss |> as.data.frame() |> data.table::as.data.table()
dt$id <- dt$cluster |> as.factor() |> as.integer()
dom <- dt[ , list( seqnames[1]
, strand[1]
, pos[find.dominant.idx(score)]
, max(score))
, by = id]
setnames(dom, c( "cluster", "chr", "strand"
, "dominant_ctss", "tpm.dominant_ctss"))
clusters$dominant_ctss <- GRanges(dom$chr, dom$dominant_ctss, dom$strand)
seqinfo(clusters$dominant_ctss) <- seqinfo(ctss)
clusters$tpm.dominant_ctss <- dom$tpm.dominant_ctss
clusters
}
dataTable() -> dataTable_results
dplyr
waydplyr <- function() {
tb <- ctss |> as.data.frame() |> tibble::as_tibble()
tb$id <- tb$cluster |> as.factor() |> as.integer()
dom <- tb |> dplyr::group_by(id) |>
dplyr::summarise(seqnames = unique(seqnames), strand = unique(strand),
tpm.dominant_ctss = max(score), dominant_ctss = pos[find.dominant.idx(score)])
clusters$dominant_ctss <- GRanges(dom$seqnames, dom$dominant_ctss, dom$strand)
seqinfo(clusters$dominant_ctss) <- seqinfo(ctss)
clusters$tpm.dominant_ctss <- dom$tpm.dominant_ctss
clusters
}
dplyr() -> dplyr_results
bioC_results
## ConsensusClusters object with 806 ranges and 2 metadata columns:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## chr17:26027430:+ chr17 26027430 + |
## chr17:26050540:+ chr17 26050540 + |
## chr17:26118088:+ chr17 26118088 + |
## chr17:26142853:+ chr17 26142853 + |
## chr17:26166954:+ chr17 26166954 + |
## ... ... ... ... .
## chr17:32706021-32706407:+ chr17 32706021-32706407 + |
## chr17:32706605:+ chr17 32706605 + |
## chr17:32707132-32707170:+ chr17 32707132-32707170 + |
## chr17:32707322-32707376:+ chr17 32707322-32707376 + |
## chr17:32708847-32708958:+ chr17 32708847-32708958 + |
## dominant_ctss tpm.dominant_ctss
## <GRanges> <numeric>
## chr17:26027430:+ chr17:26027430:+ 64.1719
## chr17:26050540:+ chr17:26050540:+ 22.3101
## chr17:26118088:+ chr17:26118088:+ 64.1719
## chr17:26142853:+ chr17:26142853:+ 56.0704
## chr17:26166954:+ chr17:26166954:+ 64.1719
## ... ... ...
## chr17:32706021-32706407:+ chr17:32706231:+ 14993.7493
## chr17:32706605:+ chr17:32706605:+ 64.1719
## chr17:32707132-32707170:+ chr17:32707135:+ 99.6448
## chr17:32707322-32707376:+ chr17:32707322:+ 67.2963
## chr17:32708847-32708958:+ chr17:32708890:+ 1319.9060
## -------
## seqinfo: 26 sequences (1 circular) from danRer7 genome
identical(bioC_results, bioC_results2)
## [1] TRUE
identical(bioC_results, dataTable_results)
## [1] TRUE
identical(bioC_results, dplyr_results)
## [1] TRUE
(benchmark <- microbenchmark::microbenchmark(bioC(), bioC2(), dataTable(), dplyr()))
## Unit: milliseconds
## expr min lq mean median uq max
## bioC() 5383.80738 5604.18083 5810.65343 5682.86584 5800.26130 6887.00923
## bioC2() 48.39151 51.30989 53.76299 52.46706 54.49261 68.59898
## dataTable() 74.60784 79.24606 84.51815 80.90140 86.89849 126.27618
## dplyr() 145.54982 150.83453 159.48283 154.80674 159.93879 203.85146
## neval
## 100
## 100
## 100
## 100
library("ggplot2") |> suppressPackageStartupMessages()
ggplot(benchmark, aes(x = time / 1e9, y = expr, color = expr)) + # Plot performance comparison
geom_boxplot() +
scale_x_log10("time (seconds)")
The approach combining findOverlaps
from Bionductor and cumulative sums from
base R works the best on test data, with comparable performance with dplyr
and data.table
. This opens the way to the removal of the dependency to
data.table
. In the future, if we start to import dplyr
for reasons related
to ggplot2
or plyranges
, we may might replace the Bioc + base R version
with a dplyr
one, that is probably easier to read for most contributors.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.