To accelerate plotting and reduce the file size of vector graphics created with the plotCorrelation2 method, points are deduplicated after reducing their precision to a smaller number of significant digits. After benchmarking the alternatives below, I chose the approach based on complex numbers, because its performance is similar to that of data.table or dplyr, and because complex numbers are available in any standard R installation without additional packages.

# Set the console output width to 120 characters (presumably so that the
# benchmark tables printed below are not wrapped).
options(width=120)
# Cache chunk results so the benchmarks are not re-run on every build.
# NOTE(review): cache.lazy = FALSE is presumably needed because the cached
# vectors are large -- confirm against the knitr documentation.
knitr::opts_chunk$set(cache  = TRUE, cache.lazy = FALSE)
knitr::opts_knit$set(verbose = TRUE)
# Fix the RNG seed so that the simulated data are reproducible.  The values
# depend on the order of the random draws below, so do not reorder them.
set.seed(1)
# Two vectors of 1e7 Poisson counts (lambda = 0.3).
pois1 <- rpois(1e7, 0.3)
pois2 <- rpois(1e7, 0.3)

# Two vectors of 1e7 negative binomial counts (mean 2, size 0.1).
nbin1 <- rnbinom(1e7, mu = 2, size = .1)
nbin2 <- rnbinom(1e7, mu = 2, size = .1)

# Non-integer values: each negative binomial count is multiplied by a
# uniform random number drawn between 0 and 1.
x     <- nbin1 * runif(1e7)
y     <- nbin2 * runif(1e7)

# The same noisy values rounded to 2 significant digits.
sx    <- signif(x, 2)
sy    <- signif(y, 2)

DataFrame tables are coerced to data.frame objects by the duplicated.DataFrame function; their execution times are therefore similar to those of data.frame objects.

# Deduplicate (x, y) pairs with base R's unique() on a data.frame.
#
# x, y: vectors of equal length holding the coordinates of each point.
# Returns a data.frame with one row per distinct pair, in order of first
# appearance, with columns named `df.x` and `df.y`.
f.df <- function(x, y) {
  all_pairs <- data.frame(x = x, y = y)
  distinct_pairs <- unique(all_pairs)
  # Column names are spelled out so they do not depend on the name of the
  # local variable.
  data.frame(df.x = distinct_pairs$x, df.y = distinct_pairs$y)
}

# Deduplicate (x, y) pairs with dplyr::distinct() on a tibble.
#
# x, y: vectors of equal length holding the coordinates of each point.
# Returns a data.frame with one row per distinct pair and columns named
# `tbl.x` and `tbl.y`.
f.tbl <- function(x, y) {
  all_pairs <- dplyr::tibble(x = x, y = y)
  distinct_pairs <- dplyr::distinct(all_pairs)
  # Column names are spelled out so they do not depend on the name of the
  # local variable.
  data.frame(tbl.x = distinct_pairs$x, tbl.y = distinct_pairs$y)
}

# Deduplicate (x, y) pairs with unique() on a data.table.
#
# x, y: vectors of equal length holding the coordinates of each point.
# Returns a data.frame with one row per distinct pair and columns named
# `dt.x` and `dt.y`.
f.dt <- function(x, y) {
  all_pairs <- data.table::data.table(x = x, y = y)
  distinct_pairs <- unique(all_pairs)
  # Column names are spelled out so they do not depend on the name of the
  # local variable.
  data.frame(dt.x = distinct_pairs$x, dt.y = distinct_pairs$y)
}

# Deduplicate (x, y) pairs by encoding each pair as one complex number.
#
# Each point becomes a complex number whose real part is `x` and whose
# imaginary part is `y`; unique() then removes duplicates on a plain
# vector, and the two parts are split back into columns.  Only base R is
# needed.
#
# x, y: numeric vectors holding the coordinates of each point.
# Returns a data.frame with one row per distinct pair, in order of first
# appearance, with columns named `Re.u.` and `Im.u.`.
f.cplx <- function(x, y) {
  # `imaginary` is spelled out in full rather than relying on partial
  # argument matching (`im`).
  u <- unique(complex(real = x, imaginary = y))
  data.frame(Re(u), Im(u))
}

# Deduplicate (x, y) pairs encoded as complex numbers in an S4Vectors Rle.
#
# Same encoding as the plain complex-number approach (real part `x`,
# imaginary part `y`), but the complex vector is wrapped in a run-length
# encoded Rle object before unique() is called.
#
# x, y: numeric vectors holding the coordinates of each point.
# Returns a data.frame with two columns built from the real and imaginary
# parts of the distinct values.
f.cplx.Rle <- function(x, y) {
  # `imaginary` is spelled out in full rather than relying on partial
  # argument matching (`im`).
  u <- unique(S4Vectors::Rle(complex(real = x, imaginary = y)))
  data.frame(Re(u), Im(u))
}
# Benchmark on the Poisson counts (pois1, pois2); each function is
# evaluated 10 times.
microbenchmark::microbenchmark(
  f.df(pois1, pois2),
  f.dt(pois1, pois2),
  f.tbl(pois1, pois2),
  f.cplx(pois1, pois2),
  f.cplx.Rle(pois1, pois2),
  times = 10L
)

# Benchmark on the negative binomial counts (nbin1, nbin2).
microbenchmark::microbenchmark(
  f.df(nbin1, nbin2),
  f.dt(nbin1, nbin2),
  f.tbl(nbin1, nbin2),
  f.cplx(nbin1, nbin2),
  f.cplx.Rle(nbin1, nbin2),
  times = 10L
)

# Benchmark on the noisy, full-precision values (x, y).
microbenchmark::microbenchmark(
  f.df(x, y),
  f.dt(x, y),
  f.tbl(x, y),
  f.cplx(x, y),
  f.cplx.Rle(x, y),
  times = 10L
)

# Benchmark on the noisy values rounded to 2 significant digits (sx, sy).
microbenchmark::microbenchmark(
  f.df(sx, sy),
  f.dt(sx, sy),
  f.tbl(sx, sy),
  f.cplx(sx, sy),
  f.cplx.Rle(sx, sy),
  times = 10L
)


charles-plessy/CAGEr documentation built on Aug. 2, 2024, 4:35 p.m.