knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.height = 4,
  fig.align = "center",
  fig.width = 8
)

valr Functions

Many valr functions are written in Rcpp/C++ to maximize speed. Here are benchmarks for 1 million random x and y intervals.

library(valr)
library(dplyr)
library(ggplot2)
library(tibble)
library(scales)
library(GenomicRanges)
library(bench)

genome <- read_genome(valr_example('hg19.chrom.sizes.gz'))

# number of intervals
n <- 1e6
# number of timing reps
nrep <- 10

seed_x <- 1010486
x <- bed_random(genome, n = n, seed = seed_x)
seed_y <- 9283019
y <- bed_random(genome, n = n, seed = seed_y)

res <- mark(
  # randomizing functions
  bed_random(genome, n = n, seed = seed_x),
  bed_shuffle(x, genome, seed = seed_x),
  # # single tbl functions
  bed_slop(x, genome, both = 1000),
  bed_flank(x, genome, both = 1000),
  bed_shift(x, genome),
  bed_merge(x),
  bed_partition(x),
  bed_cluster(x),
  bed_complement(x, genome),
  bed_genomecov(x, genome),
  # multi tbl functions
  bed_closest(x, y),
  bed_intersect(x, y),
  bed_map(x, y, .n = length(end)),
  bed_subtract(x, y),
  bed_window(x, y, genome),
  # stats
  bed_absdist(x, y, genome),
  bed_reldist(x, y),
  bed_jaccard(x, y),
  bed_fisher(x, y, genome),
  bed_projection(x, y, genome),
  # utilities
  bed_makewindows(x, win_size = 100),
  min_time = Inf,
  iterations = nrep,
  check = FALSE,
  time_unit = 's')

# covert nanoseconds to seconds
res2<- res |>
  as_tibble() |>
  mutate(expression = as.character(expression)) |> 
  tidyr::unnest(time) |> 
  arrange(time)

# futz with the x-axis
maxs <- res2 |>
  group_by(expression) |>
  summarize(max.time = max(boxplot.stats(time)$stats))

# filter out outliers
res <- res2 |>
  left_join(maxs) |>
  filter(time <= max.time * 1.05)

ggplot(res, aes(x=reorder(expression, time), y=time)) +
  geom_boxplot(fill = 'red', outlier.shape = NA, alpha = 0.5) +
  coord_flip() +
  theme_bw() +
  labs(
    y='execution time (seconds)',
    x='',
    title="valr benchmarks",
    subtitle=paste(comma(n), "random x/y intervals,", comma(nrep), "repetitions"))

Comparison to GenomicRanges

Several functions in valr are comparable in speed to the equivalent function in GenomicRanges.

# set up test for valr vs GenomicRanges
seed_x <- 1010486
x <- bed_random(genome, n = n, seed = seed_x)
seed_y <- 9283019
y <- bed_random(genome, n = n, seed = seed_y)
seed_z <- 1234567
z <- bed_random(genome, n = n, seed = seed_z, sorted = FALSE)

# convert valr tibbles into GR objects
x_gr <- GRanges(
  seqnames = Rle(x$chrom),
  ranges = IRanges(x$start + 1, end = x$end))
y_gr <- GRanges(
  seqnames = Rle(y$chrom),
  ranges = IRanges(y$start + 1, end = y$end))
z_gr <- GRanges(
  seqnames = Rle(z$chrom),
  ranges = IRanges(z$start + 1, end = z$end))

res2 <- mark(
  # valr
  bed_flank(x, genome, both = 1000),
  bed_shift(x, genome),
  bed_merge(x),
  bed_complement(x, genome),
  bed_closest(x, y),
  bed_intersect(x, y),
  bed_subtract(x, y),
  bed_makewindows(x, win_size = 100),
  bed_sort(z),
  bed_genomecov(x, genome),
  # GRanges
  flank(x_gr, 1000),
  shift(x_gr,0),
  reduce(x_gr),
  gaps(x_gr),
  nearest(x_gr),
  intersect(x_gr, y_gr),
  setdiff(x_gr, y_gr),
  tile(x_gr, width = 100),
  sort(z_gr),
  coverage(x_gr),
  min_time = Inf,
  iterations = nrep,
  check = FALSE,
  time_unit = 's')

# label experiment pair and software
comp <- as_tibble(summary(res2)) |> 
  mutate(pair = as.numeric(row_number()) %% (nrow(res2) / 1 / 2)) |> 
  mutate(software = rep(c("valr", "GR"), each = (nrow(res2) / 1 / 2))) |>
  select(expression, pair, software) |> 
  mutate(expression = as.character(expression))

# convert time table
res2 <- res2 |>
  mutate(expression = as.character(expression)) |> 
  as_tibble() |>
  left_join(comp) |>
  tidyr::unnest(time) |> 
  arrange(time)

# futz with the x-axis
maxs2 <- res2 |>
  group_by(expression) |>
  summarize(max.time = max(boxplot.stats(time)$stats))

# filter out outliers
res2 <- res2 |>
  left_join(maxs2) |>
  filter(time <= max.time * 1.05)

#label
ggplot(res2, aes(x=reorder(expression, pair), y=time)) +
  geom_boxplot(aes(fill = software), outlier.shape = NA, alpha = 0.5) +
  coord_flip() +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  geom_vline(xintercept=seq(1,nrow(res2),2)-.5, color = 'grey', alpha = 0.23) +
  labs(
    y='execution time (seconds)',
    x='',
    title="valr vs GenomicRanges benchmarks",
    subtitle=paste(comma(n), "random x/y intervals,", comma(nrep), "repetitions"))


rnabioco/valr documentation built on April 25, 2024, 11:22 a.m.