knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.height = 4, fig.align = "center", fig.width = 8 )
Many valr functions are written in Rcpp/C++ to maximize speed. Here are benchmarks for 1 million random x
and y
intervals.
library(valr) library(dplyr) library(ggplot2) library(tibble) library(scales) library(GenomicRanges) library(bench) genome <- read_genome(valr_example('hg19.chrom.sizes.gz')) # number of intervals n <- 1e6 # number of timing reps nrep <- 10 seed_x <- 1010486 x <- bed_random(genome, n = n, seed = seed_x) seed_y <- 9283019 y <- bed_random(genome, n = n, seed = seed_y) res <- mark( # randomizing functions bed_random(genome, n = n, seed = seed_x), bed_shuffle(x, genome, seed = seed_x), # # single tbl functions bed_slop(x, genome, both = 1000), bed_flank(x, genome, both = 1000), bed_shift(x, genome), bed_merge(x), bed_partition(x), bed_cluster(x), bed_complement(x, genome), bed_genomecov(x, genome), # multi tbl functions bed_closest(x, y), bed_intersect(x, y), bed_map(x, y, .n = length(end)), bed_subtract(x, y), bed_window(x, y, genome), # stats bed_absdist(x, y, genome), bed_reldist(x, y), bed_jaccard(x, y), bed_fisher(x, y, genome), bed_projection(x, y, genome), # utilities bed_makewindows(x, win_size = 100), min_time = Inf, iterations = nrep, check = FALSE, time_unit = 's') # covert nanoseconds to seconds res2<- res |> as_tibble() |> mutate(expression = as.character(expression)) |> tidyr::unnest(time) |> arrange(time) # futz with the x-axis maxs <- res2 |> group_by(expression) |> summarize(max.time = max(boxplot.stats(time)$stats)) # filter out outliers res <- res2 |> left_join(maxs) |> filter(time <= max.time * 1.05) ggplot(res, aes(x=reorder(expression, time), y=time)) + geom_boxplot(fill = 'red', outlier.shape = NA, alpha = 0.5) + coord_flip() + theme_bw() + labs( y='execution time (seconds)', x='', title="valr benchmarks", subtitle=paste(comma(n), "random x/y intervals,", comma(nrep), "repetitions"))
Several functions in valr are comparable in speed to the equivalent function in GenomicRanges.
# set up test for valr vs GenomicRanges seed_x <- 1010486 x <- bed_random(genome, n = n, seed = seed_x) seed_y <- 9283019 y <- bed_random(genome, n = n, seed = seed_y) seed_z <- 1234567 z <- bed_random(genome, n = n, seed = seed_z, sorted = FALSE) # convert valr tibbles into GR objects x_gr <- GRanges( seqnames = Rle(x$chrom), ranges = IRanges(x$start + 1, end = x$end)) y_gr <- GRanges( seqnames = Rle(y$chrom), ranges = IRanges(y$start + 1, end = y$end)) z_gr <- GRanges( seqnames = Rle(z$chrom), ranges = IRanges(z$start + 1, end = z$end)) res2 <- mark( # valr bed_flank(x, genome, both = 1000), bed_shift(x, genome), bed_merge(x), bed_complement(x, genome), bed_closest(x, y), bed_intersect(x, y), bed_subtract(x, y), bed_makewindows(x, win_size = 100), bed_sort(z), bed_genomecov(x, genome), # GRanges flank(x_gr, 1000), shift(x_gr,0), reduce(x_gr), gaps(x_gr), nearest(x_gr), intersect(x_gr, y_gr), setdiff(x_gr, y_gr), tile(x_gr, width = 100), sort(z_gr), coverage(x_gr), min_time = Inf, iterations = nrep, check = FALSE, time_unit = 's') # label experiment pair and software comp <- as_tibble(summary(res2)) |> mutate(pair = as.numeric(row_number()) %% (nrow(res2) / 1 / 2)) |> mutate(software = rep(c("valr", "GR"), each = (nrow(res2) / 1 / 2))) |> select(expression, pair, software) |> mutate(expression = as.character(expression)) # convert time table res2 <- res2 |> mutate(expression = as.character(expression)) |> as_tibble() |> left_join(comp) |> tidyr::unnest(time) |> arrange(time) # futz with the x-axis maxs2 <- res2 |> group_by(expression) |> summarize(max.time = max(boxplot.stats(time)$stats)) # filter out outliers res2 <- res2 |> left_join(maxs2) |> filter(time <= max.time * 1.05) #label ggplot(res2, aes(x=reorder(expression, pair), y=time)) + geom_boxplot(aes(fill = software), outlier.shape = NA, alpha = 0.5) + coord_flip() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + geom_vline(xintercept=seq(1,nrow(res2),2)-.5, color = 'grey', alpha = 0.23) + labs( y='execution time (seconds)', x='', title="valr vs GenomicRanges benchmarks", subtitle=paste(comma(n), "random x/y intervals,", comma(nrep), "repetitions"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.