library(magrittr)

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = TRUE)

Aim: Check the speed of different tools for loading coverage.

Background

There are multiple tools that are able to obtain coverage from BigWig/BAM files. Specifically, here we test the tools megadepth, rtracklayer and pyBigWig. Some operations requiring coverage can be very computationally intensive - for example, the dasper R package requires loading coverage across 3 regions per junction per sample. Here, we aim to find the fastest tool for loading coverage from BigWig files by comparing the speed of the megadepth vs rtracklayer vs pyBigwig.

Input data

First off, we obtain paths to the annotation ranges and BigWig files to load coverage from. For these, we have used the examples hosted on https://github.com/ChristopherWilks/megadepth.

Annotation co-ordinates

# extract only the required ranges 
annotation <- dasper::junctions_example %>% 
  SummarizedExperiment::rowRanges() 

# keep only ranges on chr 21
GenomeInfoDb::seqlevels(annotation, pruning.mode = "coarse") <- "21"

# add chr to ranges to match bw
GenomeInfoDb::seqlevels(annotation) <- paste0("chr", GenomeInfoDb::seqlevels(annotation))

# take the first 1000 ranges for testing
annotation <- annotation[1:5000]

annotation_path <- file.path(tempdir(), "annotation_example.txt")

rtracklayer::export(annotation, 
                    con = annotation_path, 
                    format = "BED")

BigWig files

url <- recount::download_study(project = "SRP012682", 
                               type = "samples", 
                               download = FALSE)

set.seed(32)

# select 1 random sample
bw_remote <- sample(url, 1)

bw_local <- file.path(tempdir(), "example_gtex_bw.bw")
download.file(bw_remote, bw_local)

Are coverage values consistent between methods?

# get base-wise coverage as numeric list for summarisation after
rt_cov <- rtracklayer::import.bw(con = bw_local, 
                                 which = annotation, 
                                 as = "NumericList")

md_cov <- megadepth::get_coverage(bigwig_file = bw_local, 
                                  annotation = annotation_path, 
                                  op = "mean")

# round here to match 2 digits returned by megadepth
try(testthat::expect_equivalent(rt_cov %>% BiocGenerics::mean() %>% round(2), 
                                md_cov$cov))

Measure runtime for loading coverage

rtracklayer & megadepth

n_iters <- 10

benchmark_rt_md <- function(bw, times = 10){

  runtime <- microbenchmark::microbenchmark(
    "rt"= { 
      rtracklayer::import.bw(con = bw, 
                             which = annotation, 
                             as = "NumericList")
      }, 
    "md" = {
      megadepth::get_coverage(bigwig_file = bw, 
                              annotation = annotation_path, 
                              op = "mean")
    }, 
    times = times)

  runtime <- runtime %>% 
    dplyr::mutate(time = time / 1e9) %>% # convert time to secs
    dplyr::rename(tool = expr, run_time = time)

  return(runtime)

}

runtime_rt_md <- rbind(benchmark_rt_md(bw = bw_local, times = n_iters) %>% 
                         dplyr::mutate(local_remote = "local"), 
                       benchmark_rt_md(bw = bw_remote, times = n_iters) %>% 
                         dplyr::mutate(local_remote = "remote"))

pyBigWig {.tabset}

benchmark_pybw <- function(bw, times = 10){

  out_path <- paste0(tempdir(), "/pyBigWig_run_time.txt")

  system2("python3", 
          args = c(here::here("analysis", "test_pyBigWig.py"),
                   bw, 
                   annotation_path, 
                   out_path, 
                   times)
  )

  runtime <- readr::read_delim(out_path, delim = "\t")

  return(runtime)

}

runtime_pybw <- rbind(benchmark_pybw(bw = bw_local, times = n_iters) %>% 
                         dplyr::mutate(local_remote = "local"), 
                      benchmark_pybw(bw = bw_remote, times = n_iters) %>% 
                         dplyr::mutate(local_remote = "remote"))

Compare speeds

comp_time <- runtime_rt_md %>% 
  as.data.frame() %>% 
  rbind(runtime_pybw) %>% 
  dplyr::mutate(tool = tool %>% factor(c("md", "rt", "pyBigwig")))

comp_time_plot <- 
  ggplot2::ggplot(comp_time, ggplot2::aes(x = tool, 
                                          y = run_time, 
                                          colour = local_remote)) + 
  ggplot2::geom_boxplot() +  
  ggplot2::geom_jitter() + 
  ggplot2::scale_x_discrete(name = "Tool", 
                            labels = c("megadepth", 
                                       "rtracklayer", 
                                       "pyBigWig")) + 
  ggplot2::scale_y_continuous(name = paste0("Run time for ", length(annotation), " regions (seconds)")) + 
  ggplot2::scale_colour_manual(values = ggpubr::get_palette("npg", 2), 
                               guide = "none") + 
  ggplot2::facet_wrap(~ local_remote, 
                      scales = "free") + 
  ggplot2::theme_bw(base_size = 20) 

comp_time_plot %>% 
  ggplot2::ggsave(filename = "md_rt_pybw_runtime.png", 
                  plot = ., 
                  path = here::here("analysis"), 
                  dpi = 300, 
                  width = 10, height = 5)

Reproducibility

## Session info
library("sessioninfo")
options(width = 120)
session_info()


LieberInstitute/megadepth documentation built on May 6, 2024, 7:12 p.m.