library("hpgltools")
# Load the working copy of hpgltools from its source tree.
tt <- devtools::load_all("/data/hpgltools")

# knitr package-level and chunk-level settings for this document.
knitr::opts_knit$set(
  width = 120,
  progress = TRUE,
  verbose = TRUE,
  echo = TRUE
)
knitr::opts_chunk$set(
  error = TRUE,
  dpi = 96
)

# options() returns the previous values, which are kept in old_options.
old_options <- options(
  digits = 4,
  stringsAsFactors = FALSE,
  knitr.duplicate.label = "allow"
)
ggplot2::theme_set(ggplot2::theme_bw(base_size = 10))

# rundate is today's date as YYYYMMDD; ver and rmd_file are combined into the
# name of the save file at the end of this document.
rundate <- format(Sys.Date(), format = "%Y%m%d")
ver <- "20180822"

rmd_file <- "make_data.Rmd"

Creating AnnotationHub Metadata!

In this R markdown file, I am copying my make-data.R script into an R markdown document so that I can attempt to show, in a reproducible fashion, that the CSV and metadata I generate are valid.

I am removing the portion of my script which actually generated the data, because I do not want to wait for it to finish checking all the packages.

Loading my configuration

I am currently 'parallelizing' this script by invoking a copy of it for each eupathdb service. When each service completes, I come back and run make-data.R, which checks each data file, tries to regenerate anything which failed, and is responsible for making the final metadata. As a result, I use a single configuration file to define the version of the eupathdb to download, etc. config.R also loads my copy of the EuPathDB package. testthat is used when generating the actual data so that I can check each package.

# testthat provides the expect_*() / test_that() calls used by later chunks.
library(testthat)
# config.R is a project-local file; according to the surrounding text it
# defines the EuPathDB version to download and loads the EuPathDB package.
source("config.R")
# Name of the web service passed to download_eupathdb_metadata().
webservice <- "eupathdb"

# Display the versions in use for this run (presumably set by config.R --
# they are not assigned anywhere in this document).
eupathdb_version
bioc_version

Download the metadata from the various eupathdb web services.

In this block I copy/paste the metadata downloader.

# Query the web service for its metadata and keep only the "valid" element
# of the result.
meta <- download_eupathdb_metadata(
  bioc_version = bioc_version,
  build_dir = build_dir,
  overwrite = TRUE,
  webservice = webservice,
  verbose = TRUE,
  eupathdb_version = eupathdb_version,
  write_csv = TRUE
)
all_metadata <- meta[["valid"]]
write_eupathdb_metadata(
  all_metadata,
  bioc_version = bioc_version,
  eupathdb_version = eupathdb_version
)
# Number of metadata rows; used as the upper bound of the package loop.
end <- nrow(all_metadata)

summary(all_metadata)
end

The eupathdb apparently provided 376 entries; for each of them, my script will then attempt to make the various packages.

The package creation body

The following block will not be run, but is what actually does the work of making the various rda/sqlite databases.

# Build every requested package type for each row of the metadata.
#
# Reads all_metadata, end, build_dir, eupathdb_version and the logical
# switches bsgenome / orgdb / txdb / organismdb, none of which are assigned in
# this chunk (presumably they come from config.R -- TODO confirm).
# Writes into results, a nested list keyed by package type and then species.
# NOTE(review): results is not initialised in this chunk; confirm that
# config.R creates it before this loop runs.
start <- 1
# start:end counts downwards when end < start (1:0 is c(1, 0)), which would
# index rows that do not exist when the metadata is empty.  Filtering
# seq_len(end) yields an empty sequence in that case instead.
for (it in seq_len(end)[seq_len(end) >= start]) {
  entry <- all_metadata[it, ]
  species <- entry[["Species"]]
  info("Starting generation of ", species, ", which is ", it, " of ", end, " species.")
  pkgnames <- get_eupathdb_pkgnames(entry)

  if (isTRUE(bsgenome)) {
    bsgenome_result <- make_eupathdb_bsgenome(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    # The result is expected to be a list with the single name "bsgenome_name".
    expected <- "bsgenome_name"
    actual <- names(bsgenome_result)
    testthat::test_that("Does make_eupathdb_bsgenome return something sensible?", {
      expect_equal(expected, actual)
    })
    results[["bsgenome"]][[species]] <- bsgenome_result
  }

  if (isTRUE(orgdb)) {
    orgdb_result <- make_eupathdb_orgdb(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    if (is.null(orgdb_result)) {
      warn("There is insufficient data for ", species, " to make the OrgDB.")
    } else {
      # The generated package name must match the name derived from the metadata.
      actual <- orgdb_result[["orgdb_name"]]
      expected <- pkgnames[["orgdb"]]
      test_that("Does make_eupathdb_orgdb return something sensible?", {
        expect_equal(expected, actual)
      })
      results[["orgdb"]][[species]] <- orgdb_result
    }
  }

  if (isTRUE(txdb)) {
    txdb_result <- make_eupathdb_txdb(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    if (is.null(txdb_result)) {
      warn("Unable to create the txdb package: ", entry$TxdbFile)
    } else {
      actual <- txdb_result[["txdb_name"]]
      expected <- pkgnames[["txdb"]]
      test_that("Does make_eupathdb_txdb return something sensible?", {
        expect_equal(expected, actual)
      })
      # Only the package name is recorded for TxDb results.
      results[["txdb"]][[species]] <- txdb_result[["txdb_name"]]
    }
  }

  if (isTRUE(organismdb)) {
    organ_result <- make_eupathdb_organismdbi(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    actual <- organ_result[["organismdb_name"]]
    expected <- pkgnames[["organismdbi"]]
    test_that("Does make_eupathdb_organismdbi return something sensible?", {
      expect_equal(expected, actual)
    })
    # Store per species, like the other package types; assigning straight to
    # results[["organismdbi"]] would keep only the last species processed.
    results[["organismdbi"]][[species]] <- organ_result
  }
} ## End iterating over every entry in the eupathdb metadata.

The final checks

Once the data has been generated (or not), the final task is to check the metadata and the generated files. The following block accomplishes this:

The variables bsgenome and organismdb are, of course, FALSE in this invocation.

Check the orgdb metadata

One might wonder why I do a file.copy() here. I actually run all this stuff in a separate tree in order to keep it separate from my EuPathDB codebase, mostly because I need to run it on our computing cluster where there is a scratch disk of sufficient size for these files.

# Check the OrgDb CSV and files, then build and save the AnnotationHub
# metadata objects for them.
if (isTRUE(orgdb)) {
  org_csv <- check_csv(build_dir, file_type = "OrgDb", bioc_version = bioc_version,
                       eupathdb_version = eupathdb_version)
  org_files <- check_files("OrgDb", bioc_version = bioc_version,
                           eupathdb_version = eupathdb_version)
  # makeAnnotationHubMetadata() is handed the EuPathDB package path plus the
  # CSV name, so the CSV is first copied into that package's inst/extdata.
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", org_csv)
  copied <- file.copy(org_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  org_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), org_csv)
  save(list = c("org_checked"), file = file.path(build_dir, "orgdb_metadata.rda"))
  # Inside a braced block only the final expression's value is auto-printed,
  # so the class must be printed explicitly to show up in the output.
  print(class(org_checked))
  summary(org_checked)
}

Check the txdb metadata

# Check the TxDb CSV and files, then build and save the AnnotationHub
# metadata objects for them.
if (isTRUE(txdb)) {
  txdb_csv <- check_csv(build_dir, file_type = "TxDb", bioc_version = bioc_version,
                        eupathdb_version = eupathdb_version)
  tx_files <- check_files("TxDb", bioc_version = bioc_version,
                          eupathdb_version = eupathdb_version)
  # makeAnnotationHubMetadata() is handed the EuPathDB package path plus the
  # CSV name, so the CSV is first copied into that package's inst/extdata.
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", txdb_csv)
  copied <- file.copy(txdb_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  tx_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), txdb_csv)
  save(list = c("tx_checked"), file = file.path(build_dir, "txdb_metadata.rda"))
  # Inside a braced block only the final expression's value is auto-printed,
  # so the class must be printed explicitly to show up in the output.
  print(class(tx_checked))
  summary(tx_checked)
}

Check the granges

# Check the GRanges CSV and files, then build and save the AnnotationHub
# metadata objects for them.
if (isTRUE(granges)) {
  grange_csv <- check_csv(build_dir, file_type = "GRanges", bioc_version = bioc_version,
                          eupathdb_version = eupathdb_version)
  grange_files <- check_files("GRanges", bioc_version = bioc_version,
                              eupathdb_version = eupathdb_version)
  # makeAnnotationHubMetadata() is handed the EuPathDB package path plus the
  # CSV name, so the CSV is first copied into that package's inst/extdata.
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", grange_csv)
  copied <- file.copy(grange_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  grange_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), grange_csv)
  save(list = c("grange_checked"), file = file.path(build_dir, "granges_metadata.rda"))
  # Inside a braced block only the final expression's value is auto-printed,
  # so the class must be printed explicitly to show up in the output.
  print(class(grange_checked))
  summary(grange_checked)
}

Full script

For full disclosure, here is the full text of my script. In the above, I am just invoking the relevant portions for this discussion.

#!/usr/bin/env Rscript
# Stand-alone script: load the configuration, download the web service's
# metadata, and write out the valid entries.
library(testthat)
source("config.R")
webservice <- "eupathdb"

# NOTE(review): the earlier copy of this call in this document also passes
# build_dir=build_dir; this one does not -- confirm which is intended.
meta <- download_eupathdb_metadata(
  bioc_version = bioc_version,
  overwrite = TRUE,
  webservice = webservice,
  verbose = TRUE,
  eupathdb_version = eupathdb_version,
  write_csv = TRUE
)
all_metadata <- meta[["valid"]]
write_eupathdb_metadata(
  all_metadata,
  bioc_version = bioc_version,
  eupathdb_version = eupathdb_version
)
# Number of metadata rows; upper bound of the package loop that follows.
end <- nrow(all_metadata)

# Build every requested package type for each row of the metadata.
#
# Reads all_metadata, end, build_dir, eupathdb_version and the logical
# switches bsgenome / orgdb / txdb / organismdb, which are not assigned in
# this script (presumably they come from config.R -- TODO confirm).
# Writes into results, a nested list keyed by package type and then species.
# NOTE(review): results is never initialised in this script; confirm that
# config.R creates it.
start <- 1
# start:end counts downwards when end < start (1:0 is c(1, 0)), which would
# index rows that do not exist when the metadata is empty.  Filtering
# seq_len(end) yields an empty sequence in that case instead.
for (it in seq_len(end)[seq_len(end) >= start]) {
  entry <- all_metadata[it, ]
  species <- entry[["Species"]]
  info("Starting generation of ", species, ", which is ", it, " of ", end, " species.")
  pkgnames <- get_eupathdb_pkgnames(entry)

  if (isTRUE(bsgenome)) {
    bsgenome_result <- make_eupathdb_bsgenome(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    # The result is expected to be a list with the single name "bsgenome_name".
    expected <- "bsgenome_name"
    actual <- names(bsgenome_result)
    testthat::test_that("Does make_eupathdb_bsgenome return something sensible?", {
      expect_equal(expected, actual)
    })
    results[["bsgenome"]][[species]] <- bsgenome_result
  }

  if (isTRUE(orgdb)) {
    orgdb_result <- make_eupathdb_orgdb(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    if (is.null(orgdb_result)) {
      warn("There is insufficient data for ", species, " to make the OrgDB.")
    } else {
      # The generated package name must match the name derived from the metadata.
      actual <- orgdb_result[["orgdb_name"]]
      expected <- pkgnames[["orgdb"]]
      test_that("Does make_eupathdb_orgdb return something sensible?", {
        expect_equal(expected, actual)
      })
      results[["orgdb"]][[species]] <- orgdb_result
    }
  }

  if (isTRUE(txdb)) {
    txdb_result <- make_eupathdb_txdb(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    if (is.null(txdb_result)) {
      warn("Unable to create the txdb package: ", entry$TxdbFile)
    } else {
      actual <- txdb_result[["txdb_name"]]
      expected <- pkgnames[["txdb"]]
      test_that("Does make_eupathdb_txdb return something sensible?", {
        expect_equal(expected, actual)
      })
      # Only the package name is recorded for TxDb results.
      results[["txdb"]][[species]] <- txdb_result[["txdb_name"]]
    }
  }

  if (isTRUE(organismdb)) {
    organ_result <- make_eupathdb_organismdbi(
      entry, workdir = build_dir, eupathdb_version = eupathdb_version, copy_s3 = TRUE)
    actual <- organ_result[["organismdb_name"]]
    expected <- pkgnames[["organismdbi"]]
    test_that("Does make_eupathdb_organismdbi return something sensible?", {
      expect_equal(expected, actual)
    })
    # Store per species, like the other package types; assigning straight to
    # results[["organismdbi"]] would keep only the last species processed.
    results[["organismdbi"]][[species]] <- organ_result
  }
} ## End iterating over every entry in the eupathdb metadata.

# Final checks: for every enabled package type, check the metadata CSV and
# the generated files, copy the CSV into the EuPathDB package's inst/extdata,
# build the AnnotationHub metadata from it, save that to build_dir, and show
# the first metadata entry.
#
# Each file.copy() uses overwrite = TRUE and its result is verified: without
# that, an existing (stale) CSV in inst/extdata would be left in place and
# validated silently.
if (isTRUE(bsgenome)) {
  bs_csv <- check_csv(build_dir, file_type = "BSgenome", bioc_version = bioc_version,
                      eupathdb_version = eupathdb_version)
  bs_files <- check_files("BSgenome", bioc_version = bioc_version,
                          eupathdb_version = eupathdb_version)
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", bs_csv)
  copied <- file.copy(bs_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  bs_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), bs_csv)
  save(list = c("bs_checked"), file = file.path(build_dir, "bsgenome_metadata.rda"))
  bs_checked[[1]]
}
if (isTRUE(orgdb)) {
  org_csv <- check_csv(build_dir, file_type = "OrgDb", bioc_version = bioc_version,
                       eupathdb_version = eupathdb_version)
  org_files <- check_files("OrgDb", bioc_version = bioc_version,
                           eupathdb_version = eupathdb_version)
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", org_csv)
  copied <- file.copy(org_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  # Uses org_csv, the variable assigned above; the previous orgdb_csv was
  # never defined and made this branch fail with "object not found".
  org_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), org_csv)
  save(list = c("org_checked"), file = file.path(build_dir, "orgdb_metadata.rda"))
  org_checked[[1]]
}
if (isTRUE(txdb)) {
  txdb_csv <- check_csv(build_dir, file_type = "TxDb", bioc_version = bioc_version,
                        eupathdb_version = eupathdb_version)
  tx_files <- check_files("TxDb", bioc_version = bioc_version,
                          eupathdb_version = eupathdb_version)
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", txdb_csv)
  copied <- file.copy(txdb_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  tx_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), txdb_csv)
  save(list = c("tx_checked"), file = file.path(build_dir, "txdb_metadata.rda"))
  tx_checked[[1]]
}
if (isTRUE(organismdb)) {
  organ_csv <- check_csv(build_dir, file_type = "OrganismDbi", bioc_version = bioc_version,
                         eupathdb_version = eupathdb_version)
  organ_files <- check_files("OrganismDbi", bioc_version = bioc_version,
                             eupathdb_version = eupathdb_version)
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", organ_csv)
  copied <- file.copy(organ_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  organ_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), organ_csv)
  save(list = c("organ_checked"), file = file.path(build_dir, "organismdbi_metadata.rda"))
  organ_checked[[1]]
}
if (isTRUE(granges)) {
  grange_csv <- check_csv(build_dir, file_type = "GRanges", bioc_version = bioc_version,
                          eupathdb_version = eupathdb_version)
  grange_files <- check_files("GRanges", bioc_version = bioc_version,
                              eupathdb_version = eupathdb_version)
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", grange_csv)
  copied <- file.copy(grange_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  grange_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"), grange_csv)
  save(list = c("grange_checked"), file = file.path(build_dir, "granges_metadata.rda"))
  grange_checked[[1]]
}

Test that the ah objects exist and are loadable in R.

# When GRanges handling is enabled, query for the GRanges resources.
# NOTE(review): query_s3_ah() is not defined in this document; presumably it
# looks up the uploaded AnnotationHub objects -- confirm.
if (isTRUE(granges)) {
  grange_ah <- query_s3_ah(file_type="GRanges")
}
# Record the R session details in the rendered output.
pander::pander(sessionInfo())
info("This is hpgltools commit: ", get_git_commit())
# Save-file name: rmd_file with ".Rmd" removed, then "-v", ver and ".rda.xz".
this_save <- paste0(gsub("\\.Rmd", "", rmd_file), "-v", ver, ".rda.xz")
info("Saving to ", this_save)
# sm() and saveme() are not defined here (presumably hpgltools helpers that
# silence output and save the session, respectively -- TODO confirm).
tmp <- sm(saveme(filename=this_save))


khughitt/EuPathDB documentation built on Nov. 4, 2023, 4:19 a.m.