# Document setup: attach hpgltools, then load the tree at /data/hpgltools via
# devtools, and configure knitr for this report.
library("hpgltools")
tt <- devtools::load_all("/data/hpgltools")

knitr::opts_knit$set(
  width = 120,
  progress = TRUE,
  verbose = TRUE,
  echo = TRUE
)
knitr::opts_chunk$set(error = TRUE, dpi = 96)

# options() returns the previous values, which are kept in old_options.
old_options <- options(
  digits = 4,
  stringsAsFactors = FALSE,
  knitr.duplicate.label = "allow"
)
ggplot2::theme_set(ggplot2::theme_bw(base_size = 10))

# Bookkeeping values; rmd_file and ver are combined into the save-file name at
# the end of the document.
rundate <- format(Sys.Date(), format = "%Y%m%d")
ver <- "20180822"
rmd_file <- "make_data.Rmd"
In this R Markdown file, I am copying my make-data.R script into an R Markdown document so that I can attempt to prove, in a reproducible fashion, that the CSV and metadata I generate are valid.
I am removing the portion of my script which actually generated the data, because I do not want to wait for it to finish checking all the packages.
I am currently 'parallelizing' this script by invoking a copy of it for each eupathdb service. When each service completes, I come back and run make-data.R, which checks each data file, tries to regenerate anything which failed, and is responsible for making the final metadata. As a result, I use a single configuration file to define the version of the eupathdb to download, etc. config.R also loads my copy of the EuPathDB package. testthat is used when generating the actual data so that I can check each package.
# testthat provides the expectations used when checking each package.
library(testthat)

# NOTE(review): config.R is presumably where eupathdb_version, bioc_version,
# build_dir and the per-package-type switches come from -- confirm.
source("config.R")
webservice <- "eupathdb"

# Echo the versions this run is targeting.
eupathdb_version
bioc_version
In this block I copy/paste the metadata downloader.
# Download the metadata for the selected webservice.
meta <- download_eupathdb_metadata(
  bioc_version = bioc_version,
  build_dir = build_dir,
  overwrite = TRUE,
  webservice = webservice,
  verbose = TRUE,
  eupathdb_version = eupathdb_version,
  write_csv = TRUE
)

# Only the "valid" element of the result is carried forward.
all_metadata <- meta[["valid"]]
write_eupathdb_metadata(
  all_metadata,
  bioc_version = bioc_version,
  eupathdb_version = eupathdb_version
)

# `end` is the number of entries; it is the upper bound of the build loop.
end <- nrow(all_metadata)
summary(all_metadata)
end
The EuPathDB apparently provided 376 entries, and my script will attempt to build the various packages for each of them.
The following block will not be run, but is what actually does the work of making the various rda/sqlite databases.
# Build the requested package types for every metadata entry and collect the
# per-species results in `results`.
# NOTE(review): the bsgenome/orgdb/txdb/organismdb switches, build_dir and the
# info()/warn() helpers are not defined here; presumably config.R supplies them.

# Create the accumulator only if nothing upstream already provided one, so the
# nested assignments below cannot fail on a missing object.
if (!exists("results", mode = "list")) {
  results <- list()
}

start <- 1
# start:end would count backwards (1, 0) for an empty metadata table, so build
# the index vector explicitly.
entry_rows <- if (end >= start) seq(from = start, to = end) else integer(0)

for (it in entry_rows) {
  entry <- all_metadata[it, ]
  species <- entry[["Species"]]
  info("Starting generation of ", species, ", which is ", it, " of ", end, " species.")
  pkgnames <- get_eupathdb_pkgnames(entry)

  if (isTRUE(bsgenome)) {
    bsgenome_result <- make_eupathdb_bsgenome(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    # The result is expected to be a single element named "bsgenome_name".
    expected <- "bsgenome_name"
    actual <- names(bsgenome_result)
    testthat::test_that("Does make_eupathdb_bsgenome return something sensible?", {
      expect_equal(expected, actual)
    })
    results[["bsgenome"]][[species]] <- bsgenome_result
  }

  if (isTRUE(orgdb)) {
    orgdb_result <- make_eupathdb_orgdb(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    if (is.null(orgdb_result)) {
      warn("There is insufficient data for ", species, " to make the OrgDB.")
    } else {
      actual <- orgdb_result[["orgdb_name"]]
      expected <- pkgnames[["orgdb"]]
      test_that("Does make_eupathdb_orgdb return something sensible?", {
        expect_equal(expected, actual)
      })
      results[["orgdb"]][[species]] <- orgdb_result
    }
  }

  if (isTRUE(txdb)) {
    txdb_result <- make_eupathdb_txdb(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    if (is.null(txdb_result)) {
      warn("Unable to create the txdb package: ", entry$TxdbFile)
    } else {
      actual <- txdb_result[["txdb_name"]]
      expected <- pkgnames[["txdb"]]
      test_that("Does make_eupathdb_txdb return something sensible?", {
        expect_equal(expected, actual)
      })
      results[["txdb"]][[species]] <- txdb_result[["txdb_name"]]
    }
  }

  if (isTRUE(organismdb)) {
    organ_result <- make_eupathdb_organismdbi(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    actual <- organ_result[["organismdb_name"]]
    expected <- pkgnames[["organismdbi"]]
    test_that("Does make_eupathdb_organismdbi return something sensible?", {
      expect_equal(expected, actual)
    })
    # Store per species, like the other package types; assigning to the whole
    # "organismdbi" slot kept only the last species processed.
    results[["organismdbi"]][[species]] <- organ_result
  }
} ## End iterating over every entry in the eupathdb metadata.
Once the data has been generated (or not), the final task is to check the metadata and the generated files. The following block accomplishes this:
The variables bsgenome and organismdb are, of course, FALSE in this invocation.
One might wonder why I do a file.copy() here. I actually run all of this in a separate tree in order to keep it apart from my EuPathDB codebase, mostly because I need to run it on our computing cluster, where there is a scratch disk of sufficient size for these files.
# Validate the OrgDb metadata CSV and files, copy the CSV into the EuPathDB
# package tree, and have AnnotationHubData parse it.
if (isTRUE(orgdb)) {
  org_csv <- check_csv(
    build_dir,
    file_type = "OrgDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  org_files <- check_files(
    "OrgDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )

  # makeAnnotationHubMetadata() is pointed at the package directory, so the
  # CSV is copied under its inst/extdata first.
  csv_copy_path <- file.path(
    path.package("EuPathDB"), "inst", "extdata", org_csv
  )
  copied <- file.copy(org_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)

  org_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    org_csv
  )
  save(org_checked, file = file.path(build_dir, "orgdb_metadata.rda"))
  class(org_checked)
  summary(org_checked)
}
# Validate the TxDb metadata CSV and files, copy the CSV into the EuPathDB
# package tree, and have AnnotationHubData parse it.
if (isTRUE(txdb)) {
  txdb_csv <- check_csv(
    build_dir,
    file_type = "TxDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  tx_files <- check_files(
    "TxDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )

  # makeAnnotationHubMetadata() is pointed at the package directory, so the
  # CSV is copied under its inst/extdata first.
  csv_copy_path <- file.path(
    path.package("EuPathDB"), "inst", "extdata", txdb_csv
  )
  copied <- file.copy(txdb_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)

  tx_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    txdb_csv
  )
  save(tx_checked, file = file.path(build_dir, "txdb_metadata.rda"))
  class(tx_checked)
  summary(tx_checked)
}
# Validate the GRanges metadata CSV and files, copy the CSV into the EuPathDB
# package tree, and have AnnotationHubData parse it.
if (isTRUE(granges)) {
  grange_csv <- check_csv(
    build_dir,
    file_type = "GRanges",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  grange_files <- check_files(
    "GRanges",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )

  # makeAnnotationHubMetadata() is pointed at the package directory, so the
  # CSV is copied under its inst/extdata first.
  csv_copy_path <- file.path(
    path.package("EuPathDB"), "inst", "extdata", grange_csv
  )
  copied <- file.copy(grange_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)

  grange_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    grange_csv
  )
  save(grange_checked, file = file.path(build_dir, "granges_metadata.rda"))
  class(grange_checked)
  summary(grange_checked)
}
For full disclosure, here is the full text of my script. In the above, I am just invoking the relevant portions for this discussion.
#!/usr/bin/env Rscript
# make-data.R: download the EuPathDB metadata, build the requested package
# types for every entry, then validate and register the resulting metadata.
# NOTE(review): the bsgenome/orgdb/txdb/organismdb/granges switches, build_dir,
# the version variables and info()/warn() are presumably provided by config.R.
library(testthat)
source("config.R")
webservice <- "eupathdb"

# NOTE(review): unlike the equivalent call earlier in this document, build_dir
# is not passed here -- confirm the default matches the directory checked below.
meta <- download_eupathdb_metadata(
  bioc_version = bioc_version,
  overwrite = TRUE,
  webservice = webservice,
  verbose = TRUE,
  eupathdb_version = eupathdb_version,
  write_csv = TRUE
)
all_metadata <- meta[["valid"]]
write_eupathdb_metadata(
  all_metadata,
  bioc_version = bioc_version,
  eupathdb_version = eupathdb_version
)
end <- nrow(all_metadata)

# Create the accumulator only if nothing upstream already provided one, so the
# nested assignments in the loop cannot fail on a missing object.
if (!exists("results", mode = "list")) {
  results <- list()
}

start <- 1
# start:end would count backwards (1, 0) for an empty metadata table, so build
# the index vector explicitly.
entry_rows <- if (end >= start) seq(from = start, to = end) else integer(0)

for (it in entry_rows) {
  entry <- all_metadata[it, ]
  species <- entry[["Species"]]
  info("Starting generation of ", species, ", which is ", it, " of ", end, " species.")
  pkgnames <- get_eupathdb_pkgnames(entry)

  if (isTRUE(bsgenome)) {
    bsgenome_result <- make_eupathdb_bsgenome(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    # The result is expected to be a single element named "bsgenome_name".
    expected <- "bsgenome_name"
    actual <- names(bsgenome_result)
    testthat::test_that("Does make_eupathdb_bsgenome return something sensible?", {
      expect_equal(expected, actual)
    })
    results[["bsgenome"]][[species]] <- bsgenome_result
  }

  if (isTRUE(orgdb)) {
    orgdb_result <- make_eupathdb_orgdb(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    if (is.null(orgdb_result)) {
      warn("There is insufficient data for ", species, " to make the OrgDB.")
    } else {
      actual <- orgdb_result[["orgdb_name"]]
      expected <- pkgnames[["orgdb"]]
      test_that("Does make_eupathdb_orgdb return something sensible?", {
        expect_equal(expected, actual)
      })
      results[["orgdb"]][[species]] <- orgdb_result
    }
  }

  if (isTRUE(txdb)) {
    txdb_result <- make_eupathdb_txdb(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    if (is.null(txdb_result)) {
      warn("Unable to create the txdb package: ", entry$TxdbFile)
    } else {
      actual <- txdb_result[["txdb_name"]]
      expected <- pkgnames[["txdb"]]
      test_that("Does make_eupathdb_txdb return something sensible?", {
        expect_equal(expected, actual)
      })
      results[["txdb"]][[species]] <- txdb_result[["txdb_name"]]
    }
  }

  if (isTRUE(organismdb)) {
    organ_result <- make_eupathdb_organismdbi(
      entry,
      workdir = build_dir,
      eupathdb_version = eupathdb_version,
      copy_s3 = TRUE
    )
    actual <- organ_result[["organismdb_name"]]
    expected <- pkgnames[["organismdbi"]]
    test_that("Does make_eupathdb_organismdbi return something sensible?", {
      expect_equal(expected, actual)
    })
    # Store per species, like the other package types; assigning to the whole
    # "organismdbi" slot kept only the last species processed.
    results[["organismdbi"]][[species]] <- organ_result
  }
} ## End iterating over every entry in the eupathdb metadata.

# In each section below the CSV is copied with overwrite=TRUE and the copy is
# asserted: file.copy() returns FALSE without overwriting when the target
# exists, which would otherwise leave a stale CSV to be validated.

if (isTRUE(bsgenome)) {
  bs_csv <- check_csv(
    build_dir,
    file_type = "BSgenome",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  bs_files <- check_files(
    "BSgenome",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", bs_csv)
  copied <- file.copy(bs_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  bs_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    bs_csv
  )
  save(list = c("bs_checked"), file = file.path(build_dir, "bsgenome_metadata.rda"))
  bs_checked[[1]]
}

if (isTRUE(orgdb)) {
  org_csv <- check_csv(
    build_dir,
    file_type = "OrgDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  org_files <- check_files(
    "OrgDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", org_csv)
  copied <- file.copy(org_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  # Pass org_csv: the previous argument, orgdb_csv, is never defined.
  org_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    org_csv
  )
  save(list = c("org_checked"), file = file.path(build_dir, "orgdb_metadata.rda"))
  org_checked[[1]]
}

if (isTRUE(txdb)) {
  txdb_csv <- check_csv(
    build_dir,
    file_type = "TxDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  tx_files <- check_files(
    "TxDb",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", txdb_csv)
  copied <- file.copy(txdb_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  tx_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    txdb_csv
  )
  save(list = c("tx_checked"), file = file.path(build_dir, "txdb_metadata.rda"))
  tx_checked[[1]]
}

if (isTRUE(organismdb)) {
  organ_csv <- check_csv(
    build_dir,
    file_type = "OrganismDbi",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  organ_files <- check_files(
    "OrganismDbi",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", organ_csv)
  copied <- file.copy(organ_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  organ_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    organ_csv
  )
  save(list = c("organ_checked"), file = file.path(build_dir, "organismdbi_metadata.rda"))
  organ_checked[[1]]
}

if (isTRUE(granges)) {
  grange_csv <- check_csv(
    build_dir,
    file_type = "GRanges",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  grange_files <- check_files(
    "GRanges",
    bioc_version = bioc_version,
    eupathdb_version = eupathdb_version
  )
  csv_copy_path <- file.path(path.package("EuPathDB"), "inst", "extdata", grange_csv)
  copied <- file.copy(grange_csv, csv_copy_path, overwrite = TRUE)
  expect_true(copied)
  grange_checked <- AnnotationHubData::makeAnnotationHubMetadata(
    path.package("EuPathDB"),
    grange_csv
  )
  save(list = c("grange_checked"), file = file.path(build_dir, "granges_metadata.rda"))
  grange_checked[[1]]
}
# When GRanges output is enabled, call query_s3_ah() for that file type and
# keep whatever it returns.
if (isTRUE(granges)) {
  grange_ah <- query_s3_ah(file_type = "GRanges")
}
# Report the session details and the hpgltools commit in use.
pander::pander(sessionInfo())
info("This is hpgltools commit: ", get_git_commit())

# Output name: rmd_file with ".Rmd" removed, then "-v<ver>.rda.xz".
this_save <- paste0(
  gsub("\\.Rmd", "", rmd_file),
  "-v",
  ver,
  ".rda.xz"
)
info("Saving to ", this_save)

# Hand the file name to saveme(), wrapped in sm().
tmp <- sm(saveme(filename = this_save))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.