#' Generic Bio data set extractor
#'
#' Usually you want to use one of the specific set extractors
#' @inheritParams getBio
#' @param organisms a character vector storing the names of the organisms than shall be retrieved.
#' There are three available options to characterize an organism:
#' @param path character, default location is paste0("set_", toupper(set_type))
#' @param set_type the biological sequence type that shall be retrieved.
#' Available options are
#' \itemize{
#' \item \code{set_type = "genome"}
#' \item \code{set_type = "proteome"}
#' \item \code{set_type = "cds"}
#' \item \code{set_type = "gff"}
#' \item \code{set_type = "rna"}
#' \item \code{set_type = "assembly_stats"}
#' \item \code{set_type = "repeat_masker"}
#' \item \code{set_type = "collection"} (all the others)
#' }
#' @author Hajk-Georg Drost
#' @details Internally this function loads the the overview.txt file from NCBI:
#'
#' refseq: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/
#'
#' genbank: ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/
#'
#' and creates a directory 'set_CDSs' to store
#' the CDSs of interest as fasta files for future processing.
#' In case the corresponding fasta file already exists within the
#' 'set_CDSs' folder and is accessible within the workspace,
#' no download process will be performed.
#' @return File path to downloaded genomes
#' (names are identifiers: 'new' (file was downloaded now),
#' 'old' files did already exist)
#' @examples \dontrun{
#' getBioSet("refseq", organisms = c("Arabidopsis thaliana",
#' "Arabidopsis lyrata",
#' "Capsella rubella"),
#' set_type = "cds")
#' }
#' @family getBioSet
#' @seealso \code{\link{getBio}}
#' @export
getBioSet <- function(db = "refseq",
organisms,
set_type,
reference = FALSE,
release = NULL,
gunzip = TRUE,
update = FALSE,
skip_bacteria = TRUE,
path = paste0("set_", toupper(set_type)),
remove_annotation_outliers = FALSE,
assembly_type = "toplevel",
format = "gff3",
mute_citation = FALSE) {
stopifnot(length(organisms) > 0)
set_upper <- toupper(set_type)
message(
"Starting ", set_upper, " retrieval of the following ", set_upper, "s: ",
paste0(organisms, collapse = ", "),
" ..."
)
if (!dir.exists(path)) {
message("Generating folder ", path, " ...")
dir.create(path, recursive = TRUE, showWarnings = FALSE)
}
if (!dir.exists(file.path(path, "documentation")))
dir.create(file.path(path, "documentation"), showWarnings = FALSE)
message("\n")
organisms <- check_if_tidy_exists(organisms, path, update, set_type)
to_run <- names(organisms) == "new"
if (any(to_run)) {
message("\n")
paths <- c()
for (organism in organisms[to_run]) {
out <- getBio(db, organism, type = set_type, reference = reference,
release = release, gunzip = gunzip, update = update,
skip_bacteria = skip_bacteria, path = path,
remove_annotation_outliers = remove_annotation_outliers,
assembly_type = assembly_type, format = format,
mute_citation = TRUE)
paths <- c(paths, out)
message("\n")
}
make_summary_doc_set(path)
mark_as_complete_set(organisms, path, paths, set_type)
}
dt <- data.table::fread(done_organisms_bioset_file(path, set_type))
paths <- dt[data.table::chmatch(organism, tidy_name2(organisms)),]$file
names(paths) <- names(organisms)
please_cite_biomartr(mute_citation = mute_citation)
return(paths)
}
#' A wrapper to all bio getters, selected with 'type' argument
#'
#' @param db a character string specifying the database from which the genome
#' shall be retrieved:
#' \itemize{
#' \item \code{db = "refseq"}
#' \item \code{db = "genbank"}
#' \item \code{db = "ensembl"}
#' }
#' @param organism Organism selector id,
#' there are three options to characterize an organism:
#' \itemize{
#' \item by \code{scientific name}: e.g. \code{organism = "Homo sapiens"}
#' \item by \code{database specific accession identifier}: e.g. \code{organism = "GCF_000001405.37"} (= NCBI RefSeq identifier for \code{Homo sapiens})
#' \item by \code{taxonomic identifier from NCBI Taxonomy}: e.g. \code{organism = "9606"} (= taxid of \code{Homo sapiens})
#' }
#' @param type biological sequence type. (alternatives are: genome, gff, cds,
#' rna, proteome, assembly_stats, repeat_masker, collection (all the others))
#' @param reference a logical value indicating whether or not a genome shall be downloaded if it isn't marked in the database as either a reference genome or a representative genome.
#' @param release a numeric, the database release version of ENSEMBL (\code{db = "ensembl"}). Default is \code{release = NULL} meaning
#' that the most recent database version is used. \code{release = 75} would for human would give the stable
#' GRCh37 release in ensembl. Value must be > 46, since ensembl did not structure their data
#' if the standard format before that.
#' @param gunzip a logical, indicating whether or not files should be unzipped.
#' @param update logical, default FALSE. Updated backend cached files needed.
#' Usually keep this false, to make ut run much faster. Only set to TRUE, if you
#' believe you cache is outdated (Species only exist in newest release etc)
#' @param skip_bacteria Due to its enormous dataset size (> 700MB as of July 2023),
#' the bacterial summary file will not be loaded by default anymore. If users
#' wish to gain insights for the bacterial kingdom they needs to actively specify \code{skip_bacteria = FALSE}. When \code{skip_bacteria = FALSE} is set then the
#' bacterial summary file will be downloaded.
#' @param path character, default location is paste0("set_", toupper(type))
#' @param remove_annotation_outliers shall outlier lines be removed from the input \code{annotation_file}?
#' If yes, then the initial \code{annotation_file} will be overwritten and the removed outlier lines will be stored at \code{\link{tempdir}}
#' for further exploration.
#' @param analyse_genome logical, default FALSE. If TRUE, get general genome statistics like
#' gc content etc. For more details, see ?summary_genome
#' @param assembly_type character, default c("primary_assembly", "toplevel"). Used for ensembl only,
#' specifies the genome assembly type. Searches for both primary and toplevel, and if both are found, uses the
#' first by order (so primary is prioritized by default).
#' The Primary assembly should usually be used if it exists.
#' The "primary assembly" contains all the top-level sequence regions,
#' excluding alternative haplotypes and patches.
#' If the primary assembly file is not present for a species
#' (only defined for standard model organisms),
#' that indicates that there were no haplotype/patch regions,
#' and in such cases, the 'toplevel file is used.
#' For more details see:
#' \href{https://grch37.ensembl.org/info/genome/genebuild/assembly.html}{ensembl tutorial}
#' @param format "gff3", alternative "gtf" for ensembl.
#' @param mute_citation logical, default FALSE, indicating whether citation message should be muted.
#' @author Hajk-Georg Drost
#' @details Internally this function loads the the overview.txt file from NCBI:
#'
#' refseq: ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/
#'
#' genbank: ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/
#'
#' and creates a directory relative to file type, if you get fasta genomes it will be
#' _ncbi_downloads/genomes'.
#' In case the corresponding fasta file already exists within the
#' '_ncbi_downloads/genomes' folder and is accessible within the workspace,
#' no download process will be performed.
#' For other file types the same rule applies.
#' @family getBio
#' @return File path to downloaded genome.
getBio <- function(db = "refseq",
organism,
type,
reference = FALSE,
release = NULL,
gunzip = FALSE,
update = FALSE,
skip_bacteria = TRUE,
path = paste0("set_", toupper(type)),
remove_annotation_outliers = FALSE,
analyse_genome = FALSE,
assembly_type = "toplevel",
format = "gff3",
mute_citation = FALSE) {
if (type == "genome") {
getGenome(db, organism, reference = reference,
skip_bacteria = skip_bacteria, release = release,
gunzip = gunzip, path = path, assembly_type = assembly_type,
mute_citation = mute_citation, analyse_genome = analyse_genome)
} else if (type == "gff") {
getGFF(db, organism, reference = reference,
skip_bacteria = skip_bacteria, release = release,
gunzip = gunzip,
remove_annotation_outliers = remove_annotation_outliers,
path = path,
mute_citation = mute_citation, format = format)
} else if (type == "cds") {
getCDS(db, organism, reference = reference,
skip_bacteria = skip_bacteria,
release = release,
gunzip = gunzip,
path = path,
mute_citation = mute_citation)
} else if (type == "rna") {
getRNA(db, organism, reference = reference,
skip_bacteria = skip_bacteria,
release = release,
gunzip = gunzip,
path = path,
mute_citation = mute_citation)
} else if (type %in% c("protein", "proteome")) {
getProteome(db, organism, reference = reference,
skip_bacteria = skip_bacteria,
release = release,
gunzip = gunzip,
path = path,
mute_citation = mute_citation)
} else if (type == "assembly_stats") {
getAssemblyStats(db, organism, reference = reference,
skip_bacteria = skip_bacteria,
release = release,
path = path,
mute_citation = mute_citation)
} else if (type == "repeat_masker") {
getRepeatMasker(db, organism, reference = reference,
skip_bacteria = skip_bacteria,
release = release,
gunzip = gunzip,
path = path,
mute_citation = mute_citation)
} else if (type == "collection") {
getCollection(db, organism, reference = reference,
skip_bacteria = skip_bacteria,
release = release,
gunzip = gunzip,
remove_annotation_outliers = FALSE,
path = path,
mute_citation = mute_citation)
} else stop("invalid bio getter type selected")
}
make_summary_doc_set <- function(path, out = file.path(path, "documentation", paste0(basename(path), "_summary.csv"))) {
meta_files <- list.files(path)
meta_files <- meta_files[stringr::str_detect(meta_files, "doc_")]
if (length(meta_files) == 0) return(invisible(NULL))
file.rename(from = file.path(path, meta_files), to = file.path(path, "documentation", meta_files))
doc_tsv_files <- file.path(path,"documentation", meta_files[stringr::str_detect(meta_files, "[.]tsv")])
summary_log <- data.table::rbindlist(lapply(doc_tsv_files, function(data) {
suppressMessages(data.table::fread(data))
}))
data.table::fwrite(summary_log, out)
message("A summary file (which can be used as supplementary information file in publications)",
" containig retrieval information for all species has been stored at '", out, "'.")
return(invisible(NULL))
}
check_if_tidy_exists <- function(organisms, path, update, set_type) {
# Load done csv if existing
done_org_list <- done_organisms_bioset_file(path, set_type)
names(organisms) <- rep("new", length(organisms))
if (file.exists(done_org_list)) {
done_org_list <- data.table::fread(done_org_list)
organisms_short <- tidy_name2(organisms)
# If update, remove all
done_orgs <- done_org_list$organism %in% organisms_short
if (any(done_orgs)) {
if (update) {
file.remove(done_org_list[done_orgs,]$file)
} else {
match <- organisms_short %in% done_org_list$organism
names(organisms)[match] <- "exist"
}
}
if (all(done_orgs) & !update) {
message("- Skipping computation of set (all species already done),",
" set update to TRUE if you want to run again")
}
}
return(organisms)
}
mark_as_complete_set <- function(organisms, path, new_files, set_type) {
# Load done csv if existing
stopifnot(names(organisms) %in% c("new", "exist"))
# If update, remove all
to_be_appended <- names(organisms) == "new"
if (any(to_be_appended)) {
done_org_list_file <- done_organisms_bioset_file(path, set_type)
if (file.exists(done_org_list_file)) {
done_org_list <- data.table::fread(done_org_list)
} else done_org_list <- data.table::data.table()
organisms_short <- tidy_name2(organisms)
append <- data.table::data.table(organism = organisms_short[to_be_appended],
file = new_files[to_be_appended])
new_list <- data.table::rbindlist(list(done_org_list, append))
data.table::fwrite(new_list, done_org_list_file)
message("- List of completed organisms updated in: ", done_org_list_file)
}
return(invisible(NULL))
}
done_organisms_bioset_file <- function(path, set_type) {
file.path(path, paste0(set_type, "_done_organisms.csv"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.