#' @title Check Genome Availability
#' @description This function checks the availability of a given genome on the
#' NBCI servers specified as scientific name.
#' @param db a character string specifying the database from which the genome
#' shall be retrieved:
#' \itemize{
#' \item \code{db = "refseq"}
#' \item \code{db = "genbank"}
#' \item \code{db = "ensembl"}
#' \item \code{db = "uniprot"}
#' }
#' @param organism there are three options to characterize an organism:
#' \itemize{
#' \item by \code{scientific name}: e.g. \code{organism = "Homo sapiens"}
#' \item by \code{database specific accession identifier}: e.g. \code{organism = "GCF_000001405.37"} (= NCBI RefSeq identifier for \code{Homo sapiens})
#' \item by \code{taxonomic identifier from NCBI Taxonomy}: e.g. \code{organism = "9606"} (= taxid of \code{Homo sapiens})
#' }
#' @param skip_bacteria Due to its enormous dataset size (> 700MB as of July 2023),
#' the bacterial summary file will not be loaded by default anymore. If users
#' wish to gain insights for the bacterial kingdom they needs to actively specify \code{skip_bacteria = FALSE}. When \code{skip_bacteria = FALSE} is set then the
#' bacterial summary file will be downloaded.
#' @param details a logical value specifying whether or not details on genome
#' size, kingdom, etc. shall be printed to the console intead of a
#' boolean value.
#' @details
#' Internally this function calls the \code{\link{listGenomes}} function to
#' detect all available genomes and checks whether or not the specified organism
#' is available for download.
#' @return a logical value specifing whether or not the genome of the input
#' organism is available. In case \code{details} = \code{TRUE} only a character
#' string specifying the genome details is being returned.
#' @author Hajk-Georg Drost
#' @examples \dontrun{
#' # checking whether the Homo sapiens genome is stored on NCBI
#' is.genome.available(organism = "Homo sapiens", db = "refseq")
#'
#' # and printing details
#' is.genome.available(organism = "Homo sapiens", db = "refseq", details = TRUE)
#'
#' # checking whether the Homo sapiens genome is stored on ENSEMBL
#' is.genome.available(organism = "Homo sapiens", db = "ensembl")
#'
#' # and printing details
#' is.genome.available(organism = "Homo sapiens",
#' details = TRUE,
#' db = "ensembl")
#' }
#' @export
is.genome.available <- function(db = "refseq", organism,
skip_bacteria = TRUE, details = FALSE) {
all_db <- db_hosts()
if (!is.element(db, all_db))
stop(
"Please select one of the available data bases:\n",
paste(all_db, collapse = ", "),
call. = FALSE
)
if (is.element(db, c("refseq", "genbank"))) {
is.genome.available.refseq.genbank(db = db, organism = organism, details = details, skip_bacteria = skip_bacteria)
} else if (db %in% c("ensembl", "ensemblgenomes")) {
is.genome.available.ensembl(db, organism, details)
} else if (db == "uniprot") {
is.genome.available.uniprot(db, organism, details)
}
}
is.genome.available.refseq.genbank <- function(db = "refseq",
organism,
skip_bacteria = TRUE,
details = FALSE) {
organism_name <- assembly_accession <- taxid <- NULL
AssemblyFilesAllKingdoms <- getKingdomAssemblySummary(db = db, skip_bacteria = skip_bacteria)
orgs <-
stringr::str_replace_all(AssemblyFilesAllKingdoms$organism_name,
"\\(", "")
orgs <- stringr::str_replace_all(orgs, "\\)", "")
AssemblyFilesAllKingdoms <-
dplyr::mutate(AssemblyFilesAllKingdoms, organism_name = orgs)
organism <-
stringr::str_replace_all(organism, "\\(", "")
organism <-
stringr::str_replace_all(organism, "\\)", "")
# test if organism specification could be a taxid
if (!is.taxid(organism)) {
FoundOrganism <-
dplyr::filter(AssemblyFilesAllKingdoms,
stringr::str_detect(organism_name, organism) |
assembly_accession == organism)
} else {
FoundOrganism <-
dplyr::filter(AssemblyFilesAllKingdoms,
taxid == as.integer(organism))
}
if (nrow(FoundOrganism) == 0) {
organism_no_hit_message_zero(organism, db)
return(FALSE)
}
if (nrow(FoundOrganism) > 0) {
if (!details) {
if (all(FoundOrganism$refseq_category == "na")) {
message("Only a non-reference genome assembly is available for '", organism, "'.",
" Please make sure to specify the argument 'reference = FALSE' when running any get*() function.")
} else {
message("A reference or representative genome assembly is available for '", organism, "'.")
}
if (nrow(FoundOrganism) > 1) {
organism_no_hit_message_more_than_one(organism, db)
}
return(TRUE)
}
if (details) {
if (all(FoundOrganism$refseq_category == "na")) {
message("Only a non-reference genome assembly is available for '", organism, "'.",
" Please make sure to specify the argument 'reference = FALSE' when running any get*() function.")
}
return(FoundOrganism)
}
}
}
#' Check if genome is available in ensembl
#'
#' @param divisions character, default is all from ensembl_divisions(),
#' alternatives: see \code{ensembl_divisions}
#' @noRd
is.genome.available.ensembl <- function(db = "ensembl", organism,
details = FALSE, divisions = ensembl_divisions()) {
name <- accession <- accession <- assembly <- taxon_id <- NULL
new.organism <- stringr::str_replace_all(organism, " ", "_")
# For each ensembl division, check if it exists
for (division in divisions) {
ensembl.available.organisms <- get.ensembl.info(division = division)
ensembl.available.organisms <- dplyr::filter(ensembl.available.organisms, !is.na(assembly))
if (!is.taxid(organism)) {
selected.organism <-
dplyr::filter(
ensembl.available.organisms,
stringr::str_detect(name,
stringr::str_to_lower(new.organism)) |
accession == organism, !is.na(assembly)
)
} else {
selected.organism <-
dplyr::filter(
ensembl.available.organisms, taxon_id == as.integer(organism), !is.na(assembly))
}
if (nrow(selected.organism) > 0) break
}
if (!details) {
if (nrow(selected.organism) == 0) {
organism_no_hit_message_zero(organism, db)
return(FALSE)
}
if (nrow(selected.organism) > 0) {
message("A reference or representative genome assembly is available for '", organism, "'.")
if (nrow(selected.organism) > 1) {
organism_no_hit_message_more_than_one(organism, db)
}
return(TRUE)
}
}
if (details)
return(selected.organism)
}
is.genome.available.uniprot <- function(db = "uniprot",
organism,
details = FALSE) {
if (is.taxid(organism)) {
uniprot_rest_url <- paste0(
"https://www.ebi.ac.uk/proteins/api/proteomes?offset=0&size=-1&taxid=",
as.integer(organism)
)
rest_status_test <- curl_fetch_memory(uniprot_rest_url)
if (rest_status_test$status_code != 200) {
message(
"Something went wrong when trying to access the API 'https://www.ebi.ac.uk/proteins/api/proteomes'.",
" Sometimes the internet connection isn't stable and re-running the function might help. Otherwise, could there be an issue with the firewall? ", "Is it possbile to access the homepage 'https://www.ebi.ac.uk/' through your browser?"
)
}
uniprot_species_info <-
tibble::as_tibble(jsonlite::fromJSON(
uniprot_rest_url
))
} else {
organism_new <- stringr::str_replace_all(organism, " ", "%20")
uniprot_rest_url_name <- paste0(
"https://www.ebi.ac.uk/proteins/api/proteomes?offset=0&size=-1&name=",
organism_new
)
rest_status_test_name <- curl_fetch_memory(uniprot_rest_url_name)
if ((rest_status_test_name$status_code != 200)) {
message(
"Something went wrong when trying to access the API 'https://www.ebi.ac.uk/proteins/api/proteomes'.",
" Sometimes the internet connection isn't stable and re-running the function might help. Otherwise, could there be an issue with the firewall? ", "Is it possbile to access the homepage 'https://www.ebi.ac.uk/' through your browser?"
)
}
if (rest_status_test_name$status_code == 200) {
uniprot_species_info <-
tibble::as_tibble(jsonlite::fromJSON(
uniprot_rest_url_name
))
}
}
if (!details) {
if (nrow(uniprot_species_info) == 0) {
organism_no_hit_message_zero(organism, db)
return(FALSE)
}
if (nrow(uniprot_species_info) > 0) {
message(
"A reference or representative genome assembly is available for '",
organism,
"'."
)
if (nrow(uniprot_species_info) > 1) {
organism_no_hit_message_more_than_one(organism, db)
}
return(TRUE)
}
}
# If details
return(uniprot_species_info)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.