Nothing
#' Retrieve Reptile Species and Taxonomic Information from RDB
#'
#' @description
#' Retrieves a list of reptile species from The Reptile Database (RDB) based on a search URL, and optionally returns detailed taxonomic information for each species.
#' This function can also save progress to disk during sampling and extract species-specific URLs for further use.
#'
#' @usage herpSpecies(url,
#' showProgress = TRUE,
#' dataList = NULL,
#' taxonomicInfo=FALSE,
#' fullHigher=FALSE,
#' getLink=FALSE,
#' cores = max(1, parallel::detectCores() - 1),
#' checkpoint = NULL,
#' backup_file = NULL
#' )
#'
#' @param url Character string. A search URL generated via an advanced search on the RDB website or with \code{\link{herpAdvancedSearch}}.
#' @param showProgress Logical. If \code{TRUE}, prints sampling progress in the console. Default is \code{FALSE}.
#' @param dataList Optional. A data frame with columns \code{species} and \code{url}, used to extract taxonomic information from previously sampled species links.
#' @param taxonomicInfo Logical. If \code{TRUE}, returns taxonomic information for each species, including order, suborder, family, genus, author, and year. Default is \code{FALSE}.
#' @param fullHigher Logical. If \code{TRUE}, includes the full higher taxonomic hierarchy as reported by RDB (e.g., including subfamilies). Requires \code{taxonomicInfo = TRUE}. Default is \code{FALSE}.
#' @param getLink Logical. If \code{TRUE}, includes the RDB URL for each species (useful for follow-up functions like \code{\link{herpSynonyms}}). Default is \code{FALSE}.
#' @param cores Integer. Number of CPU cores to use for parallel processing. Default is one less than the number of available cores.
#' @param checkpoint Optional. Integer specifying the number of species to process before saving a temporary backup. Backup is only saved if \code{cores = 1}. If set to \code{1}, saves progress after each species (safest but slowest).
#' @param backup_file Optional. Character string specifying the path to an \code{.rds} file for saving intermediate results when \code{checkpoint} is set. Must end in \code{.rds}.
#'
#' @return
#' If \code{taxonomicInfo = FALSE} (default), returns a character vector of species names.
#'
#' If \code{taxonomicInfo = TRUE}, returns a data frame with columns:
#' \code{order}, \code{suborder} (if available), \code{family}, \code{genus}, \code{species}, \code{author}, and \code{year}.
#'
#' If \code{fullHigher = TRUE}, includes an additional column with the full higher taxa classification.
#'
#' If \code{getLink = TRUE}, includes a column with the URL for each species’ page on RDB.
#'
#' @note
#' If \code{checkpoint} is used, progress will only be saved when \code{cores = 1}. This prevents potential write conflicts in parallel mode.
#'
#' @examples
#' \donttest{
#' boa <- herpSpecies(herpAdvancedSearch(genus = "Boa"),
#' taxonomicInfo = TRUE,
#' cores = 2)
#' }
#'
#' @seealso \code{\link{herpAdvancedSearch}}, \code{\link{herpSynonyms}}, \code{\link{herpSearch}}
#'
#' @export
#'
herpSpecies <- function(url=NULL,
showProgress=TRUE,
dataList = NULL,
taxonomicInfo = FALSE,
fullHigher = FALSE,
getLink = FALSE,
cores = max(1, parallel::detectCores() - 1),
checkpoint = NULL,
backup_file = NULL
)
{
if (is.null(backup_file) && !is.null(checkpoint)) {
stop("You must provide a valid backup_file path if checkpoint is defined.")
}else
if(!is.null(backup_file) && !grepl(".rds", backup_file)){
stop("Backup file path must end with 'filename.rds'")
}
if(is.null(dataList))
{
species_list <- c()
genus_list <- c()
url_list <- c()
if(is.null(url)){
stop("\n No search url provided")
}
search <- rvest::read_html(url)
ul_element <- rvest::html_elements(search, "#content > ul:nth-child(6)")
li_nodes <- xml2::xml_children(ul_element[[1]])
for (i in seq_along(li_nodes)) {
target <- xml2::xml_child(li_nodes[[i]], 1)
species <- rvest::html_text(rvest::html_element(target, "em"), trim = TRUE)
genus <- sub(" .*", "", species)
href_raw <- xml2::xml_attrs(target)[["href"]]
href <- sub("&search.*", "", href_raw)
sppLink <- paste0("https://reptile-database.reptarium.cz",href)
species_list <- c(species_list, species)
genus_list <- c(genus_list, genus)
url_list <- c(url_list, sppLink)
if(showProgress == TRUE){
percent <- (i/length(xml2::xml_children(ul_element[[1]]))) * 100
cat(sprintf("\rGetting species links progress: %.1f%%", percent))
utils::flush.console()
}
}
n_species <- length(species_list)
if(taxonomicInfo == FALSE) {
if(getLink == TRUE){
searchResults <- data.frame(species = species_list,
url = url_list,
stringsAsFactors = FALSE)
message_text <- paste0("A total of ", n_species, " species links retrieved.")
}else{
searchResults <- species_list
message_text <- paste0("A total of ", n_species, " species retrieved.")
}
message("Data collection is done!\n", message_text, "\n")
return(searchResults)
}
}else{
species_list <- dataList$species
genus_list <- sub(" .*", "", species_list)
url_list <- dataList$url
n_species <- length(species_list)
}
# taxonomicInfo == TRUE ---------------------------------------------------
if (taxonomicInfo == TRUE) {
if(showProgress == TRUE){
message("Sampling species higher taxa progress:\n")
}
orders <- c("Squamata", "Crocodylia", "Rhychocephalia", "Testudines")
suborders <- c("Sauria", "Serpentes")
if (cores > 1)
{
results_list <- safeParallel(
data = species_list,
FUN = function(x) higherSampleParallel(
x,
species_list = species_list,
genus_list = genus_list,
url_list = url_list,
orders = orders,
suborders = suborders,
fullHigher = fullHigher,
getLink = getLink
),
cores = cores,
showProgress = showProgress
)
results_list <- Filter(Negate(is.null), results_list)
searchResults <- as.data.frame(dplyr::bind_rows(results_list))
# testing and warning for error messages ----------------------------------
# Check which rows have errors flagged TRUE
error_rows <- which(!is.na(searchResults$error) & searchResults$error == TRUE)
if (length(error_rows) > 0) {
# Extract species names and error messages for those rows
species_with_errors <- searchResults$species[error_rows]
messages <- searchResults$message[error_rows]
n_errors <- length(species_with_errors)
max_show <- 5
# Construct warning message
species_msgs <- paste0("- ", species_with_errors[1:min(max_show, n_errors)], ": ",
messages[1:min(max_show, n_errors)])
if (n_errors > max_show) {
species_msgs <- c(species_msgs, paste0("... and ", n_errors - max_show, " others"))
}
warning_msg <-paste0(
"Data sampling completed with errors for the following species:\n",
paste0(species_msgs, collapse = "\n"),
"\n\nTo extract failed species from your original data, use:\n",
"failed_spp <- df[df$species %in% df$species[df$error == TRUE], c('species', 'url')]\n",
"Then ran herpSpecies(dataList = failed_spp)."
)
warning(warning_msg)
}
return(searchResults)
}else{
searchResults <-higherSample(
species_list = species_list,
genus_list = genus_list,
url_list = url_list,
orders = orders,
suborders = suborders,
fullHigher = fullHigher,
getLink = getLink,
backup_file = backup_file,
checkpoint = checkpoint
)
# testing and warning for error messages ----------------------------------
# Check which rows have errors flagged TRUE
error_rows <- which(!is.na(searchResults$error) & searchResults$error == TRUE)
if (length(error_rows) > 0) {
# Extract species names and error messages for those rows
species_with_errors <- searchResults$species[error_rows]
messages <- searchResults$message[error_rows]
n_errors <- length(species_with_errors)
max_show <- 5
# Construct warning message
species_msgs <- paste0("- ", species_with_errors[1:min(max_show, n_errors)], ": ",
messages[1:min(max_show, n_errors)])
if (n_errors > max_show) {
species_msgs <- c(species_msgs, paste0("... and ", n_errors - max_show, " others"))
}
warning_msg <-paste0(
"Data sampling completed with errors for the following species:\n",
paste0(species_msgs, collapse = "\n"),
"\n\nTo extract failed species from your original data, use:\n",
"failed_spp <- df[df$species %in% df$species[df$error == TRUE], c('species', 'url')]\n",
"Then ran herpSpecies(dataList = failed_spp)."
)
warning(warning_msg)
}
return(searchResults)
}
} # <--- closes if (taxonomicInfo == TRUE)
} # <--- closes herpSpecies function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.