R/herpSpecies.R
In letsHerp: An Interface to the Reptile Database

Documented in herpSpecies

#' Retrieve Reptile Species and Taxonomic Information from RDB
#'
#' @description 
#' Retrieves a list of reptile species from The Reptile Database (RDB) based on a search URL, and optionally returns detailed taxonomic information for each species. 
#' This function can also save progress to disk during sampling and extract species-specific URLs for further use.
#' 
#' @usage herpSpecies(url,
#'                    showProgress = TRUE,
#'                    dataList = NULL, 
#'                    taxonomicInfo=FALSE, 
#'                    fullHigher=FALSE, 
#'                    getLink=FALSE,
#'                    cores = max(1, parallel::detectCores() - 1),
#'                    checkpoint = NULL,
#'                    backup_file = NULL
#'                    )
#'                    
#' @param url Character string. A search URL generated via an advanced search on the RDB website or with \code{\link{herpAdvancedSearch}}.
#' @param showProgress Logical. If \code{TRUE}, prints sampling progress in the console. Default is \code{FALSE}.
#' @param dataList Optional. A data frame with columns \code{species} and \code{url}, used to extract taxonomic information from previously sampled species links.
#' @param taxonomicInfo Logical. If \code{TRUE}, returns taxonomic information for each species, including order, suborder, family, genus, author, and year. Default is \code{FALSE}.
#' @param fullHigher Logical. If \code{TRUE}, includes the full higher taxonomic hierarchy as reported by RDB (e.g., including subfamilies). Requires \code{taxonomicInfo = TRUE}. Default is \code{FALSE}.
#' @param getLink Logical. If \code{TRUE}, includes the RDB URL for each species (useful for follow-up functions like \code{\link{herpSynonyms}}). Default is \code{FALSE}.
#' @param cores Integer. Number of CPU cores to use for parallel processing. Default is one less than the number of available cores.
#' @param checkpoint Optional. Integer specifying the number of species to process before saving a temporary backup. Backup is only saved if \code{cores = 1}. If set to \code{1}, saves progress after each species (safest but slowest).
#' @param backup_file Optional. Character string specifying the path to an \code{.rds} file for saving intermediate results when \code{checkpoint} is set. Must end in \code{.rds}.
#' 
#' @return
#' If \code{taxonomicInfo = FALSE} (default), returns a character vector of species names.  
#'  
#' If \code{taxonomicInfo = TRUE}, returns a data frame with columns:
#' \code{order}, \code{suborder} (if available), \code{family}, \code{genus}, \code{species}, \code{author}, and \code{year}.
#'  
#' If \code{fullHigher = TRUE}, includes an additional column with the full higher taxa classification.  
#'  
#' If \code{getLink = TRUE}, includes a column with the URL for each species’ page on RDB.
#'
#' @note
#' If \code{checkpoint} is used, progress will only be saved when \code{cores = 1}. This prevents potential write conflicts in parallel mode.
#'
#' @examples
#' \donttest{
#' boa <- herpSpecies(herpAdvancedSearch(genus = "Boa"),
#'                                       taxonomicInfo = TRUE, 
#'                                       cores = 2)
#' }
#' 
#' @seealso \code{\link{herpAdvancedSearch}}, \code{\link{herpSynonyms}}, \code{\link{herpSearch}}
#' 
#' @export
#'

herpSpecies <- function(url=NULL,
                        showProgress=TRUE,
                        dataList = NULL,
                        taxonomicInfo = FALSE,
                        fullHigher = FALSE,
                        getLink = FALSE,
                        cores = max(1, parallel::detectCores() - 1),
                        checkpoint = NULL,
                        backup_file = NULL
                        )
{
  if (is.null(backup_file) && !is.null(checkpoint)) {
    stop("You must provide a valid backup_file path if checkpoint is defined.")
  }else 
    if(!is.null(backup_file) && !grepl(".rds", backup_file)){
      stop("Backup file path must end with 'filename.rds'")
    }
  if(is.null(dataList))
  {
    species_list <- c()
    genus_list <- c()
    url_list <- c()
    
    if(is.null(url)){
      stop("\n No search url provided")
    }
    search <- rvest::read_html(url)
    ul_element <- rvest::html_elements(search, "#content > ul:nth-child(6)")
    
    li_nodes <- xml2::xml_children(ul_element[[1]])
    for (i in seq_along(li_nodes)) {
      
      target <- xml2::xml_child(li_nodes[[i]], 1)
      
      species <- rvest::html_text(rvest::html_element(target, "em"), trim = TRUE)
      genus <- sub(" .*", "", species)
      href_raw <- xml2::xml_attrs(target)[["href"]]
      href <- sub("&search.*", "", href_raw)
      sppLink <- paste0("https://reptile-database.reptarium.cz",href)
      
      species_list <- c(species_list, species)
      genus_list <- c(genus_list, genus)
      url_list <- c(url_list, sppLink)
      
      if(showProgress == TRUE){
      percent <- (i/length(xml2::xml_children(ul_element[[1]]))) * 100
      cat(sprintf("\rGetting species links progress: %.1f%%", percent))
      utils::flush.console()
      }
    }
    
    n_species <- length(species_list)
    
    if(taxonomicInfo == FALSE) {
      if(getLink == TRUE){
        searchResults <- data.frame(species = species_list,
                                    url = url_list,
                                    stringsAsFactors = FALSE)
        
        message_text <- paste0("A total of ", n_species, " species links retrieved.")
        
      }else{
        searchResults <- species_list
        message_text <- paste0("A total of ", n_species, " species retrieved.")
      }
      message("Data collection is done!\n", message_text, "\n")
      return(searchResults)
    }
  }else{
    species_list <- dataList$species
    genus_list <- sub(" .*", "", species_list)
    url_list <- dataList$url
    n_species <- length(species_list)
  }
  
  # taxonomicInfo == TRUE ---------------------------------------------------
  if (taxonomicInfo == TRUE) {
    if(showProgress == TRUE){
    message("Sampling species higher taxa progress:\n")
    }
    orders <- c("Squamata", "Crocodylia", "Rhychocephalia", "Testudines")
    suborders <- c("Sauria", "Serpentes")
    
    if (cores > 1)
    {
      results_list <- safeParallel(
        data = species_list,
        FUN = function(x) higherSampleParallel(
          x,
          species_list = species_list,
          genus_list = genus_list,
          url_list = url_list,
          orders = orders,
          suborders = suborders,
          fullHigher = fullHigher,
          getLink = getLink
        ),
        cores = cores,
        showProgress = showProgress
      )
      results_list <- Filter(Negate(is.null), results_list)
      searchResults <- as.data.frame(dplyr::bind_rows(results_list))
      
# testing and warning for error messages ----------------------------------
      # Check which rows have errors flagged TRUE
      error_rows <- which(!is.na(searchResults$error) & searchResults$error == TRUE)
      
      if (length(error_rows) > 0) {
        # Extract species names and error messages for those rows
        species_with_errors <- searchResults$species[error_rows]
        messages <- searchResults$message[error_rows]
        
        n_errors <- length(species_with_errors)
        max_show <- 5
        
        # Construct warning message
        species_msgs <- paste0("- ", species_with_errors[1:min(max_show, n_errors)], ": ",
                              messages[1:min(max_show, n_errors)])
        
        if (n_errors > max_show) {
          species_msgs <- c(species_msgs, paste0("... and ", n_errors - max_show, " others"))
        }
          warning_msg <-paste0(
          "Data sampling completed with errors for the following species:\n",
          paste0(species_msgs, collapse = "\n"),
          "\n\nTo extract failed species from your original data, use:\n",
          "failed_spp <- df[df$species %in% df$species[df$error == TRUE], c('species', 'url')]\n",
          "Then ran herpSpecies(dataList = failed_spp)."
        )
        warning(warning_msg)
      }
      return(searchResults)
    }else{
      searchResults <-higherSample(
                      species_list = species_list,
                      genus_list = genus_list,
                      url_list = url_list,
                      orders = orders,
                      suborders = suborders,
                      fullHigher = fullHigher,
                      getLink = getLink,
                      backup_file = backup_file,
                      checkpoint = checkpoint
      )
      # testing and warning for error messages ----------------------------------
      # Check which rows have errors flagged TRUE
      error_rows <- which(!is.na(searchResults$error) & searchResults$error == TRUE)
      
      if (length(error_rows) > 0) {
        # Extract species names and error messages for those rows
        species_with_errors <- searchResults$species[error_rows]
        messages <- searchResults$message[error_rows]
        
        n_errors <- length(species_with_errors)
        max_show <- 5
        
        # Construct warning message
        species_msgs <- paste0("- ", species_with_errors[1:min(max_show, n_errors)], ": ",
                              messages[1:min(max_show, n_errors)])
        
        if (n_errors > max_show) {
          species_msgs <- c(species_msgs, paste0("... and ", n_errors - max_show, " others"))
        }
        warning_msg <-paste0(
          "Data sampling completed with errors for the following species:\n",
          paste0(species_msgs, collapse = "\n"),
          "\n\nTo extract failed species from your original data, use:\n",
          "failed_spp <- df[df$species %in% df$species[df$error == TRUE], c('species', 'url')]\n",
          "Then ran herpSpecies(dataList = failed_spp)."
        )
        warning(warning_msg)
      }
      return(searchResults)
    }
    
  } # <--- closes if (taxonomicInfo == TRUE)
} # <--- closes herpSpecies function