R/internal_functions.R

Defines the internal functions: getSynonyms, getSynonymsParallel, clean_species_names, higherSample, higherSampleParallel, safeParallel, and match_taxon.

# match_taxon -------------------------------------------------------------

#' Match higher taxonomic ranks
#'
#' Helper for \code{herpSpecies()}. Matches known taxonomic ranks (e.g., orders or suborders) from a character vector of taxonomic information.
#'
#' @param taxa_vector A character vector containing taxonomic strings.
#' @param rank_list A character vector of known rank names to match (e.g., \code{c("Squamata", "Serpentes")}).
#'
#' @return A single character string with matched ranks, comma-separated. Returns \code{NA} if none are found.
#' 
#' @keywords internal
#' @noRd
#' 
match_taxon <- function(taxa_vector, rank_list) {
  # taxa_vector is expected to be a single character string (the collapsed
  # higher-taxa line scraped from a species page).
  # Base grepl()/regexpr() replace stringr here: the rank names are plain
  # words, so the engines behave identically, and vapply() guarantees a
  # logical/numeric result (sapply() can silently change return type).
  hits <- vapply(rank_list, function(rank) grepl(rank, taxa_vector), logical(1))
  matches <- rank_list[hits]
  if (length(matches) == 0) {
    return(NA)
  }
  # Position of each rank's first occurrence, so the output preserves the
  # order in which the ranks appear in the source string.
  match_positions <- vapply(matches,
                            function(rank) regexpr(rank, taxa_vector)[1],
                            numeric(1))
  sorted_matches <- matches[order(match_positions)]
  paste(sorted_matches, collapse = ", ")
}


# safeParallel ------------------------------------------------------------

#' Safe parallel execution
#'
#' Helper for functions like \code{herpSpecies()} and \code{herpSynonyms()}. Selects a safe parallel backend depending on the user's OS and optionally shows progress bars.
#'
#' @param data A list or vector of items to process in parallel.
#' @param FUN A function to be applied to each element of \code{data}.
#' @param cores Integer. Number of processor cores to use.
#' @param showProgress Logical. Whether to display a progress bar if supported. Default is \code{TRUE}.
#'
#' @return A list containing the results of applying \code{FUN} to each element of \code{data}.
#' 
#' @keywords internal
#' @noRd
safeParallel <- function(data, FUN, cores = 1, showProgress = TRUE) {
  # Sequential fast path: with one (or fewer) cores there is nothing to
  # parallelize, so avoid forking / spinning up a one-worker PSOCK cluster.
  # The optional progress bar is kept when pbapply is installed.
  if (cores <= 1) {
    if (showProgress && requireNamespace("pbapply", quietly = TRUE)) {
      return(pbapply::pblapply(data, FUN))
    }
    return(lapply(data, FUN))
  }
  if (.Platform$OS.type == "unix") {
    # Fork-based parallelism (Linux/macOS); pbmcapply adds a progress bar.
    if (showProgress && requireNamespace("pbmcapply", quietly = TRUE)) {
      pbmcapply::pbmclapply(data, FUN, mc.cores = cores)
    } else {
      parallel::mclapply(data, FUN, mc.cores = cores)
    }
  } else {
    # Windows: PSOCK cluster; on.exit() guarantees the workers are torn
    # down even if FUN errors part-way through.
    cl <- parallel::makeCluster(cores)
    on.exit(parallel::stopCluster(cl))
    if (showProgress && requireNamespace("pbapply", quietly = TRUE)) {
      pbapply::pblapply(data, FUN, cl = cl)
    } else {
      parallel::parLapply(cl, data, FUN)
    }
  }
}

# higherSampleParallel ----------------------------------------------------


#' Extract taxonomic info for one species (parallel worker)
#'
#' Internal function used by \code{herpSpecies()} to scrape taxonomic data for multiple species in parallel.
#' This function handles a single species per call and is designed to be mapped over a vector of species using parallel execution.
#'
#' @param x A species name to match.
#' @param species_list Character vector of species names.
#' @param genus_list Character vector of genus names.
#' @param url_list Character vector of corresponding species URLs.
#' @param orders Character vector of order names to match.
#' @param suborders Character vector of suborder names to match.
#' @param fullHigher Logical. Whether to return the full higher taxonomic string.
#' @param getLink Logical. Whether to include the species URL.
#'
#' @return A named list with taxonomic information (e.g., order, suborder, family, genus, species, author, year).
#' Returns a list with an error message on failure.
#'
#' @note Called once per species as part of a parallelized taxonomic sampling routine. Not intended for standalone use.
#' 
#' @keywords internal
#' @noRd
#' 
higherSampleParallel <- function(x, species_list, genus_list, url_list, orders, suborders, fullHigher = FALSE, getLink = FALSE) {
  # NOTE: `orders` and `suborders` previously defaulted to themselves
  # (`orders = orders`), a self-referential promise that errors when the
  # argument is omitted. They are now plainly required; callers that
  # supplied them explicitly are unaffected.
  
  # First matching index (NA when absent). match() is robust where
  # which(species_list == x) could return integer(0) or several indices.
  j <- match(x, species_list)
  tryCatch({
    sp_page <- rvest::read_html(url_list[j])
    title <- rvest::html_element(sp_page, "h1")
    
    # The page <h1> is "Genus species AUTHOR, YEAR"; strip the leading
    # binomial so only the author/year citation remains.
    sppAuthor <- rvest::html_text(title, trim = TRUE)
    sppAuthor <- gsub("^([A-Z][a-z]+\\s+[a-z\\-]+)\\s*", "", sppAuthor)
    sppAuthor <- gsub("\\s{2,}", " ", sppAuthor)
    
    sppAuthor <- trimws(gsub("\\s+", " ", sppAuthor))
    
    # First 4-digit run in the citation is taken as the description year.
    sppYear <- stringr::str_extract(sppAuthor, "\\d{4}")
    
    element <- rvest::html_element(sp_page, "table")
    
    # Higher taxa live in row 1, cell 2 of the species table; keep only
    # the bare text nodes (discarding <a>, <br>, etc.).
    taxa <- xml2::xml_child(element, 1)
    td_taxa <- rvest::html_element(taxa, "td:nth-child(2)")
    children <- xml2::xml_contents(td_taxa)
    
    taxa_vector <- rvest::html_text(children[xml2::xml_name(children) == "text"], trim = TRUE)
    taxa_vector <- paste(taxa_vector, collapse = ", ")
    
    # Family names conventionally end in "-idae".
    family <- stringr::str_extract(taxa_vector, "\\b[A-Z][a-z]+idae\\b")
    order <- match_taxon(taxa_vector, orders)
    suborder <- match_taxon(taxa_vector, suborders)
    
    return(list(
              order = order,
              suborder = suborder,
              family = family,
              genus = genus_list[j],
              species = species_list[j],
              year = sppYear,
              author = sppAuthor,
              taxa_vector = if (fullHigher) taxa_vector else NULL,
              url = if (getLink) url_list[j] else NULL
              ))
    
  }, error = function(e) {
    # Any scraping failure is reported as a structured error record so the
    # caller can filter failed species out of the combined results.
    return(list(error = TRUE,
                species = species_list[j],
                message = e$message,
                url = url_list[j]))
  })
}

# higherSample ------------------------------------------------------------


#' Extract taxonomic info from the Reptile Database (single-core with backup)
#'
#' Internal function used by \code{herpSpecies()} when \code{cores = 1}. Loops over species pages and extracts
#' taxonomic information (order, suborder, family, author, etc.). Supports progress printing and checkpoint-based backups.
#'
#' @param species_list Character vector of species names.
#' @param genus_list Character vector of genus names.
#' @param url_list Character vector of corresponding species URLs.
#' @param orders Character vector of order names to match.
#' @param suborders Character vector of suborder names to match.
#' @param fullHigher Logical. Whether to return the full higher taxonomic string (default is \code{FALSE}).
#' @param getLink Logical. Whether to include the species URL in the output (default is \code{FALSE}).
#' @param backup_file Optional. File path to save periodic backup using \code{saveRDS()}.
#' @param checkpoint Optional. Integer specifying how often (in number of species) to save backup. If \code{NULL}, saves only once at the end.
#' @param showProgress Logical. if \code{TRUE}, prints data sampling progress. Default is \code{TRUE}.
#'
#' @return A data frame containing the scraped taxonomic information. If vector lengths mismatch, returns a named list instead.
#'
#' @note Intended for internal use only. Called by \code{herpSpecies()} when not using parallelization.
#'
#' @keywords internal
#' @noRd
higherSample <- function(species_list,
                         genus_list,
                         url_list,
                         orders,
                         suborders,
                         fullHigher = FALSE,
                         getLink = FALSE,
                         backup_file = NULL,
                         checkpoint = NULL,
                         showProgress = TRUE)
{
  # NOTE: `orders`/`suborders` previously defaulted to themselves
  # (`orders = orders`), which errors when omitted; they are now required.
  taxa_vector_list <- c()
  order_list <- c()
  suborder_list <- c()
  family_list <- c()
  sppAuthor_list <- c()
  sppYear_list <- c()
  # Indices of species scraped successfully. A failed species is skipped,
  # so genus/species/url must be subset by these indices — subsetting with
  # 1:n (as before) misaligned rows whenever a failure occurred mid-list.
  ok_idx <- integer(0)
  n_species <- length(species_list)
  
  # No checkpoint supplied: save only once, after the last species.
  if (is.null(checkpoint)) checkpoint <- n_species
  
  for (j in seq_along(species_list)) {
    tryCatch({
      sp_page <- rvest::read_html(url_list[j])
      title <- rvest::html_element(sp_page, "h1")
      
      # The page <h1> is "Genus species AUTHOR, YEAR"; strip the leading
      # binomial so only the author/year citation remains.
      sppAuthor <- rvest::html_text(title, trim = TRUE)
      sppAuthor <- gsub("^([A-Z][a-z]+\\s+[a-z\\-]+)\\s*", "", sppAuthor)
      sppAuthor <- gsub("\\s{2,}", " ", sppAuthor)
      
      sppAuthor <- trimws(gsub("\\s+", " ", sppAuthor))
      
      # First 4-digit run in the citation is taken as the year.
      sppYear <- stringr::str_extract(sppAuthor, "\\d{4}")
      
      element <- rvest::html_element(sp_page, "table")
      
      # Higher taxa live in row 1, cell 2 of the species table; keep only
      # the bare text nodes (discarding <a>, <br>, etc.).
      taxa <- xml2::xml_child(element, 1)
      td_taxa <- rvest::html_element(taxa, "td:nth-child(2)")
      children <- xml2::xml_contents(td_taxa)
      
      taxa_vector <- rvest::html_text(children[xml2::xml_name(children) == "text"], trim = TRUE)
      taxa_vector <- paste(taxa_vector, collapse = ", ")
      
      # Family names conventionally end in "-idae".
      family <- stringr::str_extract(taxa_vector, "\\b[A-Z][a-z]+idae\\b")
      order <- match_taxon(taxa_vector, orders)
      suborder <- match_taxon(taxa_vector, suborders)
      
      # Warn when the page yielded no usable higher-taxon information.
      # (Fixed: the original sprintf had no %s, dropping the species name.)
      if (all(is.na(c(family, order, suborder)))) {
        message(sprintf("No higher taxa information for %s", species_list[j]))
      }
      
      taxa_vector_list <- c(taxa_vector_list, taxa_vector)
      order_list <- c(order_list, order)
      suborder_list <- c(suborder_list, suborder)
      family_list <- c(family_list, family)
      sppAuthor_list <- c(sppAuthor_list, sppAuthor)
      sppYear_list <- c(sppYear_list, sppYear)
      ok_idx <- c(ok_idx, j)
      
      if(showProgress == TRUE){
      percent <- (j / n_species) * 100
      cat(sprintf("\rGetting higher taxa progress: %.1f%%", percent))
      flush.console()
      }
      
      # Save a backup every `checkpoint` species, and after the last one.
      if(!is.null(backup_file) &&
         (j %% checkpoint == 0 || j == n_species)){
        
        backup <- data.frame(order = order_list,
                             suborder = suborder_list,
                             family = family_list,
                             genus = genus_list[ok_idx],
                             species = species_list[ok_idx],
                             author = sppAuthor_list,
                             year = sppYear_list)
        
        # Conditionally add URL if requested
        if(getLink==TRUE){
          backup$url <- url_list[ok_idx]
        }
        
        # Conditionally add full higher taxa vector if requested
        if(fullHigher==TRUE){
          backup$taxa_vector <- taxa_vector_list[ok_idx]
        }
        saveRDS(backup, file = backup_file)
        message(sprintf("Saved progress at species %d.", j))
      }
      
    }, error = function(e) {
      # Report and skip this species; only the message side effect matters
      # here (the handler's return value is discarded by the for-loop).
      message(sprintf("Error scraping '%s': %s", species_list[j], e$message))
    })
  }
  # Assemble the final result. Lengths are re-verified defensively; with
  # ok_idx-based subsetting they should always agree.
  all_vectors <- list(order = order_list, suborder = suborder_list,
                      family = family_list, genus = genus_list[ok_idx], species = species_list[ok_idx],
                      year = sppYear_list, author = sppAuthor_list,
                      taxa_vector = if(fullHigher) taxa_vector_list else NULL,
                      url = if (getLink) url_list[ok_idx] else NULL)
  all_vectors <- all_vectors[!sapply(all_vectors, is.null)]
  lengths_vec <- sapply(all_vectors, length)
  expected <- lengths_vec["species"]
  if (all(lengths_vec == expected)) {
    searchResults <- as.data.frame(all_vectors, stringsAsFactors = FALSE)
    # Explicit return (the original fell off the end of the if branch and
    # returned the data frame invisibly).
    return(searchResults)
  }
  else {
    message("Some vectors have different lengths! Returning list instead.")
    print(lengths_vec)
    diffs <- lengths_vec - expected
    bad <- names(diffs[diffs != 0])
    message("Mismatched vectors: ", paste(bad, collapse = ", "))
    searchResults <- all_vectors
    return(searchResults)
  }
}

# clean_species_names -----------------------------------------------------

#' Clean taxonomic species names from synonym strings
#'
#' Helper function for \code{herpSynonyms()}. Extracts and cleans species names from raw synonym strings,
#' removing authorship, special characters, Unicode quotes, and redundant elements.
#'
#' @param names_vec Character vector of raw synonym name strings.
#'
#' @return A character vector of cleaned species names.
#'
#' @keywords internal
#' @noRd
clean_species_names <- function(names_vec) {
  # Unicode punctuation built from code points so the source stays ASCII.
  lsq     <- intToUtf8(0x2018)  # left single quote
  rsq     <- intToUtf8(0x2019)  # right single quote
  ldq     <- intToUtf8(0x201C)  # left double quote
  rdq     <- intToUtf8(0x201D)  # right double quote
  em_dash <- intToUtf8(0x2014)
  en_dash <- intToUtf8(0x2013)
  e_acute <- intToUtf8(0x00E9)
  
  # Alternatives allowed after the capitalised genus: epithets, quoted
  # names, bracketed notes, and open-nomenclature markers (sp. nov., cf., ...).
  epithet_alts <- paste0(
    "[a-z]\\.|[a-z]+|\\p{Lu}[a-z]+|",
    lsq, "?[A-Za-z]+", rsq, "?|\\'[A-Za-z]+\\'|\\[.*?\\]|gr\\.\\s*\\w+|sp\\.\\s*nov\\.?|",
    "subsp\\.\\s*nov\\.?|var\\.\\s*\\w+|vari", e_acute, "t", e_acute, "\\.?(?:\\s*\\w+)?|",
    "aff\\.\\s*\\w+|cf\\.\\s*\\w+|\"[^\"]+\"|",
    ldq, "[^", rdq, "]+", rdq
  )
  # Capture the name itself; stop at a dash, parenthesis, all-caps author,
  # a 4-digit year, "et al.", or "in".
  name_pattern <- paste0(
    "^((?:\\p{Lu}[a-z]+)\\s*(?:\\([A-Za-z]+\\))?(?:\\s+(?:",
    epithet_alts,
    "))+)\\s*(?:[-", en_dash, em_dash, "]|\\(|\\b\\p{Lu}{2,}\\b|\\d{4}|\\bet al\\.\\b|\\bin\\b).*"
  )
  
  # Step 1: keep only the captured name portion of each synonym string.
  out <- sub(name_pattern, "\\1", names_vec, perl = TRUE)
  
  # Step 2: strip trailing authorship (runs of capitals, "et al.", "and"/"&").
  out <- sub(
    "\\s+((?:\\p{Lu}{2,}|\\p{Lu}{1})\\s*(?:and|&)?\\s*)+\\b(in|et al\\.|et al|and|&)?\\b.*$",
    "",
    out,
    perl = TRUE
  )
  
  # Step 3: drop a leading "?" marking uncertain records.
  out <- sub("^\\?\\s*", "", out, perl = TRUE)
  
  # Step 4: drop a dangling dash left behind by the author removal.
  out <- sub(paste0("\\s*[-", en_dash, em_dash, "-]\\s*$"), "", out, perl = TRUE)
  
  # Step 5: normalise whitespace and trim.
  out <- gsub("\\s{2,}", " ", out)
  trimws(gsub("\\s+", " ", out))
}

# getSynonymsParallel -----------------------------------------------------

#' Get reptile species synonyms with parallel processing
#' 
#' Internal helper to retrieve synonyms for reptile species from The Reptile Database.
#' Designed to be called in parallel over a data frame of species and URLs.
#' 
#' @param i Integer index for the species to process (used internally by parallel loops).
#' @param x Data frame with columns \code{species} and \code{url} (e.g., output from \code{herpSpecies()}).
#' @param getRef Logical; if \code{TRUE}, returns synonyms along with their source references. Default \code{FALSE}.
#' 
#' @return A data frame with columns \code{species}, \code{synonyms}, and optionally \code{ref}.
#'         Returns a row with \code{"failed"} if an error occurs.
#' 
#' @keywords internal
#' @noRd
getSynonymsParallel <- function(i, x, getRef) {
  # One-row fallback record used whenever scraping this species fails.
  failed_row <- function() {
    if (getRef) {
      data.frame(species = x$species[i], synonyms = "failed", ref = "failed",
                 stringsAsFactors = FALSE)
    } else {
      data.frame(species = x$species[i], synonyms = "failed",
                 stringsAsFactors = FALSE)
    }
  }
  
  tryCatch({
    # Fetch the species page with a browser-like user agent.
    page <- rvest::read_html(httr::GET(x$url[i], httr::user_agent("Mozilla/5.0")))
    tbl  <- rvest::html_element(page, "table")
    
    # Synonyms live in row 4, cell 2; keep only bare text nodes.
    syn_row <- xml2::xml_child(tbl, 4)
    cell    <- rvest::html_element(syn_row, "td:nth-child(2)")
    nodes   <- xml2::xml_contents(cell)
    
    raw <- unique(rvest::html_text(nodes[xml2::xml_name(nodes) == "text"], trim = TRUE))
    raw <- raw[!is.na(raw) & trimws(raw) != ""]
    
    # Cleaned binomials paired one-to-one with their raw source strings.
    tidy <- clean_species_names(raw)
    out <- data.frame(species = rep(x$species[i], times = length(tidy)),
                      synonyms = tidy,
                      stringsAsFactors = FALSE)
    if (getRef) {
      out$ref <- raw
    }
    out
  }, error = function(e) {
    warning(sprintf("Error scraping %s: %s", x$species[i], e$message))
    failed_row()
  })
}

# getSynonyms -------------------------------------------------------------

#' Get reptile species synonyms
#' 
#' Scrapes synonyms of reptile species from The Reptile Database.
#' 
#' @param x A data frame with columns \code{species} and \code{url} (the respective Reptile Database URLs).  
#'   Typically, the output of \code{letsHerp::herpSpecies()}.
#' @param checkpoint Integer specifying how many species to process before saving progress to \code{backup_file}.  
#'   Helps avoid data loss if the function stops unexpectedly. Backups are saved only if \code{checkpoint} is not \code{NULL}.
#' @param resume Logical. If \code{TRUE}, resumes processing from a previous backup saved at \code{backup_file}.
#' @param backup_file Character string path to save or load the backup file.
#' @param getRef Logical. If \code{TRUE}, returns synonyms along with the references mentioning them. Default is \code{FALSE}.
#' @param showProgress Logical. If \code{TRUE}, prints data sampling progress. Default is \code{TRUE}.
#' 
#' @return A data frame with columns \code{species} and \code{synonyms}, optionally \code{ref} if \code{getRef = TRUE},  
#'   containing the synonyms for each species scraped from The Reptile Database.
#' 
#' @keywords internal
#' @noRd
#' 
getSynonyms <- function(x, checkpoint = NULL, resume=FALSE, backup_file = NULL, getRef=FALSE, showProgress=TRUE){
  
  species_list <- c()
  synonyms_list <- c()
  synonymsRef_list <- c()
  
  start_index <- 1
  # Resume from a previous run: restore the accumulators from the backup
  # and skip species it already covers. Fixes the old behavior, which
  # overwrote `x` with the backup (losing the remaining species), never
  # advanced start_index, and applied `$`/names() to the file path string.
  if (resume && !is.null(backup_file) && file.exists(backup_file)) {
    backup <- readRDS(backup_file)
    species_list <- backup$species
    synonyms_list <- backup$synonyms
    if ("synonymsRef" %in% names(backup)) {
      synonymsRef_list <- backup$synonymsRef
    }
    # Continue after the last species present in the backup.
    last_done <- backup$species[length(backup$species)]
    resume_at <- match(last_done, x$species)
    if (!is.na(resume_at)) {
      start_index <- resume_at + 1
    }
    if (start_index <= length(x$species)) {
      message(sprintf("Resuming from species %d: %s", start_index, x$species[start_index]))
    }
  }
  
  for (i in seq_along(x$species)) {
    if (i < start_index) next  # already present in the restored backup
    
    tryCatch({
      url <- rvest::read_html(httr::GET(x$url[i], httr::user_agent("Mozilla/5.0")))
      element <- rvest::html_element(url, "table") # species table
      
      # Extract synonyms: row 4, cell 2; keep only bare text nodes.
      syn <- xml2::xml_child(element, 4)
      td2 <- rvest::html_element(syn, "td:nth-child(2)")
      children <- xml2::xml_contents(td2)
      
      synonyms_vector <- unique(rvest::html_text(children[xml2::xml_name(children) == "text"], trim = TRUE))
      synonyms_vector <- synonyms_vector[!is.na(synonyms_vector) & trimws(synonyms_vector) != ""]
      
      synonyms <- clean_species_names(synonyms_vector)
      species <- rep(x$species[i], times = length(synonyms))
      
      species_list <- c(species_list, species)
      synonyms_list <- c(synonyms_list, synonyms)
      synonymsRef_list <- c(synonymsRef_list, synonyms_vector)
      
      if(showProgress == TRUE){
      msg <- sprintf("Progress: %.1f%%  - %s done! ", (i / length(x$species)) * 100, x$species[i])
      cat("\r", format(msg, width = 60), sep = "")
      utils::flush.console()
      }
      # Save backup every `checkpoint` species and at the end. A backup
      # file path is required for saving (saveRDS(NULL) would error).
      if(!is.null(checkpoint) && !is.null(backup_file)){
        if ((i %% checkpoint) == 0 || i == length(x$species)) {
          backup <- data.frame(species = species_list,
                               synonyms = synonyms_list,
                               stringsAsFactors = FALSE)
          if(getRef==FALSE){
            # De-duplicate species/synonym pairs before saving.
            backup$combined <- paste(backup$species, backup$synonyms, sep="_")
            uniquerec <- data.frame(unique(backup$combined))
            
            backup_uniqueSynonyms <- tidyr::separate(data=uniquerec, col="unique.backup.combined.", 
                                                     into=c("species", "synonyms"), sep="_",
                                                     convert=TRUE)
            saveRDS(backup_uniqueSynonyms, backup_file)
          }else{
            backup$synonymsRef <- synonymsRef_list
            saveRDS(backup, backup_file)
          }
        }
      }
    }, error = function(e) {
      # Record the failure and keep going; `<<-` reaches the accumulators
      # in the enclosing frame from inside the handler.
      message(sprintf("Error scraping %s: %s", x$species[i], e$message))
      species_list <<- c(species_list, x$species[i])
      synonyms_list <<- c(synonyms_list, "failed")
      synonymsRef_list <<- c(synonymsRef_list, "failed")
    })
  }
  
  if(getRef==TRUE){
    synonymsResults <- data.frame(species = species_list,
                                  synonyms = synonyms_list,
                                  ref = synonymsRef_list,
                                  stringsAsFactors = FALSE)
    return(synonymsResults)
  }else{
    synonymsResults <- data.frame(species = species_list,
                                  synonyms = synonyms_list,
                                  stringsAsFactors = FALSE)
    
    # De-duplicate species/synonym pairs via a combined key, then split back.
    synonymsResults$combined <- paste((synonymsResults)$species, (synonymsResults)$synonyms, sep="_")
    
    uniquerec <- data.frame(unique(synonymsResults$combined))
    
    uniqueSynonyms <- tidyr::separate(data=uniquerec, col="unique.synonymsResults.combined.", 
                                      into=c("species", "synonyms"), sep="_",
                                      convert=TRUE)
    
    if(showProgress == TRUE){
    cat("\nSynonyms sampling complete!\n")
    }
    return(uniqueSynonyms)
  }
}

# End of internal functions -----------------------------------------------

Try the letsHerp package in your browser

Any scripts or data that you put into this service are public.

letsHerp documentation built on June 23, 2025, 5:09 p.m.