expowo: An R package for mining global plant diversity and distribution data

Documented in get_year_pubs

#' Secondary function to transform data generated with powoSpecies
#'
#' @author Debora Zuanny & Domingos Cardoso
#'
#' @description Produces a spreadsheet with extra data about the
#' publication, adding the corrected year of publication and the counting of
#' synonyms of each species. It is an optional function to be used after run
#' expowo's \code{powoSpecies} and to use as input data in the expowo's
#' \code{accGraph}.
#'
#' @usage
#' get_year_pubs(inputdf = NULL,
#'               verbose = TRUE,
#'               save = FALSE,
#'               dir = "results_POWO_with_year_pubs",
#'               filename = "POWO_with_year_pubs")
#'
#' @param inputdf A dataframe generated using \code{powoSpecies} function with
#' the argument synonyms set as \code{TRUE} and containing the genus and species
#' column and the associated information about the protologue of each species.
#' The species name must be binomial, i.e. must contain both the genus name and
#' specific epithet.
#'
#' @param verbose Logical, if \code{FALSE}, a message showing each step during
#' the POWO search will not be printed in the console in full.

#' @param save Logical, if \code{TRUE}, the transformed input POWO database
#' will be saved on disk.
#'
#' @param dir Pathway to the computer's directory, where the file will be saved
#' provided that the argument \code{save} is set up in \code{TRUE}. The default
#' is to create a directory named **results_POWO_with_year_pubs** and the search
#' results will be saved within a subfolder named after the current date.
#'
#' @param filename Name of the output file to be saved. The default is to create
#' a file entitled **POWO_with_year_pubs**.
#'
#' @return Objects of class dataframe and saves the output on disk.
#'
#' @seealso \code{\link{megaGen}}
#' @seealso \code{\link{powoSpecies}}
#' @seealso \code{\link{powoFam}}
#' @seealso \code{\link{powoGenera}}
#' @seealso \code{\link{accGraph}}
#'
#' @examples
#' \dontrun{
#'
#' library(expowo)
#'
#' output <- powoSpecies(family = "Fabaceae",
#'                       genus = NULL,
#'                       synonyms = TRUE,
#'                       country = NULL,
#'                       save = FALSE,
#'                       dir = "Fabaceae_results_powoSpecies",
#'                       filename = "Fabaceae_spp")
#'
#'
#' output_year_pubs <- get_year_pubs(inputdf = output,
#'                                   verbose = TRUE,
#'                                   save = FALSE,
#'                                   dir = "results_POWO_with_year_pubs",
#'                                   filename = "POWO_with_year_pubs")
#' }
#'
#' @importFrom magrittr "%>%"
#' @importFrom tibble add_column
#'
#' @export
#'

get_year_pubs <- function(inputdf = NULL,
                          verbose = TRUE,
                          save = FALSE,
                          dir = "results_POWO_with_year_pubs",
                          filename = "POWO_with_year_pubs") {
  
  # inputdf check if it is a dataframe and if was generated by powoSpecies
  .arg_check_inputdf(inputdf)
  
  # dir check
  dir <- .arg_check_dir(dir)
  
  # Get name of folder and file name if the results are intended to be saved
  foldername <- paste0(dir, "/", format(Sys.time(), "%d%b%Y"))
  
  # Fill in with NAs the empty cells
  df <- data.frame(apply(inputdf, 2, function(x) gsub("^$", NA, x)))
  
  # Creating new columns for the year and corrected year of publication
  df <- df %>% tibble::add_column(year = NA, .after = "publication")
  df <- df %>% tibble::add_column(year_basionym = NA, .after = "year")
  df <- df %>% tibble::add_column(number_synonyms = NA, .after = "accepted_name")
  
  # Extracting the year from the full publication information
  # Clean examples such as "non C. caerulea Jacq.", ", non Engelh. (1898)"
  df$publication <- gsub(",\\snon\\s[[:upper:]].*", "",  df$publication)
  
  # Get examples of not validly published names
  tf <- grepl("[0-9][)],\\s", df$publication)
  p <- unique(gsub(".*[)],\\s", "", df$publication[tf]))
  tf_inval <- !grepl(paste0(p, collapse = "|"), df$publication)
  
  if (any(tf_inval)) {
    df$year[tf_inval] <- gsub(",\\scontrary.*", "", df$publication[tf_inval])
    df$year[tf_inval] <- gsub(".*[(]|[)].*|.*.\\s|.*[.]", "", df$year[tf_inval])
    df$year[tf_inval] <- gsub("unknown\\spublication|publication", "unknown",
                              df$year[tf_inval])
  }
  df$year[!tf_inval] <- gsub(".*[(]|,.*", "", df$publication[!tf_inval])
  df$year[!tf_inval] <- gsub(".*[.]\\s|[)].*", "", df$year[!tf_inval])
  df$year[!tf_inval] <- gsub("unknown\\spublication|publication", "unknown",
                             df$year[!tf_inval])
  
  # Extracting the correct year for species with synonyms
  # Fill corrected year for species without synonyms
  tf_accepted <- df$status %in% "Accepted"
  
  # Get the accepted names for species without any synonym
  tftf <- !df$scientific_name[tf_accepted] %in% df$accepted_name
  df$year_basionym[tf_accepted][tftf] <- df$year[tf_accepted][tftf]
  
  # Count the number of synonyms
  df$number_synonyms[tf_accepted][tftf] <- 0

  acc_clean <- na.omit(df$accepted_name)
  unique_taxa <- character()
  counts <- integer()
  last_seen <- NULL
  for (taxon in acc_clean) {
    if (is.null(last_seen) || taxon != last_seen) {
      # If the taxon is different from the previous one, add to results
      unique_taxa <- c(unique_taxa, taxon)
      counts <- c(counts, 1)
    } else {
      # If the taxon is the same as the previous one, increment the count
      counts[length(counts)] <- counts[length(counts)] + 1
    }
    last_seen <- taxon
  }
  res <- setNames(counts, unique_taxa)
  df$number_synonyms[tf_accepted][!tftf] <- res
  #df$number_synonyms[tf_accepted][!tftf] <- table(df$accepted_name)
  
  # Extracting the correct year for species with synonyms
  n_au <- unique(df$accepted_name[!is.na(df$accepted_name)])
  for (i in seq_along(n_au)) { 

    tf <- df$scientific_name[tf_accepted] %in% n_au[i]
    year_acc <- df$year[tf_accepted][tf]
    
    if (any(tf_inval)) {
      tftf <- df$accepted_name[tf_inval] %in% n_au[i]
      temp <- df[tf_inval, ][tftf, ]
    } else {
      tftf <- df$accepted_name %in% n_au[i]
      temp <- df[tftf, ]
    }
    temp <- temp[!grepl("\\svar[.]|\\sf[.]\\s|\\ssubsp[.]\\s",
                        temp$scientific_name), ]
    if (length(temp$year) == 0) {
      year_syn <- NA
    } else {
      year_syn <- min(temp$year)
    }
    
    if (any(year_acc < year_syn) | is.na(year_syn)) {
      df$year_basionym[tf_accepted][tf] <- year_acc
    } else {
      df$year_basionym[tf_accepted][tf] <- year_syn
    }
    
  }
  
  # Correct the publication year for "nomina conservanda" examples
  tf <- grepl(", nom. cons.", df$publication[tf_accepted])
  df$year[tf_accepted][tf] <- gsub(".*[(]|[)],\\snom[.] cons[.]*", "",
                                   df$publication[tf_accepted][tf])
  df$year_basionym[tf_accepted][tf] <- gsub(".*[(]|[)],\\snom[.] cons[.]*", "",
                                            df$publication[tf_accepted][tf])
  
  # Identify each change in the corrected year of species discovery
  df$year <- as.numeric(df$year)
  tf <- is.na(df$year_basionym)
  df$year_basionym[!tf] <- as.numeric(df$year_basionym[!tf])
  
  df$year <- as.numeric(df$year)
  df$year_basionym <- as.numeric(df$year_basionym)
  
  tf <- df$year == df$year_basionym
  
  df <- df %>% tibble::add_column(year_changed = NA, .after = "year_basionym")
  df$year_changed[tf] <- "no"
  df$year_changed[!tf] <- "yes"
  
  if (save) {
    
    if (!dir.exists(dir)) {
      dir.create(dir)
    }
    if (!dir.exists(foldername)) {
      dir.create(foldername)
    }
    
    # Saving the new spreadsheet
    saveCSV(df,
            dir = dir,
            filename = filename,
            verbose = verbose,
            save = save,
            foldername = foldername)
  }
  
  return(df)
}

DBOSlab/expowo documentation built on April 12, 2025, 1:46 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

DBOSlab/expowo
An R package for mining global plant diversity and distribution data

R/get_year_pubs.R
In DBOSlab/expowo: An R package for mining global plant diversity and distribution data

Defines functions get_year_pubs

Documented in get_year_pubs

R Package Documentation

Browse R Packages

We want your feedback!

DBOSlab/expowo An R package for mining global plant diversity and distribution data

R/get_year_pubs.R In DBOSlab/expowo: An R package for mining global plant diversity and distribution data

Defines functions get_year_pubs

Documented in get_year_pubs

R Package Documentation

Browse R Packages

We want your feedback!

DBOSlab/expowo
An R package for mining global plant diversity and distribution data

R/get_year_pubs.R
In DBOSlab/expowo: An R package for mining global plant diversity and distribution data