Nothing
#' Direct Match Species Names Against Peru Mammals Database
#'
#' @description
#' Performs direct matching of species names against the peru_mammals database.
#' Matches binomial names (genus + species) and handles special "sp." cases
#' (e.g., "Akodon sp. Ancash"). Peru mammals database does not include
#' infraspecific taxa.
#'
#' @param df A data frame or tibble containing the species data to be matched.
#' Must include columns: Orig.Genus, Orig.Species, Rank
#' @param target_df A data frame representing the peru_mammals database.
#' Must include columns: genus, species
#'
#' @return
#' A tibble with an additional logical column `direct_match` indicating whether
#' the name was successfully matched (`TRUE`) or not (`FALSE`), plus columns
#' `Matched.Genus` and `Matched.Species` for matched records.
#'
#' @details
#' This function only matches Rank 2 (binomial) names since peru_mammals does
#' not include infraspecific taxa. It handles:
#' - Regular binomials: "Panthera onca"
#' - Special "sp." cases: "Akodon sp. Ancash", "Oligoryzomys sp. B"
#'
#' Names at Rank 1 (genus only) are not matched by this function; use
#' `genus_match()` instead.
#'
#'
#' @keywords internal
direct_match <- function(df, target_df = NULL) {
# ==========================================================================
# SECTION 1: Validate Input Columns
# ==========================================================================
# Required columns for peru_mammals (no infraspecies)
required_cols <- c('Orig.Genus', 'Orig.Species', 'Rank')
# Check for missing columns
missing_cols <- setdiff(required_cols, colnames(df))
if (length(missing_cols) > 0) {
stop(
"direct_match() requires columns: ",
paste(required_cols, collapse = ", "),
"\nProvided dataframe is missing: ",
paste(missing_cols, collapse = ", "),
call. = FALSE
)
}
# Validate target_df
if (is.null(target_df)) {
stop("target_df (peru_mammals) must be provided", call. = FALSE)
}
target_required <- c('genus', 'species')
missing_target <- setdiff(target_required, colnames(target_df))
if (length(missing_target) > 0) {
stop(
"target_df must include columns: ",
paste(target_required, collapse = ", "),
"\nMissing: ", paste(missing_target, collapse = ", "),
call. = FALSE
)
}
# Handle empty dataframe
if (nrow(df) == 0) {
if (!'direct_match' %in% colnames(df)) {
df <- tibble::add_column(df,
direct_match = logical(0),
Matched.Genus = character(0),
Matched.Species = character(0))
}
return(df)
}
# ==========================================================================
# SECTION 2: Initialize Matched Columns
# ==========================================================================
# Ensure Matched columns exist
if (!'Matched.Genus' %in% colnames(df)) {
df$Matched.Genus <- NA_character_
}
if (!'Matched.Species' %in% colnames(df)) {
df$Matched.Species <- NA_character_
}
# ==========================================================================
# SECTION 3: Prepare Target Database
# ==========================================================================
# Standardize target database for case-insensitive matching
# peru_mammals has: genus = "Panthera", species = "onca"
# We need to match against: Orig.Genus = "PANTHERA", Orig.Species = "ONCA"
target_prepared <- target_df |>
dplyr::mutate(
genus_upper = toupper(genus),
species_upper = toupper(species)
)
# ==========================================================================
# SECTION 4: Match Binomial Names (Rank 2)
# ==========================================================================
# Only match Rank 2 (binomial names)
# Peru mammals only has binomials, including special "sp." cases
matched_bino <- df |>
dplyr::filter(Rank == 2) |>
dplyr::semi_join(
target_prepared,
by = c('Orig.Genus' = 'genus_upper',
'Orig.Species' = 'species_upper')
) |>
dplyr::mutate(
Matched.Genus = Orig.Genus,
Matched.Species = Orig.Species
)
# ==========================================================================
# SECTION 5: Identify Unmatched Records
# ==========================================================================
# Unmatched Rank 2: Binomial names that didn't match
unmatched_bino <- df |>
dplyr::filter(Rank == 2) |>
dplyr::anti_join(
target_prepared,
by = c('Orig.Genus' = 'genus_upper',
'Orig.Species' = 'species_upper')
)
# Rank 1 (genus only) are not matched here - they go to genus_match()
rank1_records <- df |>
dplyr::filter(Rank == 1)
# Combine all unmatched (Rank 2 that didn't match + all Rank 1)
unmatched <- dplyr::bind_rows(unmatched_bino, rank1_records)
# ==========================================================================
# SECTION 6: Validate Row Counts
# ==========================================================================
assertthat::assert_that(
nrow(df) == (nrow(matched_bino) + nrow(unmatched)),
msg = paste0(
"Row count mismatch in direct_match():\n",
"Input: ", nrow(df), " rows\n",
"Matched (Rank 2): ", nrow(matched_bino), " rows\n",
"Unmatched: ", nrow(unmatched), " rows\n"
)
)
# ==========================================================================
# SECTION 7: Combine and Return Results
# ==========================================================================
combined <- dplyr::bind_rows(
matched_bino,
unmatched,
.id = 'direct_match'
) |>
dplyr::mutate(direct_match = (direct_match == 1)) |>
dplyr::relocate(c('Orig.Genus', 'Orig.Species'))
return(combined)
}
#' Match Genus Names Against Peru Mammals Database
#'
#' @description
#' Performs direct matching of genus names against the unique genera listed
#' in the peru_mammals database. Useful for Rank 1 (genus-only) names.
#'
#' @param df A data frame or tibble containing the genus names to be matched.
#' Must include column: Orig.Genus
#' @param target_df A data frame representing the peru_mammals database.
#' Must include column: genus
#'
#' @return
#' A tibble with an additional logical column `genus_match` indicating whether
#' the genus was successfully matched (`TRUE`) or not (`FALSE`), plus column
#' `Matched.Genus` for matched records.
#'
#' @details
#' This function is typically used for names submitted at the genus level
#' (Rank 1). When a genus is matched, all species of that genus in peru_mammals
#' can be retrieved for further processing (e.g., suggesting possible species
#' to the user).
#'
#'
#'
#' @keywords internal
genus_match <- function(df, target_df = NULL) {
# ==========================================================================
# SECTION 1: Validate Input
# ==========================================================================
assertthat::assert_that(
'Orig.Genus' %in% colnames(df),
msg = "genus_match() requires column: Orig.Genus"
)
# Validate target_df
if (is.null(target_df)) {
stop("target_df (peru_mammals) must be provided", call. = FALSE)
}
assertthat::assert_that(
'genus' %in% colnames(target_df),
msg = "target_df must include column: genus"
)
# Handle empty input
if (nrow(df) == 0) {
if (!'genus_match' %in% colnames(df)) {
return(tibble::add_column(df,
genus_match = logical(0),
Matched.Genus = character(0)))
} else {
return(df)
}
}
# ==========================================================================
# SECTION 2: Ensure Matched.Genus Column Exists
# ==========================================================================
if (!'Matched.Genus' %in% colnames(df)) {
df$Matched.Genus <- NA_character_
}
# ==========================================================================
# SECTION 3: Prepare Target Database
# ==========================================================================
# Get unique genera from peru_mammals (case-insensitive)
target_prepared <- target_df |>
dplyr::distinct(genus) |>
dplyr::mutate(genus_upper = toupper(genus))
# ==========================================================================
# SECTION 4: Match Genera
# ==========================================================================
matched <- df |>
dplyr::semi_join(
target_prepared,
by = c('Orig.Genus' = 'genus_upper')
) |>
dplyr::mutate(Matched.Genus = Orig.Genus)
# ==========================================================================
# SECTION 5: Identify Unmatched
# ==========================================================================
unmatched <- df |>
dplyr::anti_join(
target_prepared,
by = c('Orig.Genus' = 'genus_upper')
)
# ==========================================================================
# SECTION 6: Validate and Combine
# ==========================================================================
assertthat::assert_that(
nrow(df) == (nrow(matched) + nrow(unmatched)),
msg = paste0(
"Row count mismatch in genus_match():\n",
"Input: ", nrow(df), " rows\n",
"Matched: ", nrow(matched), " rows\n",
"Unmatched: ", nrow(unmatched), " rows"
)
)
combined <- dplyr::bind_rows(
matched,
unmatched,
.id = 'genus_match'
) |>
dplyr::mutate(genus_match = (genus_match == 1)) |>
dplyr::relocate(Orig.Genus)
return(combined)
}
#' Get All Species for Matched Genera from Peru Mammals
#'
#' @description
#' Helper function to retrieve all species belonging to matched genera from
#' the peru_mammals database. Useful for suggesting possible species when
#' only genus is provided.
#'
#' @param matched_genera Character vector of matched genus names (uppercase)
#' @param target_df A data frame representing the peru_mammals database
#'
#' @return
#' A data frame with genus and species columns for all species in the
#' matched genera.
#'
#'
#'
#' @keywords internal
get_species_for_genera <- function(matched_genera, target_df = NULL) {
if (is.null(target_df)) {
stop("target_df (peru_mammals) must be provided", call. = FALSE)
}
# Convert matched genera to proper format for comparison
genera_formatted <- .str_to_simple_cap(matched_genera)
# Filter species
result <- target_df |>
dplyr::filter(genus %in% genera_formatted) |>
dplyr::select(genus, species, scientific_name, common_name) |>
dplyr::distinct()
return(result)
}
# Memoised version for performance
#' @importFrom memoise memoise
#' @keywords internal
memoised_get_species_for_genera <- memoise::memoise(get_species_for_genera)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.