R/fuzzy_match.R

Defines functions get_ambiguous_matches fuzzy_match_infraspecies2_within_infraspecies_helper fuzzy_match_infraspecies2_within_infraspecies fuzzy_match_infraspecies_within_species_helper fuzzy_match_infraspecies_within_species fuzzy_match_species_within_genus_helper fuzzy_match_species_within_genus fuzzy_match_genus

Documented in fuzzy_match_genus fuzzy_match_infraspecies2_within_infraspecies fuzzy_match_infraspecies2_within_infraspecies_helper fuzzy_match_infraspecies_within_species fuzzy_match_infraspecies_within_species_helper fuzzy_match_species_within_genus fuzzy_match_species_within_genus_helper get_ambiguous_matches

#' Fuzzy Match Genus Name
#'
#' @description
#' This function performs a fuzzy match of genus names against the threatened
#' species database using fuzzyjoin::stringdist() to account for slight
#' variations in spelling.
#'
#' @param df A tibble containing the genus names to be matched.
#' @param target_df A tibble representing the threatened species database
#'   containing the reference list of threatened species.
#'
#' @return
#' A tibble with two additional columns:
#' - fuzzy_match_genus: A logical column indicating whether the genus was
#'   successfully matched (`TRUE`) or not (`FALSE`).
#' - fuzzy_genus_dist: A numeric column representing the distance for each match.
#'
#' @details
#' If multiple genera match with the same string distance (ambiguous matches),
#' a warning is issued and the first match is automatically selected. To
#' examine ambiguous matches in detail, use \code{\link{get_ambiguous_matches}}
#' on the result object.
#'
#' **IMPROVED**: Ambiguous match attributes now include database information
#' such as family and representative species for better manual curation.
#'
#' @seealso \code{\link{get_ambiguous_matches}} to retrieve ambiguous match details
#'
#' @keywords internal
fuzzy_match_genus <- function(df, target_df = NULL) {

  # ========================================================================
  # SECTION 1: Input Validation
  # ========================================================================

  assertthat::assert_that(all(c('Orig.Genus',
                                'Orig.Species',
                                'Orig.Infraspecies',
                                'Orig.Infraspecies_2') %in% colnames(df)))

  # Handle empty input tibble
  if (nrow(df) == 0) {
    if (!all(c('fuzzy_match_genus', 'fuzzy_genus_dist') %in% colnames(df))) {
      return(tibble::add_column(df,
                                fuzzy_match_genus = NA,
                                fuzzy_genus_dist = NA))
    } else {
      return(df)
    }
  }

  # Remove existing fuzzy_genus_dist column if present (for sequential matching)
  if ('fuzzy_genus_dist' %in% colnames(df)) {
    df <- df |>
      dplyr::mutate(fuzzy_genus_dist = NULL)
  }

  # ========================================================================
  # SECTION 2: Fuzzy Matching
  # ========================================================================

  Threatened.Genera <- target_df |>
    dplyr::distinct(genus)

  # Perform fuzzy match
  matched_temp <- df |>
    fuzzyjoin::stringdist_left_join(Threatened.Genera,
                                    by = c('Orig.Genus' = 'genus'),
                                    max_dist = 1,
                                    distance_col = 'fuzzy_genus_dist') |>
    dplyr::mutate(Matched.Genus = genus) |>
    dplyr::select(-c('genus')) |>
    dplyr::group_by(Orig.Genus, Orig.Species) |>
    dplyr::filter(fuzzy_genus_dist == min(fuzzy_genus_dist))

  # ========================================================================
  # SECTION 3: Handle Ambiguous Matches (IMPROVED WITH DATABASE INFO)
  # ========================================================================

  # Detect ambiguous matches (multiple genera with same distance)
  ambiguous_matches <- matched_temp |>
    dplyr::filter(dplyr::n() > 1)

  if (nrow(ambiguous_matches) > 0) {
    # Count unique genera with ambiguous matches
    n_ambiguous_genera <- ambiguous_matches |>
      dplyr::distinct(Orig.Genus) |>
      nrow()

    # Issue informative warning
    warning(
      "Found ", n_ambiguous_genera, " genera with multiple fuzzy matches ",
      "(tied string distances).\n",
      "  The algorithm will automatically select the first match.\n",
      "  To examine ambiguous matches, use: ",
      "get_ambiguous_matches(result, type = 'genus')\n",
      "  Consider manual curation for critical applications.",
      call. = FALSE,
      immediate. = TRUE
    )

    # IMPROVED: Store enriched ambiguous match information
    attr(matched_temp, "ambiguous_genera") <- ambiguous_matches |>
      dplyr::ungroup() |>
      dplyr::select(Orig.Genus, Orig.Species, Matched.Genus, fuzzy_genus_dist) |>
      dplyr::distinct() |>
      # Join with target_df to get additional database information
      dplyr::left_join(
        target_df |>
          dplyr::distinct(genus, family) |>
          dplyr::rename(Matched.Genus = genus, Matched.Family = family),
        by = "Matched.Genus"
      ) |>
      dplyr::distinct() |>
      # Add representative species count per genus
      dplyr::left_join(
        target_df |>
          dplyr::group_by(genus) |>
          dplyr::summarise(
            n_species_in_db = dplyr::n_distinct(species)#,
            #example_species = paste(head(unique(species), 2), collapse = ", ")
          ) |>
          dplyr::rename(Matched.Genus = genus),
        by = "Matched.Genus"
      ) |>
      dplyr::distinct() |>
      dplyr::arrange(Orig.Genus, fuzzy_genus_dist, Matched.Genus)
  }

  # ========================================================================
  # SECTION 4: Select First Match for Ambiguous Cases
  # ========================================================================

  matched <- matched_temp |>
    dplyr::group_modify(
      ~ifelse(nrow(.x) == 0,
              return(.x),
              return(dplyr::slice_head(.x, n = 1)))
    ) |>
    dplyr::ungroup()

  # Preserve ambiguous match attribute if it exists
  if (!is.null(attr(matched_temp, "ambiguous_genera"))) {
    attr(matched, "ambiguous_genera") <- attr(matched_temp, "ambiguous_genera")
  }

  # ========================================================================
  # SECTION 5: Identify Unmatched and Combine Results
  # ========================================================================

  unmatched <- df |>
    fuzzyjoin::stringdist_anti_join(Threatened.Genera,
                                    by = c('Orig.Genus' = 'genus'),
                                    max_dist = 1)

  assertthat::assert_that(nrow(df) == (nrow(matched) + nrow(unmatched)))

  res <- dplyr::bind_rows(matched, unmatched,
                          .id = 'fuzzy_match_genus') |>
    dplyr::mutate(fuzzy_match_genus = (fuzzy_match_genus == 1)) |>
    dplyr::arrange(Orig.Genus, Orig.Species) |>
    dplyr::relocate(c('Orig.Genus',
                      'Orig.Species',
                      'Orig.Infraspecies'))

  # Preserve ambiguous match attribute in final result
  if (!is.null(attr(matched, "ambiguous_genera"))) {
    attr(res, "ambiguous_genera") <- attr(matched, "ambiguous_genera")
  }

  return(res)
}


# =============================================================================
# FUZZY MATCH SPECIES WITHIN GENUS
# =============================================================================

#' Fuzzy Match Species within Genus
#'
#' @description
#' This function attempts to fuzzy match species names within a genus to the
#' threatened species database using fuzzyjoin::stringdist for fuzzy matching.
#'
#' @param df A tibble containing the species data to be matched.
#' @param target_df A tibble representing the threatened species database
#'   containing the reference list of threatened species.
#'
#' @return
#' A tibble with an additional logical column fuzzy_match_species_within_genus,
#' indicating whether the specific epithet was successfully fuzzy matched within
#' the matched genus (`TRUE`) or not (`FALSE`).
#'
#' @details
#' If multiple species match with the same string distance (ambiguous matches),
#' a warning is issued and the first match is automatically selected. To
#' examine ambiguous matches in detail, use \code{\link{get_ambiguous_matches}}
#' on the result object with \code{type = "species"}.
#'
#' **IMPROVED**: Ambiguous match attributes now include threat category and
#' accepted names for better decision-making.
#'
#' @seealso \code{\link{get_ambiguous_matches}} to retrieve ambiguous match details
#'
#' @keywords internal

fuzzy_match_species_within_genus <- function(df, target_df = NULL) {

  assertthat::assert_that(all(c('Orig.Genus',
                                'Orig.Species',
                                'Orig.Infraspecies',
                                'Matched.Genus') %in% colnames(df)))

  if (nrow(df) == 0) {
    if (!all(c('fuzzy_match_species_within_genus',
               'fuzzy_species_dist') %in% colnames(df))) {
      return(tibble::add_column(df,
                                fuzzy_match_species_within_genus = NA,
                                fuzzy_species_dist = NA))
    } else {
      return(df)
    }
  }

  if ('fuzzy_species_dist' %in% colnames(df)) {
    df <- df |>
      dplyr::mutate(fuzzy_species_dist = NULL)
  }

  res_list <- df |>
    dplyr::group_by(Matched.Genus) |>
    dplyr::group_split()

  # Process each genus group
  res_with_attrs <- lapply(res_list, function(chunk) {
    fuzzy_match_species_within_genus_helper(chunk, target_df)
  })

  # Extract ambiguous attributes
  all_ambiguous <- lapply(res_with_attrs, function(chunk_result) {
    attr(chunk_result, "ambiguous_species")
  })

  all_ambiguous <- Filter(Negate(is.null), all_ambiguous)

  consolidated_ambiguous <- if (length(all_ambiguous) > 0) {
    dplyr::bind_rows(all_ambiguous)
  } else {
    NULL
  }

  # Combine results
  res <- dplyr::bind_rows(res_with_attrs) |>
    dplyr::relocate(c('Orig.Genus',
                      'Orig.Species',
                      'Orig.Infraspecies'))

  # Re-attach consolidated attribute
  if (!is.null(consolidated_ambiguous) && nrow(consolidated_ambiguous) > 0) {
    attr(res, "ambiguous_species") <- consolidated_ambiguous
  }

  return(res)
}


#' Fuzzy Match Species within Genus - Helper
#' @keywords internal
fuzzy_match_species_within_genus_helper <- function(df, target_df) {

  genus <- df |>
    dplyr::distinct(Matched.Genus) |>
    unlist()

  database_subset <- memoised_get_threatened_genus(genus, target_df)

  matched <- df |>
    fuzzyjoin::stringdist_left_join(database_subset,
                                    by = c('Orig.Species' = 'species'),
                                    distance_col = 'fuzzy_species_dist') |>
    dplyr::mutate(Matched.Species = species) |>
    dplyr::select(-c('species', 'genus')) |>
    dplyr::group_by(Orig.Genus, Orig.Species) |>
    dplyr::filter(fuzzy_species_dist == min(fuzzy_species_dist))

  # IMPROVED: Handle ambiguous species matches with database info
  ambiguous_matches <- matched |>
    dplyr::filter(dplyr::n() > 1)

  if (nrow(ambiguous_matches) > 0) {
    n_ambiguous_species <- ambiguous_matches |>
      dplyr::distinct(Orig.Genus, Orig.Species) |>
      nrow()

    warning(
      "Found ", n_ambiguous_species, " species with multiple fuzzy matches ",
      "within genus '", genus, "' (tied string distances).\n",
      "  The algorithm will automatically select the first match.\n",
      "  To examine ambiguous matches, use: ",
      "get_ambiguous_matches(result, type = 'species')\n",
      "  Consider manual curation for critical applications.",
      call. = FALSE,
      immediate. = TRUE
    )

    # IMPROVED: Store enriched information
    attr(matched, "ambiguous_species") <- ambiguous_matches |>
      dplyr::ungroup() |>
      dplyr::select(Orig.Genus, Orig.Species, Matched.Species,
                    fuzzy_species_dist) |>
      dplyr::distinct() |>
      # Join with target_df to get threat category and accepted names
      dplyr::left_join(
        target_df |>
          dplyr::select(genus, species, threat_category, accepted_name,
                        taxonomic_status) |>
          dplyr::distinct(genus, species, .keep_all = TRUE) |>
          dplyr::rename(Matched.Genus = genus, Matched.Species = species),
        by = c("Matched.Genus", "Matched.Species")
      ) |>
      dplyr::distinct() |>
      dplyr::arrange(Orig.Genus, Orig.Species, fuzzy_species_dist, Matched.Species)
  }

  matched_final <- matched |>
    dplyr::group_modify(
      ~ifelse(nrow(.x) == 0,
              return(.x),
              return(dplyr::slice_head(.x, n = 1)))
    ) |>
    dplyr::ungroup()

  if (!is.null(attr(matched, "ambiguous_species"))) {
    attr(matched_final, "ambiguous_species") <- attr(matched, "ambiguous_species")
  }

  unmatched <- fuzzyjoin::stringdist_anti_join(df,
                                               database_subset,
                                               by = c('Orig.Species' = 'species'))

  assertthat::assert_that(nrow(df) == (nrow(matched_final) + nrow(unmatched)))

  combined <- dplyr::bind_rows(matched_final, unmatched,
                               .id = 'fuzzy_match_species_within_genus') |>
    dplyr::mutate(fuzzy_match_species_within_genus =
                    (fuzzy_match_species_within_genus == 1)) |>
    dplyr::relocate(c('Orig.Genus',
                      'Orig.Species',
                      'Orig.Infraspecies'))

  # Preserve attribute after bind_rows()
  if (!is.null(attr(matched_final, "ambiguous_species"))) {
    attr(combined, "ambiguous_species") <- attr(matched_final, "ambiguous_species")
  }

  return(combined)
}


# =============================================================================
# FUZZY MATCH INFRASPECIES WITHIN SPECIES (IMPROVED)
# =============================================================================

#' Fuzzy Match Infraspecific Epithet within Species
#' @keywords internal
fuzzy_match_infraspecies_within_species <- function(df,
                                                    target_df = NULL,
                                                    source = "original") {

  use_infraspecies_2 <- (source == "original")

  required_cols <- c(
    'Orig.Genus', 'Orig.Species', 'Orig.Infra.Rank', 'Orig.Infraspecies',
    'Orig.Infraspecies_2', 'Matched.Genus', 'Matched.Species'
  )

  assertthat::assert_that(
    all(required_cols %in% colnames(df)),
    msg = paste(
      "Missing required columns:",
      paste(setdiff(required_cols, colnames(df)), collapse = ", ")
    )
  )

  if (nrow(df) == 0) {
    if (!all(c('fuzzy_match_infraspecies', 'fuzzy_infraspecies_dist') %in% colnames(df))) {
      return(
        tibble::add_column(
          df,
          fuzzy_match_infraspecies = logical(0),
          fuzzy_infraspecies_dist = numeric(0)
        )
      )
    } else {
      return(df)
    }
  }

  if ('fuzzy_infraspecies_dist' %in% colnames(df)) {
    df <- df |>
      dplyr::mutate(fuzzy_infraspecies_dist = NULL)
  }

  res_list <- df |>
    dplyr::group_by(Matched.Species) |>
    dplyr::group_split()

  # Process each species group
  res_with_attrs <- lapply(res_list, function(chunk) {
    fuzzy_match_infraspecies_within_species_helper(chunk, target_df, source = source)
  })

  # Extract ambiguous attributes
  all_ambiguous <- lapply(res_with_attrs, function(chunk_result) {
    attr(chunk_result, "ambiguous_infraspecies")
  })

  all_ambiguous <- Filter(Negate(is.null), all_ambiguous)

  consolidated_ambiguous <- if (length(all_ambiguous) > 0) {
    dplyr::bind_rows(all_ambiguous)
  } else {
    NULL
  }

  # Combine results
  res <- dplyr::bind_rows(res_with_attrs) |>
    dplyr::relocate(c(
      'Orig.Genus', 'Orig.Species', 'Orig.Infra.Rank',
      'Orig.Infraspecies', 'Orig.Infraspecies_2'
    ))

  # Re-attach consolidated attribute
  if (!is.null(consolidated_ambiguous) && nrow(consolidated_ambiguous) > 0) {
    attr(res, "ambiguous_infraspecies") <- consolidated_ambiguous
  }

  return(res)
}

#' Helper: Fuzzy Match Infraspecific Epithet within Species (IMPROVED)
#' @keywords internal
fuzzy_match_infraspecies_within_species_helper <- function(df,
                                                           target_df,
                                                           source = "original") {

  use_infraspecies_2 <- (source == "original")

  species_matched <- df |>
    dplyr::distinct(Matched.Species) |>
    dplyr::pull(Matched.Species)

  get_threatened_infraspecies <- function(species_matched,
                                          target_df = NULL,
                                          source = source) {
    use_infraspecies_2 <- (source == "original")

    if (use_infraspecies_2 == TRUE) {
      return(
        target_df |>
          dplyr::filter(species %in% species_matched) |>
          dplyr::select(c('genus', 'species', 'tag', 'infraspecies',
                          'scientific_name', 'accepted_name',
                          'threat_category', 'taxonomic_status')) |>
          dplyr::mutate(tag = toupper(tag)) |>
          tidyr::drop_na(tag, infraspecies)
      )
    } else {
      return(
        target_df |>
          dplyr::filter(species %in% species_matched) |>
          dplyr::select(c('genus', 'species', 'tag_acc', 'infraspecies',
                          'scientific_name', 'accepted_name',
                          'threat_category', 'taxonomic_status')) |>
          dplyr::mutate(tag_acc = toupper(tag_acc)) |>
          tidyr::drop_na(tag_acc, infraspecies)
      )
    }
  }

  memoised_get_threatened_infrasp <- memoise::memoise(get_threatened_infraspecies)

  database_subset <- memoised_get_threatened_infrasp(species_matched,
                                                     target_df,
                                                     source = source)

  if (nrow(database_subset) == 0) {
    return(
      df |>
        dplyr::mutate(
          fuzzy_match_infraspecies = FALSE,
          fuzzy_infraspecies_dist = NA_real_
        )
    )
  }

  tag_col_to_remove <- if (use_infraspecies_2) "tag" else "tag_acc"

  matched <- df |>
    fuzzyjoin::stringdist_left_join(database_subset,
                                    by = c('Orig.Infraspecies' = 'infraspecies'),
                                    distance_col = 'fuzzy_infraspecies_dist') |>
    dplyr::mutate(Matched.Infraspecies = infraspecies) |>
    dplyr::select(-c('species', 'genus', 'infraspecies'),
                  -dplyr::all_of(tag_col_to_remove)) |>
    dplyr::group_by(Orig.Genus, Orig.Species, Orig.Infra.Rank, Orig.Infraspecies) |>
    dplyr::filter(fuzzy_infraspecies_dist == min(fuzzy_infraspecies_dist))

  # IMPROVED: Handle ambiguous infraspecies matches with full database info
  ambiguous_matches <- matched |>
    dplyr::filter(dplyr::n() > 1)

  if (nrow(ambiguous_matches) > 0) {
    n_ambiguous <- ambiguous_matches |>
      dplyr::distinct(Orig.Genus, Orig.Species, Orig.Infraspecies) |>
      nrow()

    warning(
      "Found ", n_ambiguous, " infraspecies with multiple fuzzy matches ",
      "within species '", species_matched, "' (tied string distances).\n",
      "  The algorithm will automatically select the first match.\n",
      "  To examine ambiguous matches, use: ",
      "get_ambiguous_matches(result, type = 'infraspecies')\n",
      "  Consider manual curation for critical applications.",
      call. = FALSE,
      immediate. = TRUE
    )

    # IMPROVED: Store complete database information
    attr(matched, "ambiguous_infraspecies") <- ambiguous_matches |>
      dplyr::ungroup() |>
      dplyr::select(
        Orig.Genus, Orig.Species, Orig.Infra.Rank, Orig.Infraspecies,
        Matched.Infraspecies, fuzzy_infraspecies_dist,
        scientific_name, accepted_name, threat_category, taxonomic_status
      ) |>
      dplyr::distinct() |>
      # Add infraspecies_2 information if available (for Rank 4 matches)
      dplyr::left_join(
        target_df |>
          dplyr::select(infraspecies, infraspecies_2) |>
          dplyr::filter(!is.na(infraspecies_2)) |>
          dplyr::distinct() |>
          dplyr::rename(Matched.Infraspecies = infraspecies#,
                        #Matched.Infraspecies_2_example = infraspecies_2
                        ),
        by = "Matched.Infraspecies"
      ) |>
      dplyr::arrange(Orig.Genus, Orig.Species, Orig.Infraspecies,
                     fuzzy_infraspecies_dist, Matched.Infraspecies) |>
      dplyr::distinct()
  }

  matched_final <- matched |>
    dplyr::group_modify(
      ~ifelse(nrow(.x) == 0, return(.x),
              return(dplyr::slice_head(.x, n = 1)))
    ) |>
    dplyr::ungroup()

  # Clean up extra columns before returning
  matched_final <- matched_final |>
    dplyr::select(-dplyr::any_of(c('scientific_name', 'accepted_name',
                                   'threat_category', 'taxonomic_status')))

  if (!is.null(attr(matched, "ambiguous_infraspecies"))) {
    attr(matched_final, "ambiguous_infraspecies") <- attr(matched, "ambiguous_infraspecies")
  }

  unmatched <- fuzzyjoin::stringdist_anti_join(
    dplyr::filter(df, !is.na(Orig.Infraspecies)),
    database_subset,
    by = c('Orig.Infraspecies' = 'infraspecies')
  )

  assertthat::assert_that(nrow(df) == (nrow(matched_final) + nrow(unmatched)))

  combined <- dplyr::bind_rows(matched_final,
                               unmatched,
                               .id = 'fuzzy_match_infraspecies') |>
    dplyr::mutate(fuzzy_match_infraspecies = (fuzzy_match_infraspecies == "1")) |>
    dplyr::relocate(c('Orig.Genus', 'Orig.Species', 'Orig.Infraspecies',
                      'Orig.Infra.Rank', 'Orig.Infraspecies_2'))

  # Preserve attribute after bind_rows()
  if (!is.null(attr(matched_final, "ambiguous_infraspecies"))) {
    attr(combined, "ambiguous_infraspecies") <- attr(matched_final, "ambiguous_infraspecies")
  }

  return(combined)
}


# =============================================================================
# FUZZY MATCH INFRASPECIES LEVEL 2 (Keep original implementation)
# =============================================================================

#' Fuzzy Match Infraspecies Level 2 within Infraspecies Level 1
#' @keywords internal
fuzzy_match_infraspecies2_within_infraspecies <- function(df, target_df = NULL) {

  assertthat::assert_that(all(c('Orig.Genus', 'Orig.Species', 'Orig.Infraspecies',
                                'Orig.Infraspecies_2', 'Matched.Genus',
                                'Matched.Species', 'Matched.Infraspecies') %in% colnames(df)))

  if (nrow(df) == 0) {
    if (!all(c('fuzzy_match_infraspecies_2',
               'fuzzy_infraspecies_2_dist') %in% colnames(df))) {
      return(tibble::add_column(df,
                                fuzzy_match_infraspecies_2 = NA,
                                fuzzy_infraspecies_2_dist = NA))
    } else {
      return(df)
    }
  }

  if ('fuzzy_infraspecies_2_dist' %in% colnames(df)) {
    df <- df |>
      dplyr::mutate(fuzzy_infraspecies_2_dist = NULL)
  }

  res_list <- df |>
    dplyr::group_by(Matched.Infraspecies) |>
    dplyr::group_split()

  res_with_attrs <- lapply(res_list, function(chunk) {
    fuzzy_match_infraspecies2_within_infraspecies_helper(chunk, target_df)
  })

  all_ambiguous <- lapply(res_with_attrs, function(chunk_result) {
    attr(chunk_result, "ambiguous_infraspecies_2")
  })

  all_ambiguous <- Filter(Negate(is.null), all_ambiguous)

  consolidated_ambiguous <- if (length(all_ambiguous) > 0) {
    dplyr::bind_rows(all_ambiguous)
  } else {
    NULL
  }

  res <- dplyr::bind_rows(res_with_attrs)

  if (!is.null(consolidated_ambiguous) && nrow(consolidated_ambiguous) > 0) {
    attr(res, "ambiguous_infraspecies_2") <- consolidated_ambiguous
  }

  return(res)
}


#' Helper function for fuzzy matching infraspecies level 2
#' @keywords internal
fuzzy_match_infraspecies2_within_infraspecies_helper <- function(df, target_df) {

  infraspecies1 <- df |>
    dplyr::distinct(Matched.Infraspecies) |>
    unlist()

  get_threatened_infraspecies2 <- function(infraspecies1, target_df = NULL) {
    return(target_df |>
             dplyr::filter(infraspecies %in% infraspecies1) |>
             dplyr::select(c('genus', 'species',
                             'infraspecies', 'infraspecies_2',
                             'scientific_name', 'accepted_name',
                             'threat_category', 'taxonomic_status')))
  }

  memoised_get_threatened_infrasp2 <- memoise::memoise(get_threatened_infraspecies2)

  database_subset <- memoised_get_threatened_infrasp2(infraspecies1, target_df) |>
    tidyr::drop_na(infraspecies_2)

  if (nrow(database_subset) == 0) {
    return(df |>
             dplyr::mutate(fuzzy_match_infraspecies_2 = FALSE,
                           fuzzy_infraspecies_2_dist = NA_real_))
  }

  matched <- df |>
    fuzzyjoin::stringdist_left_join(database_subset,
                                    by = c('Orig.Infraspecies_2' = 'infraspecies_2'),
                                    distance_col = 'fuzzy_infraspecies_2_dist') |>
    dplyr::mutate(Matched.Infraspecies_2 = infraspecies_2) |>
    dplyr::select(-c('species', 'genus', 'infraspecies', 'infraspecies_2')) |>
    dplyr::group_by(Orig.Genus, Orig.Species, Orig.Infraspecies, Orig.Infraspecies_2) |>
    dplyr::filter(fuzzy_infraspecies_2_dist == min(fuzzy_infraspecies_2_dist))

  # IMPROVED: Handle ambiguous infraspecies 2 matches
  ambiguous_matches <- matched |>
    dplyr::filter(dplyr::n() > 1)

  if (nrow(ambiguous_matches) > 0) {
    n_ambiguous <- ambiguous_matches |>
      dplyr::distinct(Orig.Genus, Orig.Species, Orig.Infraspecies,
                      Orig.Infraspecies_2) |>
      nrow()

    warning(
      "Found ", n_ambiguous, " infraspecies level 2 with multiple fuzzy matches ",
      "within infraspecies '", infraspecies1, "' (tied string distances).\n",
      "  The algorithm will automatically select the first match.\n",
      "  To examine ambiguous matches, use: ",
      "get_ambiguous_matches(result, type = 'infraspecies')\n",
      "  Consider manual curation for critical applications.",
      call. = FALSE,
      immediate. = TRUE
    )

    # IMPROVED: Store complete information
    attr(matched, "ambiguous_infraspecies_2") <- ambiguous_matches |>
      dplyr::ungroup() |>
      dplyr::select(
        Orig.Genus, Orig.Species, Orig.Infraspecies, Orig.Infraspecies_2,
        Matched.Infraspecies_2, fuzzy_infraspecies_2_dist,
        scientific_name, accepted_name, threat_category, taxonomic_status
      ) |>
      dplyr::arrange(Orig.Genus, Orig.Species, Orig.Infraspecies,
                     Orig.Infraspecies_2, fuzzy_infraspecies_2_dist,
                     Matched.Infraspecies_2)
  }

  matched_final <- matched |>
    dplyr::group_modify(
      ~ifelse(nrow(.x) == 0, return(.x),
              return(dplyr::slice_head(.x, n = 1)))
    ) |>
    dplyr::ungroup()

  # Clean up extra columns
  matched_final <- matched_final |>
    dplyr::select(-dplyr::any_of(c('scientific_name', 'accepted_name',
                                   'threat_category', 'taxonomic_status')))

  if (!is.null(attr(matched, "ambiguous_infraspecies_2"))) {
    attr(matched_final, "ambiguous_infraspecies_2") <- attr(matched, "ambiguous_infraspecies_2")
  }

  unmatched <- fuzzyjoin::stringdist_anti_join(
    dplyr::filter(df, !is.na(Orig.Infraspecies_2)),
    database_subset,
    by = c('Orig.Infraspecies_2' = 'infraspecies_2')
  )

  assertthat::assert_that(
    nrow(df) == (nrow(matched_final) + nrow(unmatched)),
    msg = "Row count mismatch in fuzzy_match_infraspecies2"
  )

  combined <- dplyr::bind_rows(matched_final,
                               unmatched,
                               .id = 'fuzzy_match_infraspecies_2') |>
    dplyr::mutate(fuzzy_match_infraspecies_2 = (fuzzy_match_infraspecies_2 == "1")) |>
    dplyr::relocate(c('Orig.Genus',
                      'Orig.Species',
                      'Orig.Infraspecies',
                      'Orig.Infraspecies_2'))

  if (!is.null(attr(matched_final, "ambiguous_infraspecies_2"))) {
    attr(combined, "ambiguous_infraspecies_2") <- attr(matched_final, "ambiguous_infraspecies_2")
  }

  return(combined)
}

# =============================================================================
# =============================================================================
# GET AMBIGUOUS MATCHES - COMPANION FUNCTION
# =============================================================================

#' Retrieve Ambiguous Match Information
#'
#' @description
#' Extracts information about ambiguous matches (multiple candidates with
#' tied distances) from matching results. This is useful for quality control
#' and manual curation of uncertain matches.
#'
#' @param match_result A tibble returned by matching functions such as
#'   \code{\link{matching_threatenedperu}} or internal matching functions.
#' @param type Character. Type of ambiguous matches to retrieve:
#'   \itemize{
#'     \item \code{"genus"} (default): Ambiguous genus-level matches
#'     \item \code{"species"}: Ambiguous species-level matches
#'     \item \code{"infraspecies"}: Ambiguous infraspecies-level matches (includes level 2)
#'     \item \code{"all"}: All types of ambiguous matches
#'   }
#' @param save_to_file Logical. If TRUE, saves results to a CSV file.
#'   Default is FALSE (CRAN compliant - no automatic file writing).
#' @param output_dir Character. Directory to save the file if save_to_file = TRUE.
#'   Defaults to \code{tempdir()} for safe file operations.
#'
#' @return
#' A tibble with ambiguous match details, or NULL if no ambiguous matches exist.
#' Columns depend on the match type but typically include original names,
#' matched names, and distance metrics.
#'
#' @details
#' During fuzzy matching, multiple candidates may have identical string distances,
#' making the choice of match ambiguous. The matching algorithm automatically
#' selects the first candidate, but this function allows you to:
#' \itemize{
#'   \item Review all ambiguous matches for quality control
#'   \item Export them for manual curation
#'   \item Make informed decisions about match quality
#' }
#'
#' @section File Output:
#' When \code{save_to_file = TRUE}, a timestamped CSV file is created:
#' \itemize{
#'   \item Filename format: "threatenedperu_ambiguous_[type]_[timestamp].csv"
#'   \item Location: \code{output_dir} (defaults to tempdir())
#'   \item Contains all ambiguous matches with metadata
#' }
#'
#' @export
get_ambiguous_matches <- function(match_result,
                                     type = c("genus", "species", "infraspecies", "all"),
                                     save_to_file = FALSE,
                                     output_dir = tempdir()) {
  type <- match.arg(type)

  if (!inherits(match_result, "data.frame")) {
    stop("match_result must be a data frame or tibble.", call. = FALSE)
  }

  # Helper to extract and standardize
  extract_ambiguous <- function(attr_name, match_type_label) {
    data <- attr(match_result, attr_name)
    if (is.null(data) || !is.data.frame(data) || nrow(data) == 0) {
      return(NULL)
    }
    tibble::as_tibble(data) |>
      dplyr::mutate(Match_Type = match_type_label, .before = 1)
  }

  # Extract based on type
  result_list <- list()

  if (type %in% c("genus", "all")) {
    result_list$genus <- extract_ambiguous("ambiguous_genera", "Genus")
  }

  if (type %in% c("species", "all")) {
    result_list$species <- extract_ambiguous("ambiguous_species", "Species")
  }

  if (type %in% c("infraspecies", "all")) {
    lvl1 <- extract_ambiguous("ambiguous_infraspecies", "Infraspecies")
    lvl2 <- extract_ambiguous("ambiguous_infraspecies_2", "Infraspecies_2")

    if (!is.null(lvl1) && !is.null(lvl2)) {
      # Combine both levels
      combined <- dplyr::bind_rows(lvl1, lvl2)

      # Identify all key columns dynamically
      orig_keys <- grep("^Orig\\.", names(combined), value = TRUE)
      matched_keys <- grep("^Matched\\.", names(combined), value = TRUE)

      # For infraspecies, we need to keep DIFFERENT candidates for the same input
      # A candidate is unique by: Original name + Matched name + distance
      dedup_keys <- c(orig_keys, matched_keys, "Match_Type")
      dedup_keys <- intersect(dedup_keys, names(combined))

      result_list$infraspecies <- combined |>
        dplyr::distinct(dplyr::across(dplyr::all_of(dedup_keys)), .keep_all = TRUE)

    } else if (!is.null(lvl1)) {
      result_list$infraspecies <- lvl1
    } else if (!is.null(lvl2)) {
      result_list$infraspecies <- lvl2
    }
  }

  # Remove NULLs
  result_list <- purrr::compact(result_list)

  if (length(result_list) == 0) {
    message("No ambiguous ", type, " matches found.")
    return(invisible(NULL))
  }

  # Combine
  result <- if (type == "all") {
    dplyr::bind_rows(result_list)
  } else {
    result_list[[type]]
  }
  result <- result |>
    dplyr::select(dplyr::matches("^Orig\\.", perl = TRUE),
                  dplyr::matches("_name$", perl = TRUE)) |>
    dplyr::distinct()

  # Summary
  n_rows <- nrow(result)
  orig_cols <- grep("^Orig\\.", names(result), value = TRUE)
  n_orig <- if (length(orig_cols) > 0) {
    result |> dplyr::distinct(dplyr::across(dplyr::all_of(orig_cols))) |> nrow()
  } else {
    NA_integer_
  }

  message(
    "Found ", n_rows, " ambiguous match(es)",
    if (!is.na(n_orig)) paste0(" for ", n_orig, " unique input name(s)") else "",
    "."
  )

  # Optional file save
  if (save_to_file) {
    if (!dir.exists(output_dir)) {
      dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
    }

    filename <- sprintf(
      "threatenedperu_ambiguous_%s_%s.csv",
      type,
      format(Sys.time(), "%Y%m%d_%H%M%S")
    )
    filepath <- file.path(output_dir, filename)

    tryCatch({
      readr::write_csv(result, filepath)
      message("Saved to: ", filepath)
    }, error = function(e) {
      warning("Failed to save file: ", e$message, call. = FALSE)
    })
  }

  return(result)
}

Try the peruflorads43 package in your browser

Any scripts or data that you put into this service are public.

peruflorads43 documentation built on Nov. 24, 2025, 10:15 a.m.