#' Fuzzymatch varieties using stringdist_join
#'
#' Fuzzymatch the varieties from the raw data with the database using an internal id.
#' This function uses stringdist_join and returns the closest match (smallest distance)
#' To have enough flexibility without returning too many fuzzy matches,
#' this function returns the closest match measured in 2 different ways:
#' 1. select the best_n varieties with the smallest distance, and then select the
#' best_n varieties that have the smallest scaled distance
#' 2. select the best_n varieties with the smallest scaled distance, irrespective
#' of the raw distance
#' See the effect of these different selection methods in test_match_variety_fns.R
#' @param raw_data A data.frame
#' @param database A data.frame
#' @param suffix A string denoting the name of the raw data
#' @param intid_col A string denoting the name of the internal id column
#' used to join the raw data and database
#' @param best_n A numeric denoting the number of best matches to select
#' @param method A string corresponding to a string metric defined by stringdist
#' e.g. "lw", "jw"
#' @importFrom fuzzyjoin stringdist_join
#' @keywords internal
stringdist.variety <- function(raw_data,
database,
intid_col = "name",
best_n,
method_stringdist){
quo_initid_col <- sym(intid_col)
all_stringdist_matches1 <- stringdist_join(raw_data, database,
by = intid_col,
mode = "left",
ignore_case = FALSE,
method = method_stringdist,
max_dist = 99,
distance_col = "dist")
all_stringdist_matches2 <- all_stringdist_matches1 %>%
rename_with(.cols = ends_with(".x"),
~ gsub("\\.x", "", .x)) %>%
rename_with(.cols = ends_with(".y"),
~ gsub("\\.y", "_db", .x))
all_stringdist_matches3 <- all_stringdist_matches2 %>%
group_by(!!quo_initid_col) %>%
mutate(method = paste(method_stringdist, "dist")) %>%
mutate(dist_scaled = ifelse(method == "jw",
dist,
dist / nchar(intid_db))) %>%
slice_min(order_by = dist, n = best_n) %>%
slice_min(order_by = dist_scaled, n = best_n)
all_stringdist_matches4 <- all_stringdist_matches2 %>%
group_by(!!quo_initid_col) %>%
mutate(method = paste(method_stringdist, "dist_scaled")) %>%
mutate(dist_scaled = dist / nchar(intid_db)) %>%
slice_min(order_by = dist_scaled, n = best_n)
all_stringdist_matches5 <- bind_rows(all_stringdist_matches3, all_stringdist_matches4)
return(all_stringdist_matches5)
}
#' Check for any matches
#'
#' After a variety name match (exact or fuzzymatch), check whether there are any
#' exact or approved matches.
#' @param df A data.frame with variety names matched to database names
#' @param group A column name by which to group and search for a match
#' (e.g. to determine if there are any matches for the variety column, use: variety)
#' @param checkfor A column name to check whether there is a match
#' (This parameter is used only for exact matching. For fuzzymatches, the status of the match
#' must be manually entered)
#' @param match_type A string that is either 'db' or 'raw', which refers to processing
#' database entries (only one entry per cultivar) or raw entries which may have many
#' alternate spellings for one cultivar
#' @keywords internal
check.anymatch <- function(df, group, checkfor = NULL, match_type){
quo_checkfor <- enquo(checkfor)
if (!("is_truematch" %in% colnames(df))) {
df <- df %>% mutate(is_truematch = ifelse(is.na(!!quo_checkfor),
FALSE, TRUE))
}
# Standardize is_truematch to logical
df <- df %>% mutate(is_truematch =
ifelse(!is_truematch %in% c(TRUE, FALSE),
NA, is_truematch)) %>%
mutate(is_truematch = as.logical(is_truematch))
if (match_type == "raw") {
quo_group = sym("var_id")
df_check <- df %>%
group_by(!!quo_group)
} else if (match_type == "db") {
quo_group = syms(c("intid", "var_id"))
df_check <- df %>%
group_by(!!!quo_group)
}
df_check2 <- df_check %>%
mutate(any_match = any(is_truematch == "TRUE")) %>%
ungroup()
return(df_check2)
}
#' Return the variety names by whether they have a match, no match, or need to be checked
#'
#' Return the variety names. This function takes a data.frame of variety names
#' and their potential database matches. Given the columns status, which denotes
#' whether the variety has a database match, and any_match, which denotes whether
#' the variety has at least one match, this function separates the data.frame into
#' a list of 4 data.frames: the varieties with database matches, the varieties without
#' database matches, the varieties that still need to be checked manually (with a collaborator),
#' and the duplicate rows of varieties that already have matches
#' @param df_any_match A data.frame that has been processed by check.anymatch()
#' @param is_blends A logical that specifies whether the varieties are blends.
#' Default is FALSE
#' @keywords internal
return.matchgroups <- function(df_any_match, is_blends = FALSE){
df_any_match <- df_any_match %>% mutate(var_id = as.character(var_id))
# Set the group (match, nomatch, check, or not_needed)
if (is_blends) {
df_any_match_groups <-
df_any_match %>%
ungroup() %>%
mutate(group = case_when(
is_truematch == "TRUE" ~ "match",
is_truematch == "FALSE" ~ "nomatch",
!is_truematch %in% c("TRUE", "FALSE", "alias") ~ "check"
))
} else {
df_any_match_groups <-
df_any_match %>%
ungroup() %>%
mutate(group = case_when(
(is_truematch == "TRUE" & any_match == TRUE) ~ "match",
((is_truematch == "FALSE" | is_truematch == "alias") &
any_match == FALSE) ~ "nomatch",
(!is_truematch %in% c("TRUE", "FALSE", "alias") &
any_match == FALSE) ~ "check",
(!is_truematch %in% c("TRUE") & any_match == TRUE) ~ "not_needed"
)) %>%
mutate(group = case_when(is.na(group) & is.na(is_truematch) ~ "check",
is.na(group) & is_truematch == "FALSE" ~ "not_needed",
TRUE ~ group))
}
# Separate groups into individual data.frames
match <- df_any_match_groups %>% filter(group == "match")
nomatch <- df_any_match_groups %>% filter(group == "nomatch") %>%
select(!matches("_db$|db_id|is_truematch|any_match|method")) %>%
unique(.)
match_check <- df_any_match_groups %>% filter(group == "check")
not_needed <- df_any_match_groups %>% filter(group == "not_needed")
# Add the groups into a list
match_list <- list(match, nomatch, match_check, not_needed) %>%
map(., ~.x %>% select(-group))
names(match_list) <- c("match", "nomatch", "check", "not_needed")
# Check that only one database entry matches a given var_id
# This helps to catch duplicate entries in the controlled vocabularies
# Note: if is_blends = TRUE, this is not checked because one entry is supposed
# to match multiple db_ids
if (!is_blends & all(c("db_id", "var_id") %in% names(match_list[["match"]]))) {
test_multmatch <- check.matches(match_list[["match"]])
if (nrow(test_multmatch > 0)) {
warning(paste("Warning: var_id matches with more than one db_id. Please reconcile potential error in database. Use factcuratoR:::check.matches(result[[1]]) to see the varieties that match more than one database name.",
paste0(capture.output(test_multmatch), collapse = "\n")))
}
# Remove alias if there is a match with variety
test_aliasdup <- rm.alias_dupmatch(match_list[["match"]])
if (nrow(test_aliasdup[["not_needed"]] > 0)) {
match_list[["match"]] <- test_aliasdup[["match"]]
match_list[["not_needed"]] <- rbind(match_list[["not_needed"]], test_aliasdup[["not_needed"]])
}
} else if (!is_blends) {
message("var_id and db_id do not exist. test_multmatch and test_aliasdup not run")
}
return(match_list)
}
#' Check that matches have one database match
#'
#' @param match_df A data.frame with variety names and database matches
#' (including volumns var_id and db_id)
#' @keywords internal
check.matches <- function(match_df){
mult_matches <- match_df %>%
select(var_id, db_id) %>%
unique(.) %>%
group_by(var_id) %>%
add_count() %>%
filter(n > 1)
mult_entries <- match_df %>%
filter(var_id %in% mult_matches$var_id)
return(mult_entries)
}
#' Remove alias if there is a match with variety
#'
#' If a variety is listed in the raw data with an alias,
#' both names may match to names in the database.
#' This function will remove the alias and return the listed variety name
#'
#' @param match_df A data.frame that contains var_id, db_id, and type_db
#' ("alias" or "variety")
#' @keywords internal
rm.alias_dupmatch <- function(match_df) {
mult_matches <- match_df %>%
select(var_id, db_id, type_db) %>%
unique(.) %>%
group_by(var_id, db_id) %>%
add_count() %>%
filter(n > 1) %>%
filter(type_db == "alias") %>% select(-n) %>% ungroup()
df_clean <- anti_join(match_df, mult_matches, by = c("var_id", "db_id", "type_db"))
df_rm_alias <- left_join(mult_matches, match_df, by = c("var_id", "db_id", "type_db"))
ls <- list(match = df_clean, not_needed = df_rm_alias)
return(ls)
}
#' Clean the file of fuzzy matches with manual checks
#'
#' Fuzzy matching outputs a file of potential/fuzzy matches.
#' A user must go in a assign whether the fuzzy match is a true match.
# @param curation_folder A path to the curation folder
#' @inheritParams check.anymatch
#' @inheritParams return.matchgroups
#' @param df A data.frame, which can be provided in lieu of the curation_folder
#' and filename
#' @keywords internal
clean_manualfuzzy <- function(match_type = "raw", df){
if (match_type == "raw") {
return_unique <- sym("var_id")
} else if (match_type == "db") {
return_unique <- sym("intid")
}
# Unsure why previously used eval(substitute()), however, this code doesn't work for process_fuzzymatch()
#fuzzy_check2 <- eval(substitute(check.anymatch(df = df,
# match_type = match_type)))
fuzzy_check2 <- check.anymatch(df = df, match_type = match_type)
fuzzy_groups <- return.matchgroups(fuzzy_check2)
message("Varieties left to check: ", nrow(fuzzy_groups[["check"]]))
return(fuzzy_groups)
}
#' Create a data.frame of the names of varieties with no match
#'
#' @param df_nomatch A data.frame of varieties without cultivar matches
#' from the database. This will generally be the output of the function
#' clean_manualfuzzy().
# @param variety_col_name A column name containing varieties
# @param intid_col_name A column name containing internal identifiers
#' @param ... Extra columns to select
#' @keywords internal
create_names_nomatch <- function(df_nomatch, ...){
dots <- enquos(...)
nomatch1 <- df_nomatch %>%
select(var_id,
variety,
intid,
crop_type,
type,
!!!dots) %>%
unique(.) %>%
group_by(var_id) %>% ungroup() %>%
group_by(intid) %>%
mutate(instance = row_number()) %>% ungroup() %>%
arrange(crop_type, intid) %>%
mutate(new_std_name = variety)
}
#' Combine all the matches from all of the cultivar matching steps
#' @param ls A list of results created with list2 from any of the cultivar matching steps
#' @param ... Any number of bare variables to be retained in the output of the matches
#' @keywords internal
collect_matches <- function(ls, ...){
#temp_ls <- rlang::list2(...)
dots <- enquos(...)
dots_string <- purrr::map_chr(dots, ~rlang::as_name(.x))
temp_ls2 <- ls %>%
map(., pluck("match")) %>%
imap(., function(x,y){ x %>% mutate(match_step = y) %>%
# Not every step may have the columns specified by dots, so use one_of()
select(var_id, variety, variety_db, match_step, type, crop_type, one_of(dots_string))
})
matches <- bind_rows(temp_ls2) %>% arrange(var_id)
}
#' Bind the status of fuzzymatches with the fuzzymatches created in this session
#' so that the two stay in sync even if the fuzzymatch function is changed
#' Note: this function may have to be updated to handle multiple auxiliary fuzzymatch files
#' @param output_fuzzymatch_df A data.frame with the newly generated fuzzymatches
#' @param aux_fuzzy_status A data.frame containing the fuzzymatches with is_truematch manually checked
#' @keywords internal
bind_fuzzymatches <- function(output_fuzzymatch_df, aux_fuzzy_status){
output_fuzzymatch_status <-
full_join(output_fuzzymatch_df,
aux_fuzzy_status,
# Need to join by as few variables as possible due to
# changes between the output_fuzzymatch_df and aux_fuzzy_status
# e.g. Cannot bind by db_id this variable is currently newly
# generated on each new run (it depends if changes have
# been made in the controlled_vocab)
by = c("intid",
"intid_db",
"var_id",
"variety"),
suffix = c("", "_status")) %>%
relocate(contains("is_truematch")) %>%
mutate(is_truematch = ifelse(is_truematch == "", NA, is_truematch)) %>%
mutate(is_truematch = ifelse(is.na(is_truematch), is_truematch_status, is_truematch)) %>%
select(is_truematch, intid, intid_db,
variety, variety_db, var_id, db_id,
type, type_db, crop_type)
return(output_fuzzymatch_status)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.