# Generated by fusen: do not edit by hand
#' Deduplicate Data
#'
#' Description
#'
#' @param .score
#' Dataframe generated by scores_data()
#' @param .source
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match
#' A character vector of columns to perform fuzzy matching.
#' @param .min_sim
#' Named vector with minimum similarities
#' @param .col_score
#' Score column generated by scores_data().\cr
#' Options are:\cr
#' - sms: Simple Mean (mean over all fuzzy columns)\cr
#' - smw: Weighted Mean (mean over all fuzzy columns, weighted by get_weights())\cr
#' - smc: Custom Mean (mean over all fuzzy columns, weighted custom weights)\cr
#' - sss: Simple Mean, squared (mean over all fuzzy columns, scores are squared)\cr
#' - ssw: Weighted Mean, squared (mean over all fuzzy columns, scores are squared before weighted by get_weights())\cr
#' - ssc: Custom Mean, squared (mean over all fuzzy columns, scores are squared before weighted custom weights)
#'
#' @return A dataframe
#'
#' @importFrom rlang :=
#'
#' @export
#' @examples
#' tab_source <- table_source[1:100, ]
#' tab_target <- table_target[1:999, ]
#' cols_match <- c("name", "iso3", "city", "address")
#' cols_exact <- "iso3"
#' cols_join <- c("name", "iso3")
#' tab_match <- match_data(
#' .source = tab_source,
#' .target = tab_target,
#' .cols_match = cols_match,
#' .cols_exact = cols_exact,
#' .cols_join = cols_join,
#' .method = "soundex"
#' )
#' tab_score <- scores_data(
#' .matches = tab_match,
#' .source = tab_source,
#' .target = tab_target,
#' .cols_match = cols_match,
#' .cols_exact = cols_exact
#' )
#'
#' dedup_data(
#' .score = tab_score,
#' .source = tab_source,
#' .target = tab_target,
#' .cols_match = cols_match,
#' .col_score = "sms"
#' )
dedup_data <- function(
.score, .source, .target, .cols_match, .min_sim = NULL,
.col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
) {
id_s <- id_t <- name_s <- name_t <- all_s <- all_t <- score <-
`_id_` <- len_s <- len_t <- n_s <- n_t <- sms <- smw <- smc <- sss <-
ssw <- ssc <- NULL
check_id(.source, .target)
cols_score_ <- match.arg(.col_score, c("sms", "smw", "smc", "sss", "ssw", "ssc"))
source_ <- prep_tables(.source, .cols_match)
target_ <- prep_tables(.target, .cols_match)
score_ <- tibble::as_tibble(.score)
col_s_ <- colnames(source_)[!colnames(source_) == "_id_"]
col_t_ <- colnames(target_)[!colnames(target_) == "_id_"]
col_e_ <- col_s_[col_s_ %in% col_t_]
col_e_ <- col_e_[!col_e_ == "id"]
col_e_ <- unlist(purrr::map2(paste0(col_e_, "_s"), paste0(col_e_, "_t"), c))
tab_ <- dplyr::filter(score_, !!dplyr::sym(cols_score_) > 0)
if (!is.null(.min_sim)) {
for (i in seq_len(length(.min_sim))) {
n_ <- paste0("sim_", names(.min_sim)[i])
v_ <- .min_sim[i]
tab_ <- dplyr::filter(tab_, !!dplyr::sym(n_) >= v_)
}
}
tab_ <- tab_ %>%
dplyr::group_by(id_t) %>%
dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
dplyr::mutate(n_t = dplyr::n()) %>%
dplyr::ungroup() %>%
dplyr::group_by(id_s) %>%
dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
dplyr::mutate(n_s = dplyr::n()) %>%
dplyr::ungroup() %>%
dplyr::left_join(
y = dplyr::select(source_, -c(`_id_`)),
by = c("id_s" = "id"),
suffix = c("_s", "_t")
) %>%
dplyr::left_join(
y = dplyr::select(target_, -c(`_id_`)),
by = c("id_t" = "id"),
suffix = c("_s", "_t")
) %>%
dplyr::mutate(
len_s = lengths(all_s),
len_t = lengths(all_t)
)
cols_use_ <- colnames(tab_)
cols_use_ <- cols_use_[cols_use_ %in% c(
"id_s", "id_t", "n_s", "n_t", "all_s", "all_t", "len_s", "len_t",
paste0("sim_", .cols_match), "sms", "smw", "smc", "sss", "ssw", "ssc"
)]
tab_[, c(cols_use_, col_e_)]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.