R/dedup_data.R

Defines functions dedup_data

Documented in dedup_data

# Generated by fusen: do not edit by hand

#' Deduplicate Data
#' 
#' Description
#' 
#' @param .score 
#' Dataframe generated by scores_data()
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.
#' @param .min_sim
#' Named vector with minimum similarities
#' @param .col_score 
#' Score column generated by scores_data().\cr
#' Options are:\cr
#' - sms: Simple Mean (mean over all fuzzy columns)\cr
#' - smw: Weighted Mean (mean over all fuzzy columns, weighted by get_weights())\cr
#' - smc: Custom Mean (mean over all fuzzy columns, weighted custom weights)\cr
#' - sss: Simple Mean, squared (mean over all fuzzy columns, scores are squared)\cr
#' - ssw: Weighted Mean, squared (mean over all fuzzy columns, scores are squared before weighted by get_weights())\cr
#' - ssc: Custom Mean, squared (mean over all fuzzy columns, scores are squared before weighted custom weights)
#'
#' @return A dataframe
#' 
#' @importFrom rlang :=
#' 
#' @export
#' @examples
#' tab_source <- table_source[1:100, ]
#' tab_target <- table_target[1:999, ]
#' cols_match <- c("name", "iso3", "city", "address")
#' cols_exact <- "iso3"
#' cols_join  <- c("name", "iso3")
#' tab_match <- match_data(
#'   .source = tab_source,
#'   .target = tab_target,
#'   .cols_match = cols_match,
#'   .cols_exact = cols_exact,
#'   .cols_join = cols_join,
#'   .method = "soundex"
#' )
#' tab_score <- scores_data(
#'   .matches = tab_match, 
#'   .source = tab_source, 
#'   .target = tab_target, 
#'   .cols_match = cols_match,
#'   .cols_exact = cols_exact
#'   )
#' 
#' dedup_data(
#'   .score = tab_score, 
#'   .source = tab_source, 
#'   .target = tab_target,
#'   .cols_match = cols_match,
#'   .col_score = "sms"
#'   )
dedup_data <- function(
  .score, .source, .target, .cols_match, .min_sim = NULL, 
  .col_score = c("sms", "smw", "smc", "sss", "ssw", "ssc")
  ) {
  id_s <- id_t <- name_s <- name_t <- all_s <- all_t <- score <- 
    `_id_` <- len_s <- len_t <- n_s <- n_t <- sms <- smw <- smc <- sss <- 
    ssw <- ssc <- NULL
  check_id(.source, .target)
  
  cols_score_ <- match.arg(.col_score, c("sms", "smw", "smc", "sss", "ssw", "ssc"))
  
  source_  <- prep_tables(.source, .cols_match)
  target_  <- prep_tables(.target, .cols_match)
  score_  <- tibble::as_tibble(.score)
  
  
  col_s_ <- colnames(source_)[!colnames(source_) == "_id_"]
  col_t_ <- colnames(target_)[!colnames(target_) == "_id_"]
  
  
  col_e_ <- col_s_[col_s_ %in% col_t_]
  col_e_ <- col_e_[!col_e_ == "id"]
  col_e_ <- unlist(purrr::map2(paste0(col_e_, "_s"), paste0(col_e_, "_t"), c))
  
  tab_ <- dplyr::filter(score_, !!dplyr::sym(cols_score_) > 0)
  
  if (!is.null(.min_sim)) {
    for (i in seq_len(length(.min_sim))) {
      n_ <- paste0("sim_", names(.min_sim)[i])
      v_ <- .min_sim[i]
      tab_ <- dplyr::filter(tab_, !!dplyr::sym(n_) >= v_)
    }
  }
  tab_ <- tab_ %>%
    dplyr::group_by(id_t) %>%
    dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
    dplyr::mutate(n_t = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(id_s) %>%
    dplyr::slice_max(!!dplyr::sym(cols_score_)) %>%
    dplyr::mutate(n_s = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::left_join(
      y = dplyr::select(source_, -c(`_id_`)),
      by = c("id_s" = "id"),
      suffix = c("_s", "_t")
    ) %>%
    dplyr::left_join(
      y = dplyr::select(target_, -c(`_id_`)),
      by = c("id_t" = "id"),
      suffix = c("_s", "_t")
    ) %>%
    dplyr::mutate(
      len_s = lengths(all_s),
      len_t = lengths(all_t)
    )
  
  cols_use_ <- colnames(tab_)
  cols_use_ <- cols_use_[cols_use_ %in% c(
    "id_s", "id_t", "n_s", "n_t", "all_s", "all_t", "len_s", "len_t",
    paste0("sim_", .cols_match), "sms", "smw", "smc", "sss", "ssw", "ssc"
  )]
  
  tab_[, c(cols_use_, col_e_)]
}
MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.