R/scores_data.R

Defines functions scores_data

Documented in scores_data

# Generated by fusen: do not edit by hand

#' Score Data
#' 
#' Description
#' 
#' @param .matches 
#' Dataframe produced by match_data()
#' @param .source 
#' The Source Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .target 
#' The Target Dataframe.\cr
#' (Must contain a unique column id and the columns you want to match on)
#' @param .cols_match 
#' A character vector of columns to perform fuzzy matching.  
#' @param .cols_exact 
#' Columns that must be matched perfectly.\cr
#' (Data will be partitioned using those columns)
#' @param .w_unique 
#' Weights calculated by get_weights()
#' @param .w_custom 
#' A named numeric vector that matches the columns of .cols_match w/o the columns of .cols_exact
#'
#' @return A dataframe
#' 
#' @export
#' @examples
#' tab_source <- table_source[1:100, ]
#' tab_target <- table_target[1:999, ]
#' cols_match <- c("name", "iso3", "city", "address")
#' cols_exact <- "iso3"
#' cols_join  <- c("name", "iso3")
#' tab_match <- match_data(
#'   .source = tab_source,
#'   .target = tab_target,
#'   .cols_match = cols_match,
#'   .cols_exact = cols_exact,
#'   .cols_join = cols_join,
#'   .method = "soundex",
#' )
#' scores_data(
#'   .matches = tab_match, 
#'   .source = tab_source, 
#'   .target = tab_target, 
#'   .cols_match = cols_match,
#'   .cols_exact = cols_exact
#'   )
scores_data <- function(.matches, .source, .target, .cols_match, .cols_exact = NULL, 
                        .w_unique = NULL, .w_custom = NULL) {
  id_s <- id_t <- . <- n_s <- add_t <- NULL
  
  check_id(.source, .target)
  source_  <- prep_tables(.source, .cols_match)
  target_  <- prep_tables(.target, .cols_match)
  matches_ <- tibble::as_tibble(.matches)

  # cols_ <- colnames(matches_)
  # cols_ <- gsub("sim_", "", cols_[grepl("^sim_", cols_)])
  cols_ <- .cols_match[!.cols_match %in% .cols_exact]
  
  if (!is.null(.w_unique)) {
    help_check_weights(.w_unique, cols_)
    wu_ <- .w_unique
  } else {
    wu_ <- (get_weights(source_, cols_) + get_weights(target_, cols_)) / 2
  }
  
  if (!is.null(.w_custom)) {
    help_check_weights(.w_custom, cols_)
    wc_ <- .w_custom[order(match(names(.w_custom), cols_))]
    wc_ <- wc_ / sum(wc_)
  } else {
    wc_ <- rep(NA_real_, length(cols_))
  }

  mat_ <- as.matrix(matches_[, paste0("sim_", cols_)])
  
  matches_ %>%
    dplyr::mutate(
      sms = rowMeans(mat_, na.rm = TRUE),
      smw = rowSums(mat_ * wu_, na.rm = TRUE),
      smc = rowSums(mat_ * wc_, na.rm = TRUE),
      
      sss = rowMeans(mat_ ^ 2, na.rm = TRUE),
      ssw = rowSums(mat_  ^ 2 * wu_, na.rm = TRUE),
      ssc = rowSums(mat_  ^ 2 * wc_, na.rm = TRUE),
    )
}
MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.