R/check_dup.R

Defines functions check_dup

# Generated by fusen: do not edit by hand

#' Check Duplicates
#'
#' Counts duplicated values in the matching columns of a source and a target
#' table. Duplicates are counted for every column on its own and cumulatively
#' for the first one, two, three, ... columns taken together.
#'
#' @param .source
#' The Source Dataframe.
#' Must contain a unique column id and the columns you want to match on
#' @param .target
#' The Target Dataframe.
#' Must contain a unique column id and the columns you want to match on
#' @param .check
#' Which columns of the target to check: `"source"` (the default) keeps only
#' the columns that are also in the source, `"all"` keeps all columns
#' @return A list with two named integer vectors:
#' \describe{
#'   \item{ind}{Number of duplicated values in each column on its own.}
#'   \item{cum}{Number of duplicated rows when the first 1, 2, ... columns
#'   are combined.}
#' }
#' Names are the column names prefixed with `s_` (source) or `t_` (target).
#' The column named `id` is never checked.
#'
#' @noRd
#' @examples
#' table_source <- data.frame(id = 1:3, name = c("a", "a", "b"), city = c("x", "x", "y"))
#' table_target <- data.frame(id = 1:2, name = c("a", "b"), city = c("x", "y"))
#' check_dup(table_source, table_target)
check_dup <- function(.source, .target, .check = c("source", "all")) {
  check_ <- match.arg(.check, c("source", "all"))

  s_ <- tibble::as_tibble(.source)
  t_ <- tibble::as_tibble(.target)

  # Values are the real column names, names carry the "s_"/"t_" prefix that
  # labels the output. The id column is a key, not a matching column.
  cols_s_ <- stats::setNames(colnames(s_), paste0("s_", colnames(s_)))
  cols_t_ <- stats::setNames(colnames(t_), paste0("t_", colnames(t_)))
  cols_s_ <- cols_s_[cols_s_ != "id"]
  cols_t_ <- cols_t_[cols_t_ != "id"]

  # Put the target columns in source order; columns missing from the source
  # have no match (NA) and are sorted to the end.
  cols_t_ <- cols_t_[order(match(cols_t_, cols_s_))]

  if (check_ == "source") {
    cols_t_ <- cols_t_[cols_t_ %in% cols_s_]
  }

  # Duplicates within each single column.
  count_ind_ <- function(.tab, .cols) {
    purrr::map_int(.cols, function(.col) sum(duplicated(.tab[[.col]])))
  }

  # Duplicated rows over the first n columns, for n = 1, 2, ..., all.
  # duplicated() on the data frame compares rows value by value. Pasting the
  # cells together with "-" would report false duplicates whenever a value
  # contains the separator (("x-y", "z") vs. ("x", "y-z")) or when distinct
  # numbers are formatted to the same string during matrix coercion.
  count_cum_ <- function(.tab, .cols) {
    names_ <- unname(.cols)
    purrr::map_int(
      .x = stats::setNames(seq_along(.cols), names(.cols)),
      .f = function(.n) sum(duplicated(.tab[, names_[seq_len(.n)]]))
    )
  }

  ind_ <- c(count_ind_(s_, cols_s_), count_ind_(t_, cols_t_))
  cum_ <- c(count_cum_(s_, cols_s_), count_cum_(t_, cols_t_))

  list(ind = ind_, cum = cum_)
}
MatthiasUckert/Rmatch documentation built on Jan. 3, 2022, 11:09 p.m.