# R/deduplicate.R

#' Assign approximate-duplicate ids based on string distance
#'
#' Builds the id columns with \code{create_idcols()}, computes string
#' distances between the resulting \code{.custom_id} values with
#' \code{str_dist()}, keeps the pairs whose distance is at most
#' \code{max_dist}, and hands those pairs (as a \code{from}/\code{to}
#' dictionary) to \code{match_replace()} together with the row ids. The
#' result is used to index \code{.custom_id}, and the looked-up values are
#' stored in a new \code{.new_id} column.
#'
#' @param d Input data, passed unchanged to \code{create_idcols()}.
#' @param id_cols Columns used to build the custom id, passed unchanged to
#'   \code{create_idcols()}.
#' @param uidName Optional new name for the third column of the result.
#'   \code{NULL} (default) leaves the name untouched.
#' @param customIdName Optional new name for the second column of the result.
#'   \code{NULL} (default) leaves the name untouched.
#' @param max_dist Largest distance (inclusive) at which two ids are treated
#'   as the same. With \code{0} only zero-distance pairs are kept.
#' @param asList Currently unused; kept so existing callers keep working.
#' @return The output of \code{create_idcols()} with an extra \code{.new_id}
#'   column, and with columns 2 and 3 renamed when \code{customIdName} /
#'   \code{uidName} are supplied.
#' @export
get_approx_dup_ids <- function(d, id_cols, uidName = NULL, customIdName = NULL,
                               max_dist = 0.1, asList = FALSE) {
  d1 <- create_idcols(d, id_cols)

  # max_dist = 0 needs no special case: the filter below then keeps only
  # zero-distance pairs.
  pair_dist <- str_dist(d1$.custom_id)
  dist_replace <- pair_dist %>%
    filter(distance <= max_dist) %>%
    select(from = item1, to = item2)

  # NOTE(review): match_replace() is assumed to return positions usable as an
  # index into .custom_id -- confirm against its definition.
  idx <- match_replace(d1$.row_id, dic = dist_replace, force = FALSE)
  d1$.new_id <- d1$.custom_id[idx]

  # Renames are positional.
  # NOTE(review): presumably column 2 is .custom_id and column 3 is .new_id
  # -- verify against the column layout returned by create_idcols().
  if (!is.null(customIdName)) {
    # Use customIdName here; assigning uidName gave column 2 the wrong name
    # and errored when uidName was NULL.
    names(d1)[2] <- customIdName
  }
  if (!is.null(uidName)) {
    names(d1)[3] <- uidName
  }
  d1
}


#' #' new_or_dup
#' #' Find if new or duplicate records
#' #' @name new_or_dup
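#' #' @param dnew Data frame with the incoming records.
#' #' @param dold Data frame with the existing records; must have the same
#' #'   column names as \code{dnew}.
#' #' @param id_cols Columns used to build the custom id (passed to
#' #'   \code{create_idcols}).
#' #' @param id Passed to \code{add_row_id}.
#' #' @param max_dist Maximum Jaro-Winkler ("jw") distance for a fuzzy match;
#' #'   \code{0} switches to an exact join on \code{custom_id}.
#' #' @export
#' #' @return A data frame of matches with an \code{is_new} column set to
#' #'   "new" when no match was found and "duplicate" otherwise.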
#' new_or_dup <- function(dnew, dold, id_cols,id=NULL, max_dist = 0.1){
#'   if(!identical(sort(names(dnew)),sort(names(dold))))
#'     stop("Same structure needed for new and old data frames")
#'   dnew <- add_row_id(dnew, id = id)
#'   dnew <- create_idcols(dnew, id_cols)
#'   dold <- add_row_id(dold, id = id)
#'   dold <- create_idcols(dold, id_cols)
#'   if(max_dist != 0){
#'     dups <- stringdist_left_join(dnew,dold, by = "custom_id",
#'                                  method = "jw",
#'                                  max_dist = max_dist) %>%
#'       distinct() %>%
#'       select(custom_id = custom_id.x, .row_id = .row_id.x,
#'              match_id = .row_id.y, match = custom_id.y)
#'   }else{
#'     dups <- left_join(dnew,dold, by = "custom_id") %>%
#'       select(.row_id = .row_id.x, custom_id = custom_id, match_id = .row_id.y)
#'   }
#'   newdup <- dups %>% mutate(is_new = ifelse(is.na(match_id),"new","duplicate"))
#'   newdup
#' }

#' #' add_approx_unique_id
#' #' Add an approximate unique id to rows for a given column
#' #' @name add_approx_unique_id
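#' #' @param d Data frame to annotate.
#' #' @param col Column(s) used to build the custom id (passed to
#' #'   \code{create_idcols}).
#' #' @param id Passed to \code{create_idcols}.
#' #' @param max_dist Maximum Jaro-Winkler ("jw") distance for a fuzzy match;
#' #'   \code{0} switches to an exact join on \code{custom_id}.
#' #' @param uidName Name of the id column to add; defaults to ".unique_id".
#' #' @param uidPrefix Optional prefix pasted in front of every unique id.
#' #' @export
#' #' @return The data with the unique id column moved to the first position
#' #'   and the helper \code{.row_id} column removed.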
#' add_approx_unique_id <- function(d, col,id=NULL, max_dist = 0.1, uidName = NULL, uidPrefix = NULL){
#'   uidName <- uidName %||% ".unique_id"
#'   d1 <- create_idcols(d, col, id = id, keepCols = TRUE)
#'   dic <- distinct(d1,custom_id,.keep_all = FALSE)
#'   if(max_dist != 0){
#'     dups <- stringdist_left_join(dic,d1, by = "custom_id",
#'                                  method = "jw",max_dist = max_dist) %>%
#'       select(custom_id = custom_id.x, .row_id)
#'   }else{
#'     dups <- left_join(dic,d1, by = "custom_id")
#'   }
#'   x <- dups %>%
#'     arrange(.row_id,custom_id) %>%
#'     group_by(.row_id) %>%
#'     summarise(members = paste0(custom_id,collapse = "-")) %>%
#'     add_unique_id(col = "members",uidName = NULL) %>%
#'     select(.row_id,.unique_id)
#'   d2 <- left_join(d1,x, by = ".row_id")
#'   if(!is.null(uidPrefix))
#'     d2 <- d2 %>% mutate(.unique_id=paste0(uidPrefix,.unique_id))
#'
#'   d2 <- d2 %>%
#'     rename_(.dots = setNames(".unique_id", uidName)) %>%
#'     move_first(uidName)
#'   d2$.row_id <- NULL
#'   d2
#' }


#' #' get_approx_dups
#' #' Find possible duplicates
#' #' @name approx_dup
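#' #' @param d Data frame to search for duplicates.
#' #' @param id_cols Columns used to build the custom id (passed to
#' #'   \code{get_approx_dup_ids}).
#' #' @param id Passed to \code{get_approx_dup_ids}.
#' #' @param max_dist Maximum distance for a fuzzy match (passed to
#' #'   \code{get_approx_dup_ids}).
#' #' @export
#' #' @return The duplicate ids from \code{get_approx_dup_ids} joined with the
#' #'   rows of \code{d} by \code{.row_id}.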
#' get_approx_dups <- function(d, id_cols,id=NULL, max_dist = 0.1){
#'   dups <- get_approx_dup_ids(d, id_cols = id_cols, id = id,
#'                                  max_dist = max_dist, asList = FALSE)
#'   d$.row_id <- 1:nrow(d)
#'   left_join(dups,d, by = c(".row_id"=".row_id"))
#' }


#' #' get_approx_dup_ids
#' #' Find possible duplicates
#' #' @name approx_dup
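#' #' @param d Data frame to search for duplicates.
#' #' @param id_cols Columns used to build the custom id (passed to
#' #'   \code{create_idcols}).
#' #' @param id Passed to \code{create_idcols}.
#' #' @param max_dist Maximum Jaro-Winkler ("jw") distance for a fuzzy match;
#' #'   \code{0} switches to an exact join on \code{custom_id}.
#' #' @param asList If \code{TRUE}, return a list of row-id vectors named by
#' #'   \code{custom_id} instead of a data frame.
#' #' @export
#' #' @return For custom ids matched by more than one row: a named list of
#' #'   row ids when \code{asList} is \code{TRUE}, otherwise an unnested data
#' #'   frame of \code{custom_id} and \code{.row_id}.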
#' get_approx_dup_ids <- function(d, id_cols, id = NULL,
#'                                max_dist = 0.1, asList = FALSE){
#'   d1 <- create_idcols(d, id_cols, id)
#'   dic <- distinct(d1,custom_id,.keep_all = FALSE)
#'   if(max_dist != 0){
#'     dups <- stringdist_left_join(dic,d1, by = "custom_id",
#'                                  method = "jw",max_dist = max_dist) %>%
#'       select(custom_id = custom_id.x, .row_id)
#'   }else{
#'     dups <- left_join(dic,d1,by = "custom_id")
#'   }
#'   dupsDf <- dups %>%
#'     group_by(custom_id) %>%
#'     filter(n()>1) %>%
#'     select(custom_id,.row_id) %>%
#'     slice_rows("custom_id") %>%
#'     by_slice(function(x) x$.row_id, .collate="list",.to = ".row_id")
#'   if(asList){
#'     dupsList <- dupsDf %>% .$.row_id
#'     names(dupsList) <- dupsDf$custom_id
#'     dupsList
#'     return(dupsList)
#'   }
#'   unnest(dupsDf)
#' }
# jpmarindiaz/deduplicate documentation built on May 19, 2019, 10:46 p.m.