R/proxim_join.R

Defines functions proxim_join

Documented in proxim_join

#' Join two tibbles based on an approximate (rather than perfect) matching of two columns.
#' @param x       Tibble. First dataset.
#' @param y       Tibble. Second dataset.
#' @param match_x Character string. Name of the column to match in the first dataset.
#' @param match_y Character string. Name of the column to match in the second dataset.
#' @param maxdist Integer. Maximum number of character differences between matching strings.
#' @param comvar  Character vector. Names of the variables used for preliminary grouping.
#' @return A tibble resulting for the matching of two files.
#' @importFrom dplyr mutate
#' @importFrom dplyr full_join
#' @importFrom dplyr select
#' @importFrom dplyr everything
#' @importFrom dplyr group_by_at
#' @importFrom dplyr vars
#' @importFrom dplyr group_by
#' @importFrom dplyr inner_join
#' @importFrom dplyr rename
#' @importFrom stringr str_replace_all
#' @importFrom stringdist amatch
#' @importFrom purrr map_int
#' @importFrom purrr map2
#' @importFrom tidyr nest
#' @importFrom utils write.csv
#' @importFrom stats na.omit
#' @export


proxim_join <- function(x = NA,
                        y = NA,
                        match_x = NA,
                        match_y = NA,
                        maxdist = 100,
                        comvar = "None") {
  
  # Bind variables
  data <- NULL
  defgrp <- NULL
  data_x <- NULL
  data_y <- NULL
  data_z <- NULL
  
  
  matching <- function(x, y, maxdist){
    
    x <- x %>%
      mutate(match = 1:nrow(x))
    
    z <- y %>%
      mutate(match = map_int(match_y, amatch, table = x$match_x, method = "lv", maxDist = maxdist)) %>%
      full_join(x, by = "match")
    
    return(z)
  }
  
  x <- x %>%
    dplyr::select(match_x = match_x, everything()) %>%
    mutate(match_x = str_replace_all(match_x, "[[:punct:]]", " ")) %>%
    mutate(match_x = str_replace_all(match_x, "[^[:alnum:]]", " "))
  y <- y %>%
    dplyr::select(match_y = match_y, everything()) %>%
    mutate(match_y = str_replace_all(match_y, "[[:punct:]]", " ")) %>%
    mutate(match_y = str_replace_all(match_y, "[^[:alnum:]]", " "))
  
  if (comvar[1] != "None") {
    x <- x %>%
      group_by_at(dplyr::vars(comvar)) %>%
      nest() %>%
      dplyr::rename(data_x = data) %>%
      na.omit()
    y <- y %>%
      group_by_at(dplyr::vars(comvar)) %>%
      nest() %>%
      dplyr::rename(data_y = data) %>%
      na.omit()
  } else {
    comvar <- c("defgrp")
    x <- x %>%
      mutate(defgrp = "All") %>%
      group_by(defgrp) %>%
      nest() %>%
      dplyr::select(defgrp, data_x = data) %>%
      na.omit()
    y <- y %>%
      mutate(defgrp = "All") %>%
      group_by(defgrp) %>%
      dplyr::select(defgrp, data_y = data) %>%
      na.omit()
  }
  
  for (i in 1:length(comvar)) {
    x[,comvar[[i]]] <- as.character(unlist(x[,comvar[[i]]]))
    y[,comvar[[i]]] <- as.character(unlist(y[,comvar[[i]]]))
  }
  
  z <- inner_join(x, y, by = comvar) %>%
    mutate(data_z = map2(data_x, data_y, matching, maxdist = maxdist)) %>%
    dplyr::select(comvar, data_z) %>%
    unnest() %>%
    dplyr::select(match, match_x, match_y, everything())
  
  names(z) <- c("match", match_x, match_y, names(z)[4:length(z)])

  z
}
NicolasJBM/datexp documentation built on May 14, 2019, 10:36 a.m.