#' Join two tibbles based on an approximate (rather than perfect) matching of two columns.
#' @param x Tibble. First dataset.
#' @param y Tibble. Second dataset.
#' @param match_x Character string. Name of the column to match in the first dataset.
#' @param match_y Character string. Name of the column to match in the second dataset.
#' @param maxdist Integer. Maximum number of character differences between matching strings.
#' @param comvar Character vector. Names of the variables used for preliminary grouping.
#' @return A tibble resulting for the matching of two files.
#' @importFrom dplyr mutate
#' @importFrom dplyr full_join
#' @importFrom dplyr select
#' @importFrom dplyr everything
#' @importFrom dplyr group_by_at
#' @importFrom dplyr vars
#' @importFrom dplyr group_by
#' @importFrom dplyr inner_join
#' @importFrom dplyr rename
#' @importFrom stringr str_replace_all
#' @importFrom stringdist amatch
#' @importFrom purrr map_int
#' @importFrom purrr map2
#' @importFrom tidyr nest
#' @importFrom utils write.csv
#' @importFrom stats na.omit
#' @export
proxim_join <- function(x = NA,
y = NA,
match_x = NA,
match_y = NA,
maxdist = 100,
comvar = "None") {
# Bind variables
data <- NULL
defgrp <- NULL
data_x <- NULL
data_y <- NULL
data_z <- NULL
matching <- function(x, y, maxdist){
x <- x %>%
mutate(match = 1:nrow(x))
z <- y %>%
mutate(match = map_int(match_y, amatch, table = x$match_x, method = "lv", maxDist = maxdist)) %>%
full_join(x, by = "match")
return(z)
}
x <- x %>%
dplyr::select(match_x = match_x, everything()) %>%
mutate(match_x = str_replace_all(match_x, "[[:punct:]]", " ")) %>%
mutate(match_x = str_replace_all(match_x, "[^[:alnum:]]", " "))
y <- y %>%
dplyr::select(match_y = match_y, everything()) %>%
mutate(match_y = str_replace_all(match_y, "[[:punct:]]", " ")) %>%
mutate(match_y = str_replace_all(match_y, "[^[:alnum:]]", " "))
if (comvar[1] != "None") {
x <- x %>%
group_by_at(dplyr::vars(comvar)) %>%
nest() %>%
dplyr::rename(data_x = data) %>%
na.omit()
y <- y %>%
group_by_at(dplyr::vars(comvar)) %>%
nest() %>%
dplyr::rename(data_y = data) %>%
na.omit()
} else {
comvar <- c("defgrp")
x <- x %>%
mutate(defgrp = "All") %>%
group_by(defgrp) %>%
nest() %>%
dplyr::select(defgrp, data_x = data) %>%
na.omit()
y <- y %>%
mutate(defgrp = "All") %>%
group_by(defgrp) %>%
dplyr::select(defgrp, data_y = data) %>%
na.omit()
}
for (i in 1:length(comvar)) {
x[,comvar[[i]]] <- as.character(unlist(x[,comvar[[i]]]))
y[,comvar[[i]]] <- as.character(unlist(y[,comvar[[i]]]))
}
z <- inner_join(x, y, by = comvar) %>%
mutate(data_z = map2(data_x, data_y, matching, maxdist = maxdist)) %>%
dplyr::select(comvar, data_z) %>%
unnest() %>%
dplyr::select(match, match_x, match_y, everything())
names(z) <- c("match", match_x, match_y, names(z)[4:length(z)])
z
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.